# Reading input from S3 with Apache Spark

- Loading the S3 libraries

In [None]:
# Set PYSPARK_SUBMIT_ARGS so that PySpark is launched with the hadoop-aws 2.7.3 package.
# NOTE(review): presumably this has to run before the SparkSession is created, and the
# hadoop-aws version should match the Hadoop build bundled with Spark — confirm.
%set_env PYSPARK_SUBMIT_ARGS=--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell

- Importing libraries and setting up SparkSession

In [None]:
import pyspark
import os
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext

# Create (or reuse) a SparkSession whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

- Setting AWS credentials and configuration

In [None]:
# Pull the AWS credentials from the process environment instead of hardcoding
# them; a variable that is not set yields None.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEYS = os.environ.get('AWS_SECRET_ACCESS_KEY')

In [None]:
# Fail fast with a readable message when a credential is missing. A variable
# that was not exported is None here, and Hadoop's Configuration.set() rejects
# null values with a JVM-side error that does not say which variable to fix.
if AWS_ACCESS_KEY_ID is None or AWS_SECRET_ACCESS_KEYS is None:
    raise ValueError(
        "AWS credentials are not set: export AWS_ACCESS_KEY_ID and "
        "AWS_SECRET_ACCESS_KEY before starting the notebook kernel."
    )

# Hand the credentials to the Hadoop configuration of the running SparkContext
# under the fs.s3a.* keys.
hadoopConf = spark.sparkContext._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.access.key", AWS_ACCESS_KEY_ID)
hadoopConf.set("fs.s3a.secret.key", AWS_SECRET_ACCESS_KEYS)

- Setting the bucket path and creating a DataFrame from a CSV file

In [None]:
# Location of the CSV input, addressed with the s3a:// scheme.
# NOTE(review): this looks like a placeholder — replace it with the real bucket/key.
bucket_path = "s3a://path/to/csv"

In [None]:
# Load the CSV data at bucket_path into a DataFrame using the reader's defaults
# (no options are set here).
csvDf = spark.read.format("csv").load(bucket_path)

- Playing around with the data

In [None]:
# Display the DataFrame's schema as the cell output.
csvDf.schema

In [None]:
# Print only the first three rows as a quick sanity check.
csvDf.show(n=3)

In [None]:
# Register the DataFrame as the temporary view "temptable" so that SQL queries
# can refer to it by name.
csvDf.createOrReplaceTempView("temptable")

In [None]:
# Query the "temptable" view with Spark SQL; the result is a new DataFrame.
sql_df = spark.sql("SELECT * FROM temptable")

In [None]:
# Print the query result using show()'s default row limit.
sql_df.show()

In [None]:
# Read the SQL query text from a file next to the notebook.
# The `with` block guarantees the handle is closed even if read() raises.
filename = "all_rows.sql"
with open(filename, 'r') as fd:
    sqlFile = fd.read()


In [None]:
# Run the query text loaded from the file through Spark SQL.
# NOTE(review): presumably the file must contain a single SQL statement — confirm.
sql_df_from_file = spark.sql(sqlFile)

In [None]:
# Print the result of the file-based query using show()'s default row limit.
sql_df_from_file.show()