# Data Lake with Spark Project

## Setup

In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

# Load the AWS credentials from the local config file.
# The file is opened explicitly and parsed with read_file() so that a missing
# dl.cfg raises FileNotFoundError right here; config.read() silently skips
# files it cannot open, and the problem would only show up later as a
# NoSectionError on the first config.get() call.
config = configparser.ConfigParser()
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

# Export the access key ID and secret key as environment variables
# (presumably so the S3 client used by Spark can pick them up — TODO confirm).
os.environ['AWS_ACCESS_KEY_ID'] = config.get('AWS', 'AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = config.get('AWS', 'AWS_SECRET_ACCESS_KEY')

In [2]:
# Create the Spark session (or reuse one that already exists), requesting the
# hadoop-aws package through spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [3]:
# Unpack the sample data archives into ./data for local debugging.
# Approach adapted from: https://thispointer.com/python-how-to-unzip-a-file-extract-single-multiple-or-all-files-from-a-zip-archive/

from zipfile import ZipFile

# Song data first, then log data; both are extracted into the same folder.
for archive_path in ('./data/song-data.zip', './data/log-data.zip'):
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall('./data')

In [118]:
# Input and output locations used by the cells below.

# S3 location of the source data set
input_data = "s3a://udacity-dend/"

# Relative glob patterns for the song and log JSON files under ./data
song_input_data = "data/song_data/A/A/A/*.json"
log_input_data = "./data/log_data/*.json"

# S3 destination for the generated tables
output_data = "s3a://udacity-dataeng-cseal/data-lake"
# Alternative output target (S3 access point ARN), currently disabled:
# output_data = 'arn:aws:s3:us-east-2:897336544263:accesspoint/udacity-data-eng'

## Process Song File

- Scratch work for the `process_song_file` function in `etl.py`

In [6]:
# First pass: read the song JSON files matched by song_input_data without
# passing a schema, so that Spark infers one from the data.
song_data = spark.read.json(song_input_data)

In [7]:
# Print the schema of the DataFrame loaded above.
song_data.printSchema()
# count() is the number of rows in the DataFrame; labelling it "files"
# assumes one record per JSON file — TODO confirm.
print("Number of files: {}".format(song_data.count()))
# Show the first 5 rows without truncating long string values.
song_data.show(5, truncate=False)

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)

Number of files: 11
+------------------+---------------+---------------------------+----------------+---------------------+---------+---------+------------------+-------------------------------------------+----+
|artist_id         |artist_latitude|artist_location            |artist_longitude|artist_name          |duration |num_songs|song_id           |title                                      |year|
+------------------+---------------+---------------------------+----------------+---------------------+---------+---------+------------------+---------------------------------

In [17]:
# The inferred schema is mostly right, but year and num_songs should fit in a
# 32-bit integer, so reload the song data with an explicitly defined schema.
from pyspark.sql.types import StructType as ST, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int

# One (column name, Spark type, nullable) entry per field of a song record.
# NOTE(review): song_id is declared non-nullable here, yet the schema printed
# after loading reports nullable = true for it — confirm whether the
# constraint is expected to be enforced.
song_field_specs = [
    ('artist_id', Str(), True),
    ('artist_latitude', Dbl(), True),
    ('artist_location', Str(), True),
    ('artist_longitude', Dbl(), True),
    ('artist_name', Str(), True),
    ('duration', Dbl(), True),
    ('num_songs', Int(), True),
    ('song_id', Str(), False),
    ('title', Str(), True),
    ('year', Int(), True),
]
song_schema = ST([
    Fld(field_name, field_type, nullable=is_nullable)
    for field_name, field_type, is_nullable in song_field_specs
])

# Re-read the same files, this time applying the schema, and print the result.
song_data = spark.read.json(song_input_data, schema=song_schema)
song_data.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: integer (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



In [97]:
# Build the songs table: keep only the song-level columns and drop rows that
# are exact duplicates across all of them.
song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']
songs_table = song_data.select(*song_columns).dropDuplicates()

# Preview five rows as a list of Row objects.
songs_preview = songs_table.limit(5).collect()
print(songs_preview)

[Row(song_id='SONHOTT12A8C13493C', title='Something Girls', artist_id='AR7G5I41187FB4CE6C', year=1982, duration=233.40363), Row(song_id='SOHKNRJ12A6701D1F8', title='Drop of Rain', artist_id='AR10USD1187B99F3F1', year=0, duration=189.57016), Row(song_id='SOCIWDW12A8C13D406', title='Soul Deep', artist_id='ARMJAGH1187FB546F3', year=1969, duration=148.03546), Row(song_id='SOUDSGM12AC9618304', title='Insatiable (Instrumental Version)', artist_id='ARNTLGG11E2835DDB9', year=0, duration=266.39628), Row(song_id='SOQHXMF12AB0182363', title='Young Boy Blues', artist_id='ARGSJW91187B9B1D6B', year=0, duration=218.77506)]


In [119]:
# Write the songs table to parquet, partitioned by year and artist_id.
#
# Every statement in this cell is currently commented out.
#
# Earlier attempt, which looks like pandas-style calls (unique / query /
# to_parquet) rather than the Spark DataFrame API:
# print("Years: {}".format(songs_table['year'].unique()[:10]))
# print("Artist IDs: {}".format(songs_table['artist_id'].unique()[:10]))
# partitions = songs_table.groupby(['year','artist_id']).apply(lambda x: x)
# for i, partition in partitions.iterrows():
#     partition_df = songs_table.query("year == {} and artist_id == '{}'".format(partition.year, partition.artist_id))
#     partition_df.to_parquet("{}/songs_table/{}/{}/songs_dim.parquet".format(output_data, year, artist_id))
#
# Spark version. The output saved below this cell (the joined output path,
# then an AmazonS3Exception "400 Bad Request" raised from the parquet call)
# appears to come from running these two lines.
# NOTE(review): the traceback shows the 400 coming from headBucket while the
# s3a filesystem is being initialised; the cause is not established here —
# check credentials / bucket region / endpoint before re-enabling the write.
# print(os.path.join(output_data, 'songs'))
# songs_table.write.partitionBy("year", "artist_id").parquet(os.path.join(output_data, 'songs'))

s3a://udacity-dataeng-cseal/data-lake/songs


Py4JJavaError: An error occurred while calling o892.parquet.
: com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 400, AWS Service: Amazon S3, AWS Request ID: 6543E6783A1446C5, AWS Error Code: null, AWS Error Message: Bad Request, S3 Extended Request ID: 0/7QP+Z77gxLtJBBnAEf4kHsALXD6uBGndRXcJZP1QlpI8pdgN+K3kdcj3GdPn94PIz9BRPmAQ4=
	at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:798)
	at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:421)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:232)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3528)
	at com.amazonaws.services.s3.AmazonS3Client.headBucket(AmazonS3Client.java:1031)
	at com.amazonaws.services.s3.AmazonS3Client.doesBucketExist(AmazonS3Client.java:994)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:297)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2669)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWritingFileFormat(DataSource.scala:424)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWriting(DataSource.scala:524)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:290)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:566)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


## Make artists table

In [116]:
# Build the artists table: select the artist columns from the song data under
# shorter names and drop rows that are exact duplicates.
# Fix: the latitude alias was misspelled as 'latititude', which produced a
# wrongly named column; it is now 'latitude'. Re-run the cell to refresh the
# saved output.
artists_table = song_data.selectExpr(
    [
        'artist_id',
        'artist_name AS name',
        'artist_location AS location',
        'artist_latitude AS latitude',
        'artist_longitude AS longitude',
    ]
).dropDuplicates()

# Preview the first rows via pandas.
print(artists_table.toPandas().head())
# Writing to S3 is disabled for now:
# artists_table.write.parquet(os.path.join(output_data, 'artists'))

            artist_id              name         location  latititude  \
0  ARXR32B1187FB57099               Gob                          NaN   
1  ARGSJW91187B9B1D6B      JennyAnyKind   North Carolina    35.21962   
2  ARKRRTF1187B9984DA  Sonora Santanera                          NaN   
3  ARD7TVE1187B99BFB1            Casual  California - LA         NaN   
4  AR8ZCNI1187B9A069B  Planet P Project                          NaN   

   longitude  
0        NaN  
1  -80.01955  
2        NaN  
3        NaN  
4        NaN  
