# Data Lake for Song Data Analysis

In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import udf, col, monotonically_increasing_id
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
# Load the AWS credentials and the data locations from the local config file.
config = configparser.ConfigParser()
config.read('dl.cfg')
aws_config = config['AWS']

# Copy the credentials into the process environment.
for credential in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[credential] = aws_config[credential]

# Base locations used when reading the raw data and writing the tables.
input_data = aws_config['INPUT_DATA']
output_data = aws_config['OUTPUT_DATA']

In [3]:
# Create (or reuse) the Spark session, requesting the hadoop-aws package.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

## Process song data

In [4]:
# glob pattern for the song data files, relative to input_data
song_data = 'song_data/A/A/A/*.json'

# load the matching song data JSON files into a DataFrame
df = spark.read.json(input_data + song_data)

In [5]:
# print the column names and types of the song data DataFrame
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [6]:
# songs table: the song-level columns of the song data
songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration')

In [7]:
# save the songs table as parquet, partitioned by year and artist_id,
# replacing any previous output at that location
(
    songs_table.write
    .partitionBy('year', 'artist_id')
    .mode('overwrite')
    .parquet(f"{output_data}songs")
)

In [8]:
# extract columns to create artists table
# The rows come from song records, so an artist with more than one song would
# be repeated; keep a single row per artist_id so it can serve as the key.
artists_table = df.select(
    'artist_id',
    'artist_name',
    'artist_location',
    'artist_latitude',
    'artist_longitude',
).dropDuplicates(['artist_id'])

In [9]:
# save the artists table as parquet, replacing any previous output
artists_writer = artists_table.write.mode("overwrite")
artists_writer.parquet(f"{output_data}artists")

## Process log data

In [10]:
# glob pattern for the log data files, relative to input_data
log_data = 'log_data/2018/11/*.json'

In [11]:
# load the matching log data JSON files into a DataFrame
# (this rebinds df, which previously held the song data)
df = spark.read.json(input_data + log_data)

In [12]:
# print the column names and types of the log data DataFrame
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [13]:
# keep only the song-play events, i.e. rows whose page is "NextSong"
df = df.filter(df.page == 'NextSong')

In [14]:
# register the filtered log DataFrame as the temporary view 'log_schema'
# NOTE(review): no query against this view appears in the cells shown here — confirm it is still needed
df.createOrReplaceTempView('log_schema')

In [15]:
# extract columns for users table
# The log holds one row per event, so the same user shows up on many rows;
# distinct() drops the exact repeats so each user/level combination is stored once.
users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').distinct()

In [16]:
# save the users table as parquet, replacing any previous output
users_writer = users_table.write.mode('overwrite')
users_writer.parquet(f"{output_data}users")

In [17]:
# create timestamp column from original timestamp column
# ts is divided by 1000 before conversion (presumably epoch milliseconds —
# confirm against the log format). Declaring TimestampType makes the new
# column a real Spark timestamp instead of a string of epoch seconds, and
# null ts values are passed through rather than raising inside the UDF.
get_timestamp = udf(
    lambda ts: None if ts is None else datetime.fromtimestamp(ts / 1000.0),
    TimestampType(),
)
df = df.withColumn('timestamp', get_timestamp(df.ts))

In [18]:
# create datetime column from original timestamp column
def _format_datetime(ts):
    """Return str() of the datetime built from ts divided by 1000."""
    return str(datetime.fromtimestamp(int(ts) / 1000.0))


# no return type is given, so the UDF yields a string column
get_datetime = udf(_format_datetime)
df = df.withColumn('datetime', get_datetime(df['ts']))

In [19]:
# build the time table: one row per distinct start_time plus its calendar parts
datetime_col = col('datetime')
time_table = df.select(
    datetime_col.alias('start_time'),
    hour(datetime_col).alias('hour'),
    dayofmonth(datetime_col).alias('day'),
    weekofyear(datetime_col).alias('week'),
    month(datetime_col).alias('month'),
    year(datetime_col).alias('year'),
).dropDuplicates(['start_time'])

In [None]:
 # write time table to parquet files partitioned by year and month
(
    time_table.write
    .partitionBy('year', 'month')
    .mode('overwrite')  # replace any previous output at this location
    .parquet(f"{output_data}time")
)

In [None]:
 # read in song data to use for songplays table
# Use the 'song_data' prefix (underscore), the same one the song-processing
# cell reads from, so both reads resolve to the same files.
song_data = input_data + "song_data/A/A/A/*.json"
song_df = spark.read.json(song_data)

# Match on artist name as well as title: a title alone can belong to songs by
# different artists, which would attach the wrong song_id/artist_id to a play.
songplays_df = df.join(
    song_df,
    (song_df.title == df.song) & (song_df.artist_name == df.artist),
)

In [None]:
# list the column names of the joined DataFrame (inspection only)
songplays_df.columns

In [None]:
# extract columns from joined song and log datasets to create songplays table
songplays_table = songplays_df.select(
    # start_time comes from the derived 'datetime' column, the same source
    # time_table.start_time is built from, so the two tables can be joined
    # on it (the raw 'ts' value would never match).
    col('datetime').alias('start_time'),
    col('userId').alias('user_id'),
    col('level').alias('level'),
    col('song_id').alias('song_id'),
    col('artist_id').alias('artist_id'),
    col('sessionId').alias('session_id'),
    # location of the play is the log event's 'location' column, not the
    # artist's location from the song data
    col('location').alias('location'),
    col('userAgent').alias('user_agent'),
    # year and month are derived so the table can be partitioned by them
    year('datetime').alias('year'),
    month('datetime').alias('month'),
)

In [None]:
# add a songplay_id column generated by monotonically_increasing_id()
# NOTE(review): per the Spark docs these ids are unique and increasing but not consecutive — confirm that is acceptable
songplays_table = songplays_table.withColumn('songplay_id', monotonically_increasing_id())

In [None]:
# save the songplays table as parquet, partitioned by year and month,
# replacing any previous output at that location
(
    songplays_table.write
    .partitionBy('year', 'month')
    .mode('overwrite')
    .parquet(f"{output_data}songplays")
)