# Data Lake for Song Data Analysis

In [2]:
import configparser
from datetime import datetime
import argparse
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import udf, col, monotonically_increasing_id
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [8]:
# Command-line parser for the job; its options are registered in the next cell.
parser = argparse.ArgumentParser(description='Data lake set up')

In [9]:
# Register the job's options:
#   --key / --secret : credentials of the IAM user (both required)
#   -from            : bucket path to read from (optional, has a default)
#   -to              : bucket path the final tables are written to (required)
# NOTE(review): values passed on a command line are typically visible to other
# processes on the host — confirm that is acceptable for the secret key.
parser.add_argument(
    '--key',
    action='store',
    dest='AWS_ACCESS_KEY_ID',
    required=True,
    help='AWS Access Key ID of the IAM user',
)
parser.add_argument(
    '--secret',
    action='store',
    dest='AWS_SECRET_ACCESS_KEY',
    required=True,
    help='AWS Secret Access Key of the IAM user',
)
parser.add_argument(
    '-from',
    action='store',
    dest='INPUT_BUCKET',
    required=False,
    default='s3a://udacity-dend/',
    help='Path of the bucket to read files from.',
)
parser.add_argument(
    '-to',
    action='store',
    dest='OUTPUT_BUCKET',
    required=True,
    help='Path of the bucket to write final tables.',
)

_StoreAction(option_strings=['-to'], dest='OUTPUT_BUCKET', nargs=None, const=None, default=None, type=None, choices=None, help='Path of the bucket to write final tables.', metavar=None)

In [5]:
# Parse the options registered on `parser`.
# NOTE: when run from the notebook kernel this exits with status 2, because the
# kernel's own command line does not contain the required --key/--secret/-to
# options (see the cell output below). The later cells read their settings
# from dl.cfg instead.
arguments = parser.parse_args()

usage: ipykernel_launcher.py [-h] --key AWS_ACCESS_KEY_ID --secret
                             AWS_SECRET_ACCESS_KEY [-from INPUT_BUCKET] -to
                             OUTPUT_BUCKET
ipykernel_launcher.py: error: the following arguments are required: --key, --secret, -to


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
# Load the [AWS] section of the local `dl.cfg` file.
config = configparser.ConfigParser()
config.read('dl.cfg')

# Export both credentials through the process environment
# (presumably so the S3 connector can pick them up — TODO confirm).
for credential in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[credential] = config['AWS'][credential]

# Prefixes that the read and write cells prepend to their relative paths.
input_data = config['AWS']['INPUT_DATA']
output_data = config['AWS']['OUTPUT_DATA']

In [3]:
# Create (or reuse) the Spark session, requesting the hadoop-aws package
# (presumably needed for s3a:// paths — TODO confirm).
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

## Process song data

In [7]:
# relative path of the single song metadata file used in this notebook
song_data = 'song_data/A/B/C/TRABCEI128F424C983.json'

# read the song file from under the input location
df = spark.read.json(input_data + song_data)

In [None]:
# print the schema of the song DataFrame read above
df.printSchema()

In [None]:
# songs table: the song-level columns only, one row kept per song_id
songs_table = (
    df.select('song_id', 'title', 'artist_id', 'year', 'duration')
    .dropDuplicates(['song_id'])
)

In [None]:
# store the songs table as parquet under <output_data>songs, partitioned by
# year and then artist_id; any existing output is overwritten
(
    songs_table.write
    .partitionBy('year', 'artist_id')
    .mode('overwrite')
    .parquet(f"{output_data}songs")
)

In [None]:
# artists table: the artist-level columns only, one row kept per artist_id
artists_table = (
    df.select(
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_latitude',
        'artist_longitude',
    )
    .dropDuplicates(['artist_id'])
)

In [None]:
# store the artists table as unpartitioned parquet under <output_data>artists;
# any existing output is overwritten
(
    artists_table.write
    .mode("overwrite")
    .parquet(f"{output_data}artists")
)

## Process log data

In [6]:
# relative path of the log data file; the notebook reads this single
# 2018-11-13 events file from under the input location
log_data = "log_data/2018/11/2018-11-13-events.json"

In [None]:
# load the event log; this rebinds `df`, which held the song data until now
df = spark.read.json(input_data + log_data)

In [None]:
# print the schema of the log DataFrame read above
df.printSchema()

In [None]:
# keep only the log rows whose page is "NextSong"
df = df.filter('page = "NextSong"')

In [None]:
# Register the log DataFrame as the temp view `log_schema`.
# NOTE(review): no query against this view appears in the visible cells —
# confirm it is still needed.
df.createOrReplaceTempView('log_schema')

In [None]:
# users table: the user-level columns only, one row kept per userId
users_table = (
    df.select('userId', 'firstName', 'lastName', 'gender', 'level')
    .dropDuplicates(['userId'])
)

In [None]:
# store the users table as unpartitioned parquet under <output_data>users;
# any existing output is overwritten
(
    users_table.write
    .mode('overwrite')
    .parquet(f"{output_data}users")
)

In [None]:
# add a `timestamp` column: `ts` divided by 1000, truncated to an integer and
# kept as text (presumably `ts` is epoch milliseconds — TODO confirm)
@udf
def get_timestamp(ts):
    return str(int(ts / 1000.0))


df = df.withColumn('timestamp', get_timestamp(df['ts']))

In [None]:
# create datetime column from original timestamp column
# `ts` is divided by 1000 before conversion (presumably epoch milliseconds —
# TODO confirm against the log schema). The result is stored as text in the
# form produced by str(datetime).
# NOTE(review): datetime.fromtimestamp() without a tz argument yields the local
# time of the machine running the UDF — confirm that is intended.
# The UDF and the column are defined once; the cell previously repeated both
# statements verbatim.
get_datetime = udf(lambda ts: str(datetime.fromtimestamp(int(ts) / 1000.0)))
df = df.withColumn("datetime", get_datetime(df.ts))

In [None]:
# time table: every distinct `datetime` value as start_time, together with the
# hour, day of month, week of year, month and year derived from it
time_table = df.select(
    col('datetime').alias('start_time'),
    hour(col('datetime')).alias('hour'),
    dayofmonth(col('datetime')).alias('day'),
    weekofyear(col('datetime')).alias('week'),
    month(col('datetime')).alias('month'),
    year(col('datetime')).alias('year'),
).dropDuplicates(['start_time'])

In [None]:
 # store the time table as parquet under <output_data>time, partitioned by
 # year and then month; any existing output is overwritten
(
    time_table.write
    .partitionBy('year', 'month')
    .mode('overwrite')
    .parquet(f"{output_data}time")
)

___________________________________________________________________

In [None]:
 # load the songs table back from <output_data>songs (this rebinds `song_data`)
song_data = f"{output_data}songs"
song_df = spark.read.parquet(song_data)

# pair each log row with the song rows whose title equals the logged song name
# NOTE(review): the title is the only join key, so different songs that share a
# title would all match — confirm this is acceptable.
songplays_df = df.join(song_df, on=(df.song == song_df.title))

In [None]:
# list the column names available after the join
songplays_df.columns

In [None]:
# extract columns from joined song and log datasets to create songplays table
#
# start_time is taken from `datetime`, the same column time_table.start_time is
# built from, so the two tables can be joined on it (the raw `ts` value never
# equals those values).
#
# location is taken from `location`: the songs parquet joined in above only
# holds song_id, title, artist_id, year and duration, so `artist_location` is
# not a column of the joined data.
# NOTE(review): assumes the event log carries a `location` field — confirm
# with df.printSchema().
songplays_table = songplays_df.select(
    col('datetime').alias('start_time'),
    col('userId').alias('user_id'),
    col('level').alias('level'),
    col('song_id').alias('song_id'),
    col('artist_id').alias('artist_id'),
    col('sessionId').alias('session_id'),
    col('location').alias('location'),
    col('userAgent').alias('user_agent'),
    # year/month are derived from `datetime` and used as partition columns on write
    year('datetime').alias('year'),
    month('datetime').alias('month'),
)

In [None]:
# add a generated songplay_id to every row; per the PySpark documentation the
# ids from monotonically_increasing_id() are unique and increasing but not
# consecutive
songplays_table = songplays_table.withColumn('songplay_id', monotonically_increasing_id())

In [None]:
# store the songplays table as parquet under <output_data>songplays,
# partitioned by year and then month; any existing output is overwritten
(
    songplays_table.write
    .partitionBy('year', 'month')
    .mode('overwrite')
    .parquet(f"{output_data}songplays")
)