In [3]:
from datetime import datetime
import pandas as pd
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from datetime import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

In [4]:
# Get the active Spark session or create one; the spark.jars.packages setting
# asks Spark to fetch the hadoop-aws 2.7.0 artifact for the session.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [5]:
# Root directory of the song dataset. The read cells append a three-level
# "/*/*/*" glob to this path to reach the individual JSON files.
song_data = "data/song_data"

In [6]:
# Load every JSON file found three directory levels below the song data root,
# then show the first record as a pandas frame for a quick sanity check.
df_song = spark.read.json(f"{song_data}/*/*/*")
df_song.limit(1).toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARDR4AC1187FB371A1,,,,Montserrat Caballé;Placido Domingo;Vicente Sar...,511.16363,1,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,0


In [7]:
# songs table: project the song-level attributes out of the raw song records
songs_table = df_song.select("song_id", "title", "artist_id", "year", "duration")

In [None]:
# write songs table to parquet files partitioned by year and artist
# mode("overwrite") replaces the output of a previous run. With the default
# save mode the write raises if songs.parquet already exists, which made the
# notebook fail on a second "Run All".
songs_table.write.mode("overwrite").partitionBy("year","artist_id").parquet("songs.parquet")

In [29]:
# extract columns to create artists table
# df_song holds one record per song, so an artist with several songs would
# otherwise appear several times. dropDuplicates keeps a single row per
# artist_id; when an artist's descriptive fields differ between song records,
# which of those rows is kept is not specified.
artists_table = (
    df_song
    .select('artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude')
    .dropDuplicates(['artist_id'])
)

In [30]:
# write artists table to parquet files
# mode("overwrite") replaces the output of a previous run. With the default
# save mode the write raises if artists.parquet already exists, so the cell
# could not be re-run.
artists_table.write.mode("overwrite").parquet("artists.parquet")

In [12]:
# get filepath to log data file (root directory of the log dataset)
log_data = "data/log_data"

In [13]:
# Load the log records as JSON from the log data directory.
df = spark.read.format("json").load(log_data)

In [14]:
# Keep only song-play events: a song play is recorded as a log row whose
# page column equals 'NextSong'; every other page value is discarded.
df = df.where(col("page") == "NextSong")


In [15]:
# extract columns for users table
# df holds one row per song play, so the same user would otherwise be repeated
# for every play. dropDuplicates() with no arguments removes only rows that are
# identical in all selected columns, so a user whose level changed still
# appears once per distinct level and no information is discarded arbitrarily.
users_table = (
    df
    .select('userId', 'firstName', 'lastName', 'gender', 'level')
    .dropDuplicates()
)

In [16]:
# write users table to parquet files
# The write had been commented out, so the users table was never persisted
# even though every other table in the notebook is. mode("overwrite") lets the
# cell be re-run when user.parquet already exists.
users_table.write.mode("overwrite").parquet("user.parquet")

# show one filtered log record to inspect the available fields
df.take(1)

[Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', itemInSession=0, lastName='Smith', length=655.77751, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='Sehr kosmisch', status=200, ts=1542241826796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26')]

In [17]:
# create timestamp column from original timestamp column
# ts is an epoch value in milliseconds; dividing by 1000 yields epoch seconds,
# and casting a numeric column to TimestampType interprets it as seconds since
# the epoch (fractional part preserved).
# A native column expression is used instead of a Python UDF because it:
#   - avoids shipping every row through a Python worker,
#   - yields null for a null ts instead of raising inside the UDF,
#   - skips the naive local-time datetime round trip, which is ambiguous for
#     instants in a daylight-saving fall-back hour.
df = df.withColumn("start_time", (F.col("ts") / 1000).cast(T.TimestampType()))

In [37]:
# Expose df to Spark SQL under the name log_staging_table; the time-table and
# songplays queries in the following cells select from this view.
df.createOrReplaceTempView("log_staging_table")

In [32]:
# create columns for time table
time_table = spark.sql('''
    SELECT start_time,
    EXTRACT(hour from start_time) as hour,
    EXTRACT(day from start_time) as day,
    EXTRACT(week from start_time) as week,
    EXTRACT(month from start_time) as month,
    EXTRACT(year from start_time) as year,
    DAYOFWEEK(start_time) as weekday
    
    from log_staging_table
''').collect()


In [None]:
# write time table to parquet files partitioned by year and month
print(type(users_table))
print(type(time_table))
time_table_dataframe = spark.createDataFrame(time_table)
print(type(time_table_dataframe))
time_table_dataframe.limit(1).toPandas()


time_table_dataframe.write.partitionBy("year","month").parquet("time.parquet")

In [47]:

# read in song data to use for songplays table
song_df = spark.read.json(song_data + "/*/*/*")
song_df.createOrReplaceTempView("songs_staging_table")

# extract columns from joined song and log datasets to create songplays table
# The join matches on song title AND artist name: title alone is not unique,
# so a title-only join attributes a play to every song that happens to share
# that title, producing wrong song_id / artist_id values and extra rows.
# The query result is used directly as a DataFrame rather than being collected
# to the driver and rebuilt.
songplays_table = spark.sql('''
    SELECT
        a.start_time, a.userId, a.level, b.song_id,
        b.artist_id, a.sessionId, a.location, a.userAgent,
        EXTRACT(month from a.start_time) as month,
        EXTRACT(year from a.start_time) as year
    from log_staging_table a
    inner join songs_staging_table b
        on a.song = b.title
        and a.artist = b.artist_name
''')

# preview the first joined row
songplays_table.limit(1).toPandas()

Unnamed: 0,start_time,userId,level,song_id,artist_id,sessionId,location,userAgent,month,year
0,2018-11-21 21:56:47.796,15,paid,SOZCTXZ12AB0182364,AR5KOSW1187FB35FF4,818,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",11,2018


In [48]:
# write songplays table to parquet files partitioned by year and month
# mode("overwrite") replaces the output of a previous run. With the default
# save mode the write raises if songplays.parquet already exists, so the cell
# could not be re-run.
songplays_table.write.mode("overwrite").partitionBy("year","month").parquet("songplays.parquet")