In [1]:
from pyspark.sql import SparkSession
import os
import configparser
import pyspark.sql.functions as F

config = configparser.ConfigParser()

#Normally this file should be in ~/.aws/credentials
config.read_file(open('../dl.cfg'))

os.environ["AWS_ACCESS_KEY_ID"] =     config['OAWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['OAWS']['AWS_SECRET_ACCESS_KEY']
# os.environ["AWS_SESSION_TOKEN"] =     config['AWS']['AWS_SESSION_TOKEN']

In [2]:
spark = SparkSession.builder\
                     .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.3")\
                     .getOrCreate()
sc = spark.sparkContext
config = sc._jsc.hadoopConfiguration()
config.set("fs.s3a.endpoint", "s3.us-west-2.amazonaws.com")

In [3]:
df_events = spark.read.json("s3a://udacity-dend/log_data/2018/11/2018-11-05-events.json")
df_events = df_events = df_events.withColumn("start_time", F.to_timestamp(F.from_unixtime(F.col("ts") / 1000)) )
df_songs = spark.read.json("s3a://udacity-dend/song_data/A/B/Q/TRABQTA128F148D048.json")
df_events.createOrReplaceTempView("staging_events")
df_songs.createOrReplaceTempView("staging_songs")

In [4]:
df_events[["song"]].limit(5).toPandas()

Unnamed: 0,song
0,Almost Lover (Album Version)
1,Serve The Servants
2,See No Evil (Remastered LP Version)
3,Blues To Bechet (LP Version)
4,It's My Job To Keep Punk Rock Elite


In [5]:
df_songs.toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARSEMJF1187FB5641E,,Sweden,,The Hellacopters,230.1122,1,SOJOVBQ12A6D4F96B6,A Heart Without A Home,2000


In [6]:
import functools
spark = SparkSession.builder.appName('sparkdf').getOrCreate()
  
# list  of F.college data with two lists
data = [["ARSEMJF1187FB5641E", "", "", "", \
         "Nirvana", 12, 3, "SOJOVBQ12A6D4F96B4", "It's My Job To Keep Punk Rock Elite", 2014]]
  
# giving F.column names of dataframe
columns = ["artist_id", "artist_latitude", "artist_location", "artist_longitude",\
           "artist_name", "duration", "num_songs", "song_id", "title", "year"]
  
# creating a dataframe
dataframe = spark.createDataFrame(data, columns)

# https://www.geeksforgeeks.org/merge-two-dataframes-in-pyspark/
def unionAll(dfs):
  return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 
df_songs = unionAll([df_songs, dataframe])

In [7]:
df_songs.limit(5).toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARSEMJF1187FB5641E,,Sweden,,The Hellacopters,230.1122,1,SOJOVBQ12A6D4F96B6,A Heart Without A Home,2000
1,ARSEMJF1187FB5641E,,,,Nirvana,12.0,3,SOJOVBQ12A6D4F96B4,It's My Job To Keep Punk Rock Elite,2014


In [8]:
df_events.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId,start_time
0,A Fine Frenzy,Logged In,Anabelle,F,0,Simpson,267.91138,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1541044000000.0,256,Almost Lover (Album Version),200,1541377992796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",69,2018-11-05 00:33:12
1,Nirvana,Logged In,Aleena,F,0,Kirby,214.77832,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541023000000.0,237,Serve The Servants,200,1541381242796,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,44,2018-11-05 01:27:22
2,Television,Logged In,Aleena,F,1,Kirby,238.49751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541023000000.0,237,See No Evil (Remastered LP Version),200,1541381456796,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,44,2018-11-05 01:30:56
3,JOHN COLTRANE,Logged In,Aleena,F,2,Kirby,346.43546,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541023000000.0,237,Blues To Bechet (LP Version),200,1541381694796,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,44,2018-11-05 01:34:54
4,NOFX,Logged In,Aleena,F,3,Kirby,80.79628,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541023000000.0,237,It's My Job To Keep Punk Rock Elite,200,1541382040796,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,44,2018-11-05 01:40:40


In [9]:

songplay_table = df_events.alias("logs")\
    .join(df_songs.alias("songs"), df_events.artist == df_songs.artist_name, 'inner')
songplay_table = songplay_table.withColumnRenamed("firstName","first_name")
songplay_table = songplay_table.withColumnRenamed("lastName","last_name")
songplay_table = songplay_table.withColumnRenamed("datetime","start_time")

In [10]:
songplay_table = songplay_table.withColumnRenamed("userId","user_id")
songplay_table = songplay_table.withColumnRenamed("sessionId","session_id")
songplay_table = songplay_table.withColumnRenamed("artist_location","location")
songplay_table = songplay_table.withColumnRenamed("userAgent","user_agent")
songplay_table = songplay_table.withColumn("songplay_id", F.monotonically_increasing_id())

In [11]:
songplay_table.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- session_id: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: string (nullable = true)
 |-- location: string (nullable = true)
 |-- artist_longitude: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = t

In [12]:
songplay_table.toPandas()

Unnamed: 0,artist,auth,first_name,gender,itemInSession,last_name,length,level,location,method,...,artist_latitude,location.1,artist_longitude,artist_name,duration,num_songs,song_id,title,year,songplay_id
0,Nirvana,Logged In,Aleena,F,0,Kirby,214.77832,paid,"Waterloo-Cedar Falls, IA",PUT,...,,,,Nirvana,12.0,3,SOJOVBQ12A6D4F96B4,It's My Job To Keep Punk Rock Elite,2014,8589934592


In [13]:
songplay_table = songplay_table.withColumn('year',  F.year(F.col('start_time'))) 
songplay_table = songplay_table.withColumn('month', F.month(F.col('start_time'))) 

In [14]:
songplay_table[["songplay_id"]].toPandas()

Unnamed: 0,songplay_id
0,8589934592


In [15]:
songplay_table.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- session_id: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: string (nullable = true)
 |-- location: string (nullable = true)
 |-- artist_longitude: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = t

In [16]:
songplay_table[["start_time"]].toPandas()

Unnamed: 0,start_time
0,2018-11-05 01:27:22


In [17]:
songplay_table = songplay_table.withColumn('year', F.year(F.col('start_time'))) 
songplay_table = songplay_table.withColumn('month', F.month(F.col('start_time'))) 

In [18]:
songplay_table[["year"]].toPandas()

Unnamed: 0,year
0,2018


In [19]:
songplay_table[["month"]].toPandas()

Unnamed: 0,month
0,11


In [20]:
songplay_table.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- session_id: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: string (nullable = true)
 |-- location: string (nullable = true)
 |-- artist_longitude: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = t

In [24]:
songplay_table = songplay_table[["songplay_id", "start_time", "user_id", "level", "song_id", "artist_id", "session_id", "logs.location", "user_agent"]]
songplay_table.toPandas()

Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
0,8589934592,2018-11-05 01:27:22,44,paid,SOJOVBQ12A6D4F96B4,ARSEMJF1187FB5641E,237,"Waterloo-Cedar Falls, IA",Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...
