In [1]:
import sys
import pandas as pd
from pyspark.sql.functions import round as spark_round

sys.path.append('..')
from etl import create_spark_session
from schemas import song_data_schema, log_data_schema, user_table_schema, song_table_schema, artist_table_schema, songplay_table_schema
from pipelines import basic_pipeline

In [2]:
# Create the Spark session through the project's ETL helper (local=True).
spark = create_spark_session(local=True)

# Restrict Spark's own logging to ERROR so cell outputs stay readable.
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

# Final expression: the notebook renders the session object.
spark

In [3]:
# Event logs: JSON files read with the project-declared schema; nested
# directories are searched because recursiveFileLookup is enabled.
df_log_data = (
    spark.read
    .format('json')
    .schema(log_data_schema)
    .option("recursiveFileLookup", True)
    .load('s3a://udacity-dend/log_data')
)

# Song metadata: same reader settings, different schema and location.
df_song_data = (
    spark.read
    .format('json')
    .schema(song_data_schema)
    .option("recursiveFileLookup", True)
    .load('s3a://udacity-dend/song_data')
)

# A log row matches a song when the titles are equal AND the logged length
# equals the song duration once both are rounded to 4 decimal places.
# LEFT join: every log row is kept, unmatched rows carry NULL song columns.
df_joined = df_log_data.join(
    df_song_data,
    on=(
        (df_log_data.song == df_song_data.title)
        & (spark_round(df_log_data.length, 4) == spark_round(df_song_data.duration, 4))
    ),
    how='LEFT',
)

In [4]:
# Column rename map handed to basic_pipeline.
# NOTE(review): keys look like the target (snake_case) column names and values
# like the source column names - confirm against pipelines.basic_pipeline.
fields_to_rename = dict([
    ('start_time', 'ts'),
    ('user_id', 'userId'),
    ('first_name', 'firstName'),
    ('last_name', 'lastName'),
    ('name', 'artist_name'),
    ('location', 'artist_location'),
    ('latitude', 'artist_latitude'),
    ('longitude', 'artist_longitude'),
    ('session_id', 'sessionId'),
    ('user_agent', 'userAgent'),
])

# Per-column expression strings handed to basic_pipeline.
# NOTE(review): presumably evaluated as Spark SQL expressions after the rename
# step, since they refer to the renamed column names - verify in basic_pipeline.
transformations = dict([
    ('start_time', 'to_timestamp(start_time / 1000)'),
    ('user_id', 'INT(user_id)'),
])

# Build one pipeline from the rename map and the expression map; the cells
# below call it with (DataFrame, schema) tuples to produce each table.
my_basic_pipeline = basic_pipeline(
    fields_to_rename=fields_to_rename,
    transformations=transformations,
)

In [5]:
# User table: run the shared pipeline on (log data, user schema), then drop
# exact duplicate rows.
df_user = (
    my_basic_pipeline((df_log_data, user_table_schema))
    .distinct()
)

# Number of distinct user rows (shown as the cell output).
df_user.count()



107

In [6]:
# Song table: run the shared pipeline on (song data, song schema).
# Unlike the user and artist tables, no distinct() is applied here.
df_songs = my_basic_pipeline(
    (df_song_data, song_table_schema)
)

# Number of song rows (shown as the cell output).
df_songs.count()



14896

In [9]:
# Artist table: run the shared pipeline on (song data, artist schema), then
# drop exact duplicate rows.
df_artists = (
    my_basic_pipeline((df_song_data, artist_table_schema))
    .distinct()
)

# Number of distinct artist rows (shown as the cell output).
df_artists.count()



10025

In [12]:
# Songplay table: run the shared pipeline on (log/song LEFT join, songplay schema).
# NOTE(review): df_joined keeps every log row, so unless basic_pipeline filters
# rows this count covers all log events - confirm whether non-play events
# should be excluded before building songplays.
df_songplays = my_basic_pipeline(
    (df_joined, songplay_table_schema)
)

# Number of songplay rows (shown as the cell output).
df_songplays.count()



8056

In [14]:
# Count the songplay rows whose artist_id is populated, i.e. rows for which
# the LEFT join found a matching song record.
df_songplays.filter(df_songplays["artist_id"].isNotNull()).count()



322

In [15]:
# Stop the Spark session created at the top of the notebook.
spark.stop()