# ETL
This notebook walks through the ETL steps applied to both the song data and the log data.

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one
# with default settings. The trailing expression echoes the session in the notebook.
spark = SparkSession.builder.getOrCreate()
spark

In [None]:
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql import functions as F


In [None]:
# Glob for the raw song files: three wildcard directory levels under
# data/song_data, then every .json file inside them.
song_data_path = os.path.join('data', 'song_data', *(['*'] * 3), '*.json')
song_data_path

In [None]:
# Load every song file matched by the glob into a single DataFrame.
df_song = spark.read.json(path=song_data_path)

In [None]:
# Songs table: keep only the song-level columns of the raw song data.
songs_table = df_song.select('song_id', 'title', 'artist_id', 'year', 'duration')

In [None]:
# Destination of the songs table (Parquet output).
song_table_path = os.path.join('output', 'songs.parquet')

# Overwrite any previous output so the notebook can be re-run; without an
# explicit mode the write raises as soon as the path already exists.
# This matches the save mode used for the users and songplay tables.
songs_table.write.mode('overwrite').parquet(song_table_path)

## Artist

In [None]:
# Artists table: project the artist columns out of the raw song data.
# NOTE(review): no de-duplication happens here, so there is one output row per
# input song record - confirm whether repeated artists are acceptable.
artists_table = df_song.select(
    'artist_id',
    'artist_name',
    'artist_location',
    'artist_latitude',
    'artist_longitude',
)

In [None]:
# Destination of the artists table (Parquet output).
artist_table_path = os.path.join('output', 'artist.parquet')

In [None]:
# Overwrite any previous output so the notebook can be re-run; without an
# explicit mode the write raises as soon as the path already exists.
# This matches the save mode used for the users and songplay tables.
artists_table.write.mode('overwrite').parquet(artist_table_path)

# Log data

In [None]:
# Glob for the raw log files: every .json file directly under data/log_data.
log_data_path = os.path.join('data', 'log_data', '*.json')
log_data_path

In [None]:
# Load every log file matched by the glob into a single DataFrame.
df_log = spark.read.json(path=log_data_path)

In [None]:
# Preview: pull the first three raw log rows back to the driver for inspection.
df_log.take(3)

## User

In [None]:
# Keep only the log events whose page is 'NextSong'; every later step in this
# notebook works on this subset.
df_log_next_song = df_log.where(col('page') == 'NextSong')

In [None]:
# Preview: first three rows after the NextSong filter.
df_log_next_song.take(3)

In [None]:
# Users table: project the user columns out of the NextSong events.
# NOTE(review): no de-duplication happens here, so there is one output row per
# log event - confirm whether repeated users are acceptable.
df_users = df_log_next_song.select(
    'userId',
    'firstName',
    'lastName',
    'gender',
    'level',
)

In [None]:
# Destination of the users table (Parquet output); echoed for a quick visual check.
user_table_path = os.path.join('output', 'users.parquet')
user_table_path

In [None]:
# Persist the users table as Parquet, replacing output from any earlier run.
df_users.write.parquet(user_table_path, mode='overwrite')

In [None]:
# Preview the NextSong events again before deriving the time columns.
df_log_next_song.take(3)

In [None]:
# Sanity check: display any NextSong events that have no `ts` value.
df_log_next_song.filter(df_log_next_song['ts'].isNull()).show()

## Time

In [None]:
# Imported in this cell so the fix is self-contained.
from pyspark.sql.types import TimestampType

# Convert `ts` to a timestamp; the value is divided by 1000 before conversion,
# i.e. it is treated as epoch milliseconds.
# The return type must be declared: udf() defaults to StringType, which would
# make `ts_timestamp` a string column instead of a real timestamp.
# A null `ts` is passed through as null rather than raising inside the UDF.
get_timestamp = udf(
    lambda x: None if x is None else datetime.fromtimestamp(x / 1000.0),
    TimestampType(),
)
df_log_next_song = df_log_next_song.withColumn("ts_timestamp", get_timestamp("ts"))

In [None]:
# Preview: first three rows, now including the derived `ts_timestamp` column.
df_log_next_song.take(3)

In [None]:
# Derive one calendar column per field from the play timestamp.
df_log_next_song = (
    df_log_next_song
    .withColumn('ts_year', year(col('ts_timestamp')))
    .withColumn('ts_month', month(col('ts_timestamp')))
    .withColumn('ts_day', dayofmonth(col('ts_timestamp')))
    .withColumn('ts_weekofyear', weekofyear(col('ts_timestamp')))
    .withColumn('ts_hour', hour(col('ts_timestamp')))
    # Day of the week (1 = Sunday ... 7 = Saturday). This column was previously
    # filled with weekofyear(), duplicating `ts_weekofyear`.
    .withColumn('ts_weekday', F.dayofweek(col('ts_timestamp')))
)

# Time table: the timestamp plus every derived calendar field. `ts_year` is
# included here; it was computed above but previously left out of the selection.
time_table = df_log_next_song.select(
    'ts_timestamp',
    'ts_hour',
    'ts_day',
    'ts_weekofyear',
    'ts_month',
    'ts_year',
    'ts_weekday',
)

In [None]:
# Preview: first three rows of the time table.
time_table.take(3)

## Songplay

In [None]:
# Expose both DataFrames to Spark SQL under the names used by the queries below.
# NextSong log events (including the derived time columns):
df_log_next_song.createOrReplaceTempView('logs_staging')
# Raw song metadata:
df_song.createOrReplaceTempView('songs_staging')

In [None]:
# Print the song data schema to check the column names used in the join below.
df_song.printSchema()

In [None]:
# Print the log data schema (including the derived ts_* columns) for the same check.
df_log_next_song.printSchema()

In [None]:
# Exploratory left join of log events to song metadata on artist name.
# The resulting DataFrame is neither assigned nor acted on here; it is only
# echoed as the cell's value.
spark.sql("""
            select 
                logs_staging.registration
            FROM logs_staging as logs_staging
            LEFT JOIN songs_staging as songs_staging
            on logs_staging.artist=songs_staging.artist_name
        """)

In [None]:
# Build the songplays table: every NextSong log event, left-joined to the song
# metadata so that events with no matching artist keep null song_id/artist_id.
# NOTE(review): the join matches on artist name only, so an event is repeated
# once for every songs_staging row with that artist - confirm whether the song
# title should also be part of the join condition.
# NOTE(review): `registration` is the first selected column, while the derived
# play time `ts_timestamp` is not selected at all - confirm this is intended.
songplays_table = spark.sql("""
            select 
                logs_staging.registration,
                logs_staging.userId,
                logs_staging.level,
                songs_staging.song_id,
                songs_staging.artist_id,
                logs_staging.sessionId,
                logs_staging.location,
                logs_staging.userAgent
            from logs_staging
            left join songs_staging
            on logs_staging.artist=songs_staging.artist_name
        """)

In [None]:
# Echo the DataFrame so the notebook shows its column names and types.
songplays_table

In [None]:
# Persist the songplays table as Parquet, partitioned by the `level` column and
# replacing output from any earlier run.
songplays_table.write.parquet(
    'output/songplay.parquet',
    mode='overwrite',
    partitionBy='level',
)