In [1]:
# import relevant packages
import pyspark 
from pyspark import SparkConf
from pyspark.sql import SparkSession 

In [2]:
# initiate spark
try:
    spark = SparkSession \
        .builder \
        .appName("Datalake_Dev") \
        .getOrCreate() 
    print("spark initiated") 
except Exception as e:
    print("spark failed")
    print(e) 

spark initiated


In [3]:
# collect the context details of this spark session 
spark.sparkContext.getConf().getAll() 

[('spark.rdd.compress', 'True'),
 ('spark.app.name', 'Datalake_Dev'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1622733986611'),
 ('spark.driver.port', '32831'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.host', '2508c848c488')]

In [4]:
# check spark session details 
spark

In [5]:
# unzip log files & song files
import zipfile

In [6]:
with zipfile.ZipFile("data/log-data.zip", 'r') as zipref:
    zipref.extractall("data/logs") 

In [7]:
with zipfile.ZipFile("data/song-data.zip", 'r') as zipref2:
    zipref2.extractall("data/songs") 

### Read test JSON files

Use the \data folder containing test file to run some example work

In [8]:
# set the path to all the files, use multiline option = true, read in all files
test_path_log = "data/logs/*.json" 
user_log = spark.read.option("multiline","true").json(test_path_log) 

In [9]:
# check schema 
user_log.printSchema() 

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [10]:
# check first line of dataframe 
user_log.show(n=1) 

+--------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+-------------+------+-------------+--------------------+------+
|  artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|         song|status|           ts|           userAgent|userId|
+--------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+-------------+------+-------------+--------------------+------+
|Harmonia|Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|
+--------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+-------------+------+-------------+----

#### Import some useful stuff

These may be needed according to etl.py

In [11]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType 

##### Step 1 - Create "time" table from logs data TS

In [12]:
# first - we need to create a timestamp, through a UDF, from the 'ts' column in our dataframe 
get_timestamp = udf(lambda x: int(int(x)//1000)) 
user_log = user_log.withColumn("timestamp", get_timestamp(user_log.ts).cast(IntegerType()))  
user_log.show(n=1) 

+--------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+-------------+------+-------------+--------------------+------+----------+
|  artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|         song|status|           ts|           userAgent|userId| timestamp|
+--------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+-------------+------+-------------+--------------------+------+----------+
|Harmonia|Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|1542241826|
+--------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+-----

In [13]:
# look at new schema again 
user_log.printSchema()  

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- timestamp: integer (nullable = true)



In [14]:
# Second- create UDF that collects a datetime column from the new timestamp column 
get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x)/1000.0)))  

In [15]:
user_log = user_log.withColumn('datetime', get_datetime(user_log.ts)) 
user_log.show(n=1) 

+--------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+-------------+------+-------------+--------------------+------+----------+--------------------+
|  artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|         song|status|           ts|           userAgent|userId| timestamp|            datetime|
+--------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+-------------+------+-------------+--------------------+------+----------+--------------------+
|Harmonia|Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|1542241826|2018-11-15 00:30:...|
+--------+---------+---------+------+-------------+-

##### Now create the time table

In [16]:
# place into Temp View
user_log.createOrReplaceTempView("user_log_table") 

In [19]:
time_query = """
SELECT
    a.start_time,
    hour(a.start_time) as hour,
    dayofmonth(a.start_time) as day,
    weekofyear(a.start_time) as week,
    month(a.start_time) as month,
    year(a.start_time) as year,
    dayofweek(a.start_time) as weekday
    
FROM
    (SELECT DISTINCT 
        t1.datetime as start_time
            FROM user_log_table as t1
    ) as a 
ORDER BY a.start_time 
"""

In [20]:
# run query & show results 
spark.sql(time_query).show(n=3) 

+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-01 20:57:...|  20|  1|  44|   11|2018|      5|
|2018-11-02 01:25:...|   1|  2|  44|   11|2018|      6|
|2018-11-03 01:04:...|   1|  3|  44|   11|2018|      7|
+--------------------+----+---+----+-----+----+-------+
only showing top 3 rows



### Now lets focus on `Users` table

we need:
- user_id
- first_name
- last_name
- gender
- level 

In [21]:
user_query = """
SELECT DISTINCT
    a.userID as user_id,
    a.firstName as first_name,
    a.lastName as last_name,
    a.gender,
    a.level
    
FROM user_log_table as a 
WHERE 
    a.userID IS NOT NULL 
AND a.ts = (
            SELECT MAX(b.ts) FROM user_log_table as b WHERE b.userID = a.userID
            )  
"""

In [23]:
# run query & show results 
spark.sql(user_query).show(n=3) 

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     37|    Jordan|    Hicks|     F| free|
|     15|      Lily|     Koch|     F| paid|
|     95|      Sara|  Johnson|     F| paid|
+-------+----------+---------+------+-----+
only showing top 3 rows



---------------------

#### Read in songplay JSON data

In [26]:
import os 

use the os.walk to go through each sub file and read all JSON files found

In [27]:
path = "data/songs/song_data/"
songs_list = [] 
for root, directories, files in os.walk(path):
    for file in files:
        filepath = os.path.join(root, file) 
        if filepath.endswith('.json'):
            songs_list.append(filepath) 

In [28]:
print(songs_list) 

['data/songs/song_data/A/A/A/TRAAAAW128F429D538.json', 'data/songs/song_data/A/A/A/TRAAAVG12903CFA543.json', 'data/songs/song_data/A/A/A/TRAAARJ128F9320760.json', 'data/songs/song_data/A/A/A/TRAAAPK128E0786D96.json', 'data/songs/song_data/A/A/A/TRAAABD128F429CF47.json', 'data/songs/song_data/A/A/A/TRAAAMO128F1481E7F.json', 'data/songs/song_data/A/A/A/TRAAAMQ128F1460CD3.json', 'data/songs/song_data/A/A/A/TRAAAVO128F93133D4.json', 'data/songs/song_data/A/A/A/TRAAAEF128F4273421.json', 'data/songs/song_data/A/A/A/TRAAAFD128F92F423A.json', 'data/songs/song_data/A/A/A/TRAAADZ128F9348C2E.json', 'data/songs/song_data/A/A/B/TRAABNV128F425CEE1.json', 'data/songs/song_data/A/A/B/TRAABRB128F9306DD5.json', 'data/songs/song_data/A/A/B/TRAABLR128F423B7E3.json', 'data/songs/song_data/A/A/B/TRAABDL12903CAABBA.json', 'data/songs/song_data/A/A/B/TRAABJL12903CDCF1A.json', 'data/songs/song_data/A/A/B/TRAABVM128F92CA9DC.json', 'data/songs/song_data/A/A/B/TRAABYN12903CFD305.json', 'data/songs/song_data/A/A/B

In [29]:
# set the path to all the files, use multiline option = true, read in all files
songs = spark.read.option("multiline","true").json(songs_list)  

In [30]:
songs.printSchema() 

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [None]:
spark.stop() 