In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# Create (or reuse) the Spark session for this notebook, with the driver's
# maxResultSize setting configured to 3g.
spark = (
    SparkSession.builder
    .appName('sparkify')
    .config('spark.driver.maxResultSize', '3g')
    .getOrCreate()
)

In [3]:
# Load the labeled event log from GCS, dropping the 'userIdTemp' column,
# then print the resulting schema.
df = (
    spark.read
    .parquet("gs://udacity-dsnd/event_labeled.parquet")
    .drop('userIdTemp')
)
df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- up_ts: timestamp (nullable = true)
 |-- down_ts: string (nullable = true)
 |-- isChurn: boolean (nullable = true)
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- ts: timestamp (nullable = true)



In [5]:
def change_colname_join_df(join_df, suffix='_temp'):
    """Return ``join_df`` with ``suffix`` appended to every column name.

    Intended for the right-hand side of a join, so that its columns cannot
    clash with identically named columns on the left-hand side.

    Args:
        join_df: DataFrame whose columns should be renamed.
        suffix: Text appended to each column name (default ``'_temp'``).

    Returns:
        A new DataFrame with the same data and column order; ``join_df``
        itself is not modified.
    """
    # Rename everything in one projection. Renaming column by column with
    # withColumnRenamed would hit an already-suffixed column a second time
    # when the input contains both "x" and "x<suffix>", producing duplicate
    # names, and it adds one projection to the plan per column.
    return join_df.toDF(*[col_name + suffix for col_name in join_df.columns])

In [4]:
# Number of songs played ("NextSong" events) within each subscription,
# where a subscription is identified by (userId, up_ts, down_ts, isChurn).
n_song = (
    df.where(F.col("page") == "NextSong")
    .groupBy("userId", "up_ts", "down_ts", "isChurn")
    .count()
)
n_song.show(5)

+-------+-------------------+-------------------+-------+-----+
| userId|              up_ts|            down_ts|isChurn|count|
+-------+-------------------+-------------------+-------+-----+
|1481390|2018-11-12 14:13:37|2099-12-31 00:00:00|  false|  232|
|1221594|2018-11-24 21:57:34|2099-12-31 00:00:00|  false|  597|
|1630000|2018-10-09 17:04:56|2099-12-31 00:00:00|  false| 1408|
|1801448|2018-10-25 01:28:00|2099-12-31 00:00:00|  false|  460|
|1084874|2018-11-13 07:36:32|2099-12-31 00:00:00|  false|  596|
+-------+-------------------+-------------------+-------+-----+
only showing top 5 rows



In [6]:
# Number of days in each subscription.
# For churned subscriptions the length runs from up_ts to down_ts; otherwise
# it runs from up_ts to the latest event timestamp in the data set (the
# non-churned rows shown below carry a far-future down_ts, which would
# otherwise inflate the length).
maxdate_df = df.agg(F.max("ts"))  # single row, single column named "max(ts)"

datediff_df = (
    df.select("userId", "up_ts", "down_ts", "isChurn")
    .dropDuplicates()
    # maxdate_df has exactly one row, so a cross join attaches the global
    # maximum timestamp to every subscription row. This replaces a left join
    # on an always-true-looking predicate; as a consequence rows with a null
    # userId now receive the maximum timestamp as well.
    .crossJoin(maxdate_df)
    .withColumn(
        "datediff",
        F.datediff(
            # down_ts is stored as a string; cast it so that both branches of
            # the conditional are timestamps.
            F.when(F.col("isChurn"), F.col("down_ts").cast(T.TimestampType()))
            .otherwise(F.col("max(ts)")),
            F.col("up_ts"),
        ),
    )
    .drop("max(ts)")
)

datediff_df.show(5)

+-------+-------------------+-------------------+-------+--------+
| userId|              up_ts|            down_ts|isChurn|datediff|
+-------+-------------------+-------------------+-------+--------+
|1481390|2018-11-12 14:13:37|2099-12-31 00:00:00|  false|      18|
|1221594|2018-11-24 21:57:34|2099-12-31 00:00:00|  false|       6|
|1630000|2018-10-09 17:04:56|2099-12-31 00:00:00|  false|      52|
|1801448|2018-10-25 01:28:00|2099-12-31 00:00:00|  false|      36|
|1084874|2018-11-13 07:36:32|2099-12-31 00:00:00|  false|      17|
+-------+-------------------+-------------------+-------+--------+
only showing top 5 rows



In [9]:
# Songs played per day of subscription: the "NextSong" count divided by the
# subscription length in days. A length of 0 days is replaced by 1 so the
# division never has a zero denominator.
jdf = change_colname_join_df(datediff_df)
song_rate = (
    df.filter(df.page == 'NextSong')
    .groupBy(["userId", "up_ts", "down_ts", "isChurn"])
    .count()
    .withColumnRenamed("count", "song_count")
    # Match each song count to its subscription length via user and start time.
    .join(jdf, (df.up_ts == jdf.up_ts_temp) & (df.userId == jdf.userId_temp))
    # Only datediff_temp is needed from the right-hand side.
    .drop("userId_temp", "up_ts_temp", "down_ts_temp", "isChurn_temp")
    .withColumn(
        "song_rate",
        F.col("song_count")
        / F.when(F.col("datediff_temp") == 0, 1).otherwise(F.col("datediff_temp")),
    )
)
song_rate.show(5)

+-------+-------------------+-------------------+-------+----------+-------------+------------------+
| userId|              up_ts|            down_ts|isChurn|song_count|datediff_temp|         song_rate|
+-------+-------------------+-------------------+-------+----------+-------------+------------------+
|1111091|2018-11-27 17:43:31|2099-12-31 00:00:00|  false|        90|            3|              30.0|
|1161080|2018-10-26 19:14:14|2018-11-23 21:55:06|   true|       648|           28|23.142857142857142|
|1291366|2018-10-01 04:22:41|2099-12-31 00:00:00|  false|      4114|           60| 68.56666666666666|
|1335330|2018-11-26 15:00:33|2099-12-31 00:00:00|  false|         3|            4|              0.75|
|1721316|2018-10-11 17:19:44|2099-12-31 00:00:00|  false|       218|           50|              4.36|
+-------+-------------------+-------------------+-------+----------+-------------+------------------+
only showing top 5 rows



In [11]:
# Number of "Add to Playlist" events per subscription.
(
    df.filter(df.page == "Add to Playlist")
    .select("userId", "up_ts", "down_ts", "isChurn", "page")
    .groupBy("userId", "up_ts", "down_ts", "isChurn")
    .agg(F.count("page"))
    .show(5)
)

+-------+-------------------+-------------------+-------+-----------+
| userId|              up_ts|            down_ts|isChurn|count(page)|
+-------+-------------------+-------------------+-------+-----------+
|1111091|2018-11-27 17:43:31|2099-12-31 00:00:00|  false|          2|
|1161080|2018-10-26 19:14:14|2018-11-23 21:55:06|   true|         12|
|1291366|2018-10-01 04:22:41|2099-12-31 00:00:00|  false|        113|
|1721316|2018-10-11 17:19:44|2099-12-31 00:00:00|  false|          9|
|1293361|2018-11-16 21:48:35|2018-11-21 15:29:13|   true|          8|
+-------+-------------------+-------------------+-------+-----------+
only showing top 5 rows



In [None]:
# Number of "Thumbs Up" and "Thumbs Down" events per subscription.
(
    df.select("userId", "up_ts", "down_ts", "isChurn", "page")
    .filter(F.col("page").isin(["Thumbs Up", "Thumbs Down"]))
    .groupBy(["userId", "up_ts", "down_ts", "isChurn"])
    .agg(
        # when() without otherwise() yields null for non-matching rows and
        # count() ignores nulls, so each expression counts one page type only.
        F.count(F.when(F.col("page") == "Thumbs Up", 1)).alias("thumbs_up"),
        F.count(F.when(F.col("page") == "Thumbs Down", 1)).alias("thumbs_down"),
    )
    .show(5)
)

In [12]:
# Preview up to 50 page values from events that are not "NextSong".
df.filter(df.page != "NextSong").select("page").show(50)

+--------------------+
|                page|
+--------------------+
|      Submit Upgrade|
|                Home|
|          Add Friend|
|           Thumbs Up|
|     Add to Playlist|
|           Downgrade|
|              Cancel|
|Cancellation Conf...|
|                Home|
|          Add Friend|
|                Help|
|         Thumbs Down|
|     Add to Playlist|
|           Thumbs Up|
|     Add to Playlist|
|           Downgrade|
|                Home|
|          Add Friend|
|                Home|
|                Home|
|           Thumbs Up|
|         Thumbs Down|
|     Add to Playlist|
|     Add to Playlist|
|           Downgrade|
|           Thumbs Up|
|                Home|
|              Logout|
|                Home|
|         Thumbs Down|
|          Add Friend|
|           Thumbs Up|
|           Thumbs Up|
|                Home|
|           Downgrade|
|              Logout|
|                Home|
|                Help|
|                Home|
|           Thumbs Up|
|          