In [3]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 63kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 18.9MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=d32eeae9e1bdc355d11032f584d687e6e77dba48fb93e07c4070ba1a1d0b2f1c
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [6]:
spark = SparkSession.builder.appName('sparkify') \
    .config('spark.driver.maxResultSize', '3g') \
    .getOrCreate()

In [7]:
df = spark.read.parquet("/content/drive/MyDrive/datasets/dsnd-sparkify/event_labeled.parquet")
df = df.drop('userIdTemp')
df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- up_ts: timestamp (nullable = true)
 |-- down_ts: string (nullable = true)
 |-- isChurn: boolean (nullable = true)
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- ts: timestamp (nullable = true)



In [8]:
def change_colname_join_df(join_df, suffix='_temp'):
    '''
    INPUT:
    join_df - dataframe on the right side of join
    suffix - added string on each column name
    '''
    temp_df = join_df
    for col_name in join_df.columns:
        temp_df = temp_df.withColumnRenamed(col_name, col_name + suffix)
    
    return temp_df

In [9]:
# number of song heard in one subscription
n_song = df.filter(df.page == "NextSong").groupBy(["userId", "up_ts", "down_ts", "isChurn"]).count()
n_song.show(5)

+-------+-------------------+-------------------+-------+-----+
| userId|              up_ts|            down_ts|isChurn|count|
+-------+-------------------+-------------------+-------+-----+
|1111091|2018-11-27 17:43:31|2099-12-31 00:00:00|  false|   90|
|1161080|2018-10-26 19:14:14|2018-11-23 21:55:06|   true|  648|
|1291366|2018-10-01 04:22:41|2099-12-31 00:00:00|  false| 4114|
|1335330|2018-11-26 15:00:33|2099-12-31 00:00:00|  false|    3|
|1721316|2018-10-11 17:19:44|2099-12-31 00:00:00|  false|  218|
+-------+-------------------+-------------------+-------+-----+
only showing top 5 rows



In [11]:
# number of day in subscription
maxdate_df = df.select("ts").agg(F.max(df.ts))

datediff_df = df.select(["userId", "up_ts", "down_ts", "isChurn"]).dropDuplicates() \
    .join(maxdate_df, ~df.userId.isNull(), how='left') \
    .withColumn("datediff", 
        F.datediff(F.when(F.col("isChurn"), F.col("down_ts")).otherwise(F.col("max(ts)").cast(T.TimestampType())), df.up_ts)) \
    .drop("max(ts)")

datediff_df.show(5)

+-------+-------------------+-------------------+-------+--------+
| userId|              up_ts|            down_ts|isChurn|datediff|
+-------+-------------------+-------------------+-------+--------+
|1111091|2018-11-27 17:43:31|2099-12-31 00:00:00|  false|       3|
|1161080|2018-10-26 19:14:14|2018-11-23 21:55:06|   true|      28|
|1291366|2018-10-01 04:22:41|2099-12-31 00:00:00|  false|      60|
|1335330|2018-11-26 15:00:33|2099-12-31 00:00:00|  false|       4|
|1721316|2018-10-11 17:19:44|2099-12-31 00:00:00|  false|      50|
+-------+-------------------+-------------------+-------+--------+
only showing top 5 rows



In [12]:
# number of song played per day
jdf = change_colname_join_df(datediff_df)
song_rate = df.filter(df.page == 'NextSong').groupBy(["userId", "up_ts", "down_ts", "isChurn"]).count() \
    .withColumnRenamed("count", "song_count") \
    .join(jdf, (df.up_ts == jdf.up_ts_temp) & (df.userId == jdf.userId_temp)) \
    .drop("userId_temp", "up_ts_temp", "down_ts_temp", "isChurn_temp") \
    .withColumn("song_rate", F.col("song_count") / F.when(F.col("datediff_temp") == 0, 1).otherwise(F.col("datediff_temp")))

song_rate.show(5)

+-------+-------------------+-------------------+-------+----------+-------------+------------------+
| userId|              up_ts|            down_ts|isChurn|song_count|datediff_temp|         song_rate|
+-------+-------------------+-------------------+-------+----------+-------------+------------------+
|1049289|2018-10-02 02:15:05|2099-12-31 00:00:00|  false|      1951|           59|33.067796610169495|
|1804721|2018-10-02 04:15:22|2099-12-31 00:00:00|  false|       957|           59|16.220338983050848|
|1455101|2018-10-02 23:28:08|2018-10-26 19:06:03|   true|      1536|           24|              64.0|
|1548612|2018-10-03 11:39:40|2018-11-27 18:27:03|   true|       619|           55|11.254545454545454|
|1379264|2018-10-03 13:45:24|2018-11-10 13:02:22|   true|      1858|           38| 48.89473684210526|
+-------+-------------------+-------------------+-------+----------+-------------+------------------+
only showing top 5 rows



In [None]:
# number of songs added to playlist
df.select(["userId", "up_ts", "down_ts", "isChurn", "page"]).filter(df.page =="Add to Playlist") \
    .groupBy(["userId", "up_ts", "down_ts", "isChurn"]) \
    .agg(F.count(F.col("page"))) \
    .show(5)

+-------+-------------------+-------------------+-------+-----------+
| userId|              up_ts|            down_ts|isChurn|count(page)|
+-------+-------------------+-------------------+-------+-----------+
|1111091|2018-11-27 17:43:31|2099-12-31 00:00:00|  false|          2|
|1161080|2018-10-26 19:14:14|2018-11-23 21:55:06|   true|         12|
|1291366|2018-10-01 04:22:41|2099-12-31 00:00:00|  false|        113|
|1721316|2018-10-11 17:19:44|2099-12-31 00:00:00|  false|          9|
|1293361|2018-11-16 21:48:35|2018-11-21 15:29:13|   true|          8|
+-------+-------------------+-------------------+-------+-----------+
only showing top 5 rows



In [28]:
# STILL A BIG PROBLEM
#number of thumbs up and down
df.select(["userId", "up_ts", "down_ts", "isChurn", "page"]).filter(df.page.isin(["Thumbs Up", "Thumbs Down"])) \
    .groupby(["userId", "up_ts", "down_ts", "isChurn"]) \
    .agg(
        # F.count(F.when(df.page == "Thumbs Up", True).otherwise(False)),
        F.count(F.when(df.page == "Thumbs Down", True).otherwise(False))
        ) \
    .show(5)

+-------+-------------------+-------------------+-------+--------------------------------------------------------------+
| userId|              up_ts|            down_ts|isChurn|count(CASE WHEN (page = Thumbs Down) THEN true ELSE false END)|
+-------+-------------------+-------------------+-------+--------------------------------------------------------------+
|1111091|2018-11-27 17:43:31|2099-12-31 00:00:00|  false|                                                             4|
|1161080|2018-10-26 19:14:14|2018-11-23 21:55:06|   true|                                                            32|
|1291366|2018-10-01 04:22:41|2099-12-31 00:00:00|  false|                                                           266|
|1721316|2018-10-11 17:19:44|2099-12-31 00:00:00|  false|                                                            16|
|1293361|2018-11-16 21:48:35|2018-11-21 15:29:13|   true|                                                            26|
+-------+-------------------+---

In [48]:
# average session length and number of session
df.groupBy(["userId", "up_ts", "sessionId"]) \
    .agg(
        F.min(df.ts).cast(T.LongType()).alias("min"),
        F.max(df.ts).cast(T.LongType()).alias("max")
    ) \
    .withColumn("diff", (F.col("max") - F.col("min"))) \
    .groupBy(["userId", "up_ts"]) \
    .agg(F.avg(F.col("diff")), F.count(F.col("sessionId"))) \
    .show()

+-------+-------------------+------------------+----------------+
| userId|              up_ts|         avg(diff)|count(sessionId)|
+-------+-------------------+------------------+----------------+
|1914133|2018-10-09 11:11:55|           22340.0|              12|
|1071843|2018-11-08 13:16:59|27020.272727272728|              11|
|1582360|2018-10-04 15:48:38|19012.117647058825|              34|
|1576394|2018-10-03 07:51:46| 19205.60606060606|              33|
|1957517|2018-10-11 04:46:39|          28964.75|              12|
|1738642|2018-10-23 16:00:35|16186.714285714286|               7|
|1968237|2018-10-27 11:19:26|           34908.8|               5|
|1480286|2018-10-05 08:08:07| 25844.91304347826|              23|
|1236500|2018-10-02 22:56:35|13711.457142857143|              35|
|1212815|2018-11-02 08:43:06|        36214.8125|              16|
|1450388|2018-10-29 10:57:02|           16987.0|               1|
|1166697|2018-10-14 22:06:54|           34100.5|               6|
|1740364|2

In [71]:
#device
df.select(["userId", "up_ts", "userAgent"]).withColumn("platform",
    F.when(df.userAgent.contains("Macintosh"), "macos") \
    .when(df.userAgent.contains("Windows"), "windows") \
    .when(df.userAgent.contains("iPad"), "ipad") \
    .when(df.userAgent.contains("iPhone"), "iphone") \
    .when(df.userAgent.contains("Linux"), "linux") \
    ).show()

+-------+-------------------+--------------------+--------+
| userId|              up_ts|           userAgent|platform|
+-------+-------------------+--------------------+--------+
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.0 (Mac...|   macos|
|1006381|2018-10-30 03:31:46|"Mozilla/5.

In [74]:
#state
df.select("location").withColumn("test", F.split(df.location, ', ')[1]).show()

+--------------------+----+
|            location|test|
+--------------------+----+
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
|Los Angeles-Long ...|  CA|
+--------------------+----+
only showing top 20 rows

