In [1]:
!python3 -m pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 68kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 45.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=ab430d9ffab8faf3f55daf290313a5925da7fe04f0c4b490444c9d5cb2f7bfa0
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [4]:
spark = SparkSession.builder \
  .appName('test') \
  .getOrCreate()

In [5]:
df = spark.read.parquet("/content/drive/MyDrive/datasets/sparkify_event_data.parquet")
df = df.withColumnRenamed("ts","ts_temp").withColumn("ts", (F.col("ts_temp") / 1000).cast(T.TimestampType())).drop("ts_temp")
df.cache()

DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: bigint, lastName: string, length: double, level: string, location: string, method: string, page: string, registration: bigint, sessionId: bigint, song: string, status: bigint, userAgent: string, userId: string, ts: timestamp]

In [6]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- ts: timestamp (nullable = true)



In [7]:
up_df = df.select(["userId", "ts"]) \
  .filter(df.page == "Submit Upgrade") \
  .withColumnRenamed('ts', 'up_ts')

down_df = df.select(["userId", "ts"]) \
  .filter(df.page == "Submit Downgrade") \
  .withColumnRenamed('ts', 'down_ts')


In [8]:
user_df = df.select("userId").distinct()
user_up_df = user_df.join(up_df, up_df.userId == user_df.userId).drop(up_df.userId)

In [11]:
up_df.filter(up_df.userId == 1853227).show()

+-------+-------------------+
| userId|              up_ts|
+-------+-------------------+
|1853227|2018-11-30 16:10:09|
|1853227|2018-11-04 15:35:06|
|1853227|2018-11-16 14:39:31|
|1853227|2018-10-07 10:33:02|
+-------+-------------------+



In [12]:
user_up_df.filter(up_df.userId == 1853227).show()

+-------+-------------------+
| userId|              up_ts|
+-------+-------------------+
|1853227|2018-11-30 16:10:09|
|1853227|2018-11-04 15:35:06|
|1853227|2018-11-16 14:39:31|
|1853227|2018-10-07 10:33:02|
+-------+-------------------+



In [None]:
up_df.groupby(up_df.userId).agg(F.count(up_df.userId)).show()

+-------+-------------+
| userId|count(userId)|
+-------+-------------+
|1133196|            2|
|1690101|            1|
|1658815|            1|
|1770964|            1|
|1828442|            1|
|1880560|            1|
|1349958|            1|
|1000280|            1|
|1853227|            4|
|1627009|            1|
|1775420|            1|
|1339632|            1|
|1517899|            2|
|1633577|            2|
|1863821|            1|
|1311711|            1|
|1803077|            1|
|1994878|            2|
|1359175|            2|
|1875484|            2|
+-------+-------------+
only showing top 20 rows



In [16]:
down_df.filter(down_df.userId == 1133196).show()

+-------+-------------------+
| userId|            down_ts|
+-------+-------------------+
|1133196|2018-11-06 11:16:45|
+-------+-------------------+



In [40]:
up_df.filter(up_df.userId.isNull()).show()

+------+-----+
|userId|up_ts|
+------+-----+
+------+-----+



In [59]:
#where magic happen
down_df = down_df.withColumnRenamed("userId", "userIdTemp")
key_df = up_df.join(down_df,
  (down_df.userIdTemp == up_df.userId) & 
  (down_df.down_ts > up_df.up_ts), how="left") \
  .drop(F.col("userIdTemp")) \
  .groupBy(F.col("userId"), up_df.up_ts) \
  .agg(F.min(down_df.down_ts)) \
  .withColumnRenamed("max(userId)", "userId") \
  .withColumn("down_ts", 
    F.when(F.col("min(down_ts)").isNull(), '2099-12-31 00:00:00') \
    .otherwise(F.col("min(down_ts)"))) \
  .withColumn("isChurn", 
    F.when(F.col("min(down_ts)").isNull(), False).otherwise(True)) \
  .orderBy(up_df.up_ts)

In [56]:
key_df.count()

15135

In [60]:
key_df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- up_ts: timestamp (nullable = true)
 |-- min(down_ts): timestamp (nullable = true)
 |-- down_ts: string (nullable = true)
 |-- isChurn: boolean (nullable = false)



In [49]:
up_df.join(down_df,
  (down_df.userId == up_df.userId) & 
  (down_df.down_ts > up_df.up_ts), how="left").show()

+-------+-------------------+-------+-------------------+
| userId|              up_ts| userId|            down_ts|
+-------+-------------------+-------+-------------------+
|1000280|2018-10-19 01:49:58|1000280|2018-10-19 18:37:19|
|1030587|2018-10-16 17:52:24|   null|               null|
|1033297|2018-11-26 16:44:48|   null|               null|
|1057724|2018-10-25 11:23:22|   null|               null|
|1069552|2018-11-01 04:02:44|1069552|2018-11-10 14:23:27|
|1071308|2018-10-02 18:22:13|   null|               null|
|1083324|2018-10-01 08:52:47|1083324|2018-11-13 23:51:05|
|1083324|2018-11-27 13:19:31|   null|               null|
|1114507|2018-10-17 11:09:48|   null|               null|
|1133196|2018-11-21 02:30:40|   null|               null|
|1133196|2018-10-23 19:42:37|1133196|2018-11-06 11:16:45|
|1151194|2018-11-14 23:34:35|   null|               null|
|1200956|2018-11-06 15:00:43|   null|               null|
|1200956|2018-10-04 08:03:55|1200956|2018-10-09 21:43:36|
|1271218|2018-

In [44]:
df.filter(df.ts == '2018-10-01 00:02:02').select(["userId", "page"]).show()

+-------+--------------+
| userId|          page|
+-------+--------------+
|1531101|      NextSong|
|1712107|Submit Upgrade|
+-------+--------------+



In [None]:
df.withColumn("date", (df.ts / 1000).cast(T.TimestampType())) \
  .show(10)

+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+-------+-------------------+
|              artist|     auth|firstName|gender|itemInSession| lastName|   length|level|            location|method|           page| registration|sessionId|                song|status|           ts|           userAgent| userId|               date|
+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+-------+-------------------+
|              Redman|Logged In|   Nathan|     M|          496|   Turner|154.53995| paid|       Paragould, AR|   PUT|       NextSong|1537383771000|   228824|        Smoke Buddah|   200|1543616036000|"Mozilla/5.0 (Mac...|1768369|2018-11-30 22:13:56|
|   

In [None]:
df.withColumn("test", (df.ts / 1000).cast(T.IntegerType())) \
  .withColumn("test1", F.col("test").cast(T.TimestampType())) \
  .sort(F.col("test1"), ascending=False) \
  .show(100)

+--------------------+----------+---------+------+-------------+----------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+-------+----------+-------------------+
|              artist|      auth|firstName|gender|itemInSession|  lastName|   length|level|            location|method|           page| registration|sessionId|                song|status|           ts|           userAgent| userId|      test|              test1|
+--------------------+----------+---------+------+-------------+----------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+-------+----------+-------------------+
|             Shakira| Logged In|  Freddie|     M|           60|    Juarez|322.79465| free|Houston-The Woodl...|   PUT|       NextSong|1535904972000|    19979|               TÃÂº|   200|1543622402000|"Mozilla/5.0 

In [None]:
df.filter(df.userId == 1517899) \
  .select(["page", "song", "length", "ts", "itemInSession", "sessionId"]) \
  .withColumn("date", F.col("ts").cast(T.TimestampType())) \
  .sort(df.ts) \
  .show(200)

+---------------+--------------------+---------+-------------+-------------+---------+--------------------+
|           page|                song|   length|           ts|itemInSession|sessionId|                date|
+---------------+--------------------+---------+-------------+-------------+---------+--------------------+
|           Home|                null|     null|1538652632000|            1|      445|+50727-12-18 12:5...|
|       NextSong|           Auto Rock|260.64934|1538652649000|            2|      445|+50727-12-18 17:3...|
|Add to Playlist|                null|     null|1538652653000|            3|      445|+50727-12-18 18:4...|
|       NextSong|Love Gone To Wast...|253.88363|1538652909000|            4|      445|+50727-12-21 17:5...|
|       NextSong|         Ups & Downs|207.25506|1538653162000|            5|      445|+50727-12-24 16:0...|
|      Thumbs Up|                null|     null|1538653163000|            6|      445|+50727-12-24 16:2...|
|       NextSong|     Heads 

In [None]:
df.select("user").filter(df.page == "Submit Upgrade").

159

In [None]:
df.select(["userId","page","ts", "song"]) \
  .filter(df.userId == 30) \
  .filter(df.page != 'NextSong') \
  .sort("ts").show(150)

+------+----------------+-------------+----+
|userId|            page|           ts|song|
+------+----------------+-------------+----+
|    30| Add to Playlist|1538352905000|null|
|    30|     Thumbs Down|1538356524000|null|
|    30|       Thumbs Up|1538357649000|null|
|    30|       Thumbs Up|1538357991000|null|
|    30|     Roll Advert|1538360737000|null|
|    30| Add to Playlist|1538361575000|null|
|    30|      Add Friend|1538810631000|null|
|    30|            Home|1538948226000|null|
|    30|            Help|1538950549000|null|
|    30|            Home|1538950593000|null|
|    30|       Downgrade|1538953526000|null|
|    30|       Thumbs Up|1538962841000|null|
|    30|       Thumbs Up|1538963113000|null|
|    30|            Home|1538965154000|null|
|    30| Add to Playlist|1538966961000|null|
|    30|       Thumbs Up|1538967448000|null|
|    30| Add to Playlist|1538967453000|null|
|    30|       Thumbs Up|1538967968000|null|
|    30| Add to Playlist|1538967976000|null|
|    30| A

In [None]:
df.filter(df.page == 'Submit Downgrade').count()

63

In [None]:
df.filter(df.page.isin("Submit Downgrade")).select(["userId"]).distinct().show(100)

+------+
|userId|
+------+
|    54|
|    11|
|    30|
|    59|
|    85|
|    35|
|100008|
|100025|
|300002|
|    96|
|   100|
|    61|
|   131|
|   140|
|300004|
|300015|
|    77|
|100009|
|300011|
|100004|
|200003|
|100015|
|300023|
|100016|
|    95|
|    38|
|   103|
|    25|
|200009|
|200023|
|    92|
|    81|
|100012|
|200025|
|200020|
|   141|
|     9|
|    24|
|    20|
|200011|
|    49|
|300021|
|    39|
|    12|
|   109|
|    13|
|200019|
|    74|
|100018|
+------+



In [None]:
!gcloud auth login

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=L5hRywgsOtrjcwAGDfeMgF4QfW0NTW&prompt=consent&access_type=offline&code_challenge=nLreFFNWDBJjodvpR1cohN6ehdlb9OCQPmIKP9YmJ_E&code_challenge_method=S256

Enter verification code: 4/1AY0e-g4GJKg3KXunOKbchK4S0qF0x-rtIfm8gPUrO_c3GLX4nLWK1E_mYDU

You are now logged in as [afahmi13@gmail.com].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [None]:
!gsutil cp gs://udacity-dsnd/sparkify_event_data.json /content/drive/MyDrive/datasets/sparkify_event_data.json

Copying gs://udacity-dsnd/sparkify_event_data.json...
/ [1 files][ 11.9 GiB/ 11.9 GiB]   50.1 MiB/s                                   
Operation completed over 1 objects/11.9 GiB.                                     
