In [1]:
!python3 -m pip install pyspark



In [2]:
# Mount Google Drive into the Colab filesystem. Every dataset path used by the
# cells below lives under /content/drive/MyDrive/datasets.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Session handle used by every read in this notebook; the application is
# registered under the name 'test'.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

In [5]:
# `df` is the small ("mini") event log, read from JSON with an inferred schema.
# It backs the exploratory cells that reference `df` further down.
df = spark.read.json('/content/drive/MyDrive/datasets/mini_sparkify_event_data.json')

In [30]:
# `df1` is the full event log read from JSON.
# NOTE(review): this path is the destination of the `gsutil cp` cell (In [29])
# located at the end of the notebook, so that download must have run first.
df1 = spark.read.json('/content/drive/MyDrive/datasets/sparkify_event_data.json')

In [31]:
# Persist the full event log as Parquet; the next cell reads this copy back
# as `df3`.
# mode("overwrite") makes the cell safe to re-run: with the default save mode
# the write raises as soon as the target path already exists.
df1.write.mode("overwrite").parquet("/content/drive/MyDrive/datasets/sparkify_event_data.parquet")

In [33]:
# `df3` is the full event log, read back from the Parquet copy written by the
# previous cell. All full-dataset analysis below starts from `df3`.
df3 = spark.read.parquet("/content/drive/MyDrive/datasets/sparkify_event_data.parquet")

In [35]:
# Row count of the full dataset (sanity check on the Parquet copy).
df3.count()

26259199

In [36]:
# Column names and types of the full dataset.
df3.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [6]:
# Column names and types of the mini dataset, for comparison with the full one.
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [39]:
# Subscription-change events from the full dataset, reduced to the user id,
# the event timestamp and the page name.
#   up_df   -> rows whose page is "Submit Upgrade"
#   down_df -> rows whose page is "Submit Downgrade"
up_df = (
    df3
    .select("userId", "ts", "page")
    .where(df3.page == "Submit Upgrade")
)
down_df = (
    df3
    .select("userId", "ts", "page")
    .where(df3.page == "Submit Downgrade")
)

In [45]:
# Preview the per-user upgrade events.
# NOTE(review): `user_up_df` is only assigned in the cell numbered In [47],
# which sits *below* this one. On a fresh kernel, run that cell first (or move
# this cell after it), otherwise this raises NameError.
user_up_df.show()

+-------+-------------+--------------+
| userId|           ts|          page|
+-------+-------------+--------------+
|1000280|1539913798000|Submit Upgrade|
|1030587|1539712344000|Submit Upgrade|
|1033297|1543250688000|Submit Upgrade|
|1057724|1540466602000|Submit Upgrade|
|1069552|1541044964000|Submit Upgrade|
|1071308|1538504533000|Submit Upgrade|
|1083324|1538383967000|Submit Upgrade|
|1083324|1543324771000|Submit Upgrade|
|1114507|1539774588000|Submit Upgrade|
|1133196|1542767440000|Submit Upgrade|
|1133196|1540323757000|Submit Upgrade|
|1151194|1542238475000|Submit Upgrade|
|1200956|1541516443000|Submit Upgrade|
|1200956|1538640235000|Submit Upgrade|
|1271218|1538993088000|Submit Upgrade|
|1271218|1542263004000|Submit Upgrade|
|1271218|1539946427000|Submit Upgrade|
|1311711|1539018790000|Submit Upgrade|
|1338783|1538757622000|Submit Upgrade|
|1339632|1543035592000|Submit Upgrade|
+-------+-------------+--------------+
only showing top 20 rows



In [47]:
# One row per distinct user id in the full dataset.
user_df = df3.select("userId").distinct()

# Attach each user's "Submit Upgrade" events (inner join, so users without an
# upgrade are dropped). Joining on the column *name* yields a single `userId`
# column, giving the columns userId, ts, page directly. The previous form
# compared `up_df.userId` with `user_df.userId`, which both originate from
# `df3.userId`, and then needed a separate drop() to remove the duplicate key.
user_up_df = user_df.join(up_df, on="userId", how="inner")


+-------+-------------+--------------+-------+-------------+----------------+
| userId|           ts|          page| userId|           ts|            page|
+-------+-------------+--------------+-------+-------------+----------------+
|1000280|1539913798000|Submit Upgrade|1000280|1539974239000|Submit Downgrade|
+-------+-------------+--------------+-------+-------------+----------------+



In [48]:
import pyspark.sql.functions as F

In [59]:
# How many "Submit Upgrade" events each user has.
(
    up_df
    .groupBy("userId")
    .agg(F.count("userId"))
    .show()
)

+-------+-------------+
| userId|count(userId)|
+-------+-------------+
|1133196|            2|
|1690101|            1|
|1658815|            1|
|1770964|            1|
|1828442|            1|
|1880560|            1|
|1349958|            1|
|1000280|            1|
|1853227|            4|
|1627009|            1|
|1775420|            1|
|1339632|            1|
|1517899|            2|
|1633577|            2|
|1863821|            1|
|1311711|            1|
|1803077|            1|
|1994878|            2|
|1359175|            2|
|1875484|            2|
+-------+-------------+
only showing top 20 rows



In [60]:
# Inspect the upgrade events of one sample user.
# NOTE(review): the schema printed earlier lists `userId` as string while the
# literal here is an int; this presumably relies on Spark's implicit cast —
# confirm, or compare against "1133196".
user_up_df.filter(user_up_df.userId == 1133196).show()

+-------+-------------+--------------+
| userId|           ts|          page|
+-------+-------------+--------------+
|1133196|1542767440000|Submit Upgrade|
|1133196|1540323757000|Submit Upgrade|
+-------+-------------+--------------+



In [53]:
# Rename the downgrade timestamp so it can be told apart from the upgrade
# `ts` column when the two frames are joined in the next cell.
# NOTE(review): this rebinds `down_df` in place; later cells expect the
# renamed `down_ts` column.
down_df = down_df.withColumnRenamed("ts", "down_ts")

In [61]:
# For the sample user, pair every upgrade with each of that user's downgrades
# that has a later timestamp. The left join keeps upgrades that have no later
# downgrade (their downgrade columns come back null).
(
    user_up_df
    .filter(user_up_df.userId == 1133196)
    .join(
        down_df,
        (down_df.userId == user_up_df.userId)
        & (down_df.down_ts > user_up_df.ts),
        how="left",
    )
    .show()
)

+-------+-------------+--------------+-------+-------------+----------------+
| userId|           ts|          page| userId|      down_ts|            page|
+-------+-------------+--------------+-------+-------------+----------------+
|1133196|1542767440000|Submit Upgrade|   null|         null|            null|
|1133196|1540323757000|Submit Upgrade|1133196|1541503005000|Submit Downgrade|
+-------+-------------+--------------+-------+-------------+----------------+



In [15]:
# Peek at the first 50 mini-dataset events: which page was hit, the song (if
# any), the user's registration timestamp and the position within the session.
df.select(["page", "song", "registration", "itemInSession"]).show(50)

+---------------+--------------------+-------------+-------------+
|           page|                song| registration|itemInSession|
+---------------+--------------------+-------------+-------------+
|       NextSong|           Rockpools|1538173362000|           50|
|       NextSong|              Canada|1538331630000|           79|
|       NextSong|   Time For Miracles|1538173362000|           51|
|       NextSong|Knocking On Forbi...|1538331630000|           80|
|       NextSong|Harder Better Fas...|1538173362000|           52|
|       NextSong|      Don't Leave Me|1538331630000|           81|
|       NextSong|         Run Run Run|1538331630000|           82|
|       NextSong|Passengers (Old A...|1538173362000|           53|
|Add to Playlist|                null|1538173362000|           54|
|       NextSong|          Fuck Kitty|1538173362000|           55|
|       NextSong|   Walk On The Water|1538331630000|           83|
|    Roll Advert|                null|1538331630000|          

In [17]:
# Number of "Submit Upgrade" events in the mini dataset.
# The cell previously ended in a dangling `.` (a syntax error) and selected a
# column named "user", which the schema does not contain. The recorded output
# is a single integer, so a row count is what was intended.
df.filter(df.page == "Submit Upgrade").count()

159

In [24]:
# Timeline of everything user 30 did in the mini dataset except playing songs,
# ordered by event timestamp (up to 150 rows).
(
    df
    .select("userId", "page", "ts", "song")
    .where(df.userId == 30)
    .where(df.page != 'NextSong')
    .orderBy("ts")
    .show(150)
)

+------+----------------+-------------+----+
|userId|            page|           ts|song|
+------+----------------+-------------+----+
|    30| Add to Playlist|1538352905000|null|
|    30|     Thumbs Down|1538356524000|null|
|    30|       Thumbs Up|1538357649000|null|
|    30|       Thumbs Up|1538357991000|null|
|    30|     Roll Advert|1538360737000|null|
|    30| Add to Playlist|1538361575000|null|
|    30|      Add Friend|1538810631000|null|
|    30|            Home|1538948226000|null|
|    30|            Help|1538950549000|null|
|    30|            Home|1538950593000|null|
|    30|       Downgrade|1538953526000|null|
|    30|       Thumbs Up|1538962841000|null|
|    30|       Thumbs Up|1538963113000|null|
|    30|            Home|1538965154000|null|
|    30| Add to Playlist|1538966961000|null|
|    30|       Thumbs Up|1538967448000|null|
|    30| Add to Playlist|1538967453000|null|
|    30|       Thumbs Up|1538967968000|null|
|    30| Add to Playlist|1538967976000|null|
|    30| A

In [41]:
# Number of "Submit Downgrade" events in the mini dataset.
df.filter(df.page == 'Submit Downgrade').count()

63

In [36]:
# Distinct users in the mini dataset with at least one "Submit Downgrade"
# event (up to 100 ids shown).
(
    df
    .where(df.page == "Submit Downgrade")
    .select("userId")
    .distinct()
    .show(100)
)

+------+
|userId|
+------+
|    54|
|    11|
|    30|
|    59|
|    85|
|    35|
|100008|
|100025|
|300002|
|    96|
|   100|
|    61|
|   131|
|   140|
|300004|
|300015|
|    77|
|100009|
|300011|
|100004|
|200003|
|100015|
|300023|
|100016|
|    95|
|    38|
|   103|
|    25|
|200009|
|200023|
|    92|
|    81|
|100012|
|200025|
|200020|
|   141|
|     9|
|    24|
|    20|
|200011|
|    49|
|300021|
|    39|
|    12|
|   109|
|    13|
|200019|
|    74|
|100018|
+------+



In [27]:
# Interactive Google Cloud login, run before the `gsutil cp` download cell.
# NOTE(review): the saved output of this cell contains the pasted verification
# code and the signed-in account e-mail; clear this cell's output before
# sharing or committing the notebook.
!gcloud auth login

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=L5hRywgsOtrjcwAGDfeMgF4QfW0NTW&prompt=consent&access_type=offline&code_challenge=nLreFFNWDBJjodvpR1cohN6ehdlb9OCQPmIKP9YmJ_E&code_challenge_method=S256

Enter verification code: 4/1AY0e-g4GJKg3KXunOKbchK4S0qF0x-rtIfm8gPUrO_c3GLX4nLWK1E_mYDU

You are now logged in as [afahmi13@gmail.com].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [29]:
!gsutil cp gs://udacity-dsnd/sparkify_event_data.json /content/drive/MyDrive/datasets/sparkify_event_data.json

Copying gs://udacity-dsnd/sparkify_event_data.json...
/ [1 files][ 11.9 GiB/ 11.9 GiB]   50.1 MiB/s                                   
Operation completed over 1 objects/11.9 GiB.                                     
