In [1]:
import os

# Bootstrap PySpark inside the notebook by running Spark's own interactive-shell
# startup script from $SPARK_HOME (raises KeyError if SPARK_HOME is not set).
# Per the banner printed below, this leaves a SparkSession bound to `spark`.
# exec(compile(...)) replaces the Python-2-only `execfile` builtin so the cell
# also runs on Python 3 kernels, and the `with` block closes the file handle.
shell_path = os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession

# Obtain a Hive-enabled session via getOrCreate(), which hands back an already
# running session when one exists.
# The master URL must be spelled "local[2]" (two local worker threads) with no
# space: "local [2]" is not a valid Spark master string and only went unnoticed
# because a session had already been started by shell.py.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local[2]")
    .getOrCreate()
)

## Explore dataset

#### data
- Location - /data/sample264
- Fields: trackId, userId, timestamp, artistId
    - trackId - id of the track
    - userId - id of the user
    - artistId - id of the artist
    - timestamp - timestamp of the moment the user starts listening to a track
  
#### meta
- Location - /data/meta
- Fields: type, Name, Artist, Id
    - type - either “track” or “artist”
    - Name - the title of the track if type == “track”, or the name of the musician or group if type == “artist”
    - Artist - the creator of the track if type == “track”, or the name of the musician or group if type == “artist”
    - Id - id of the item

In [3]:
# Parquet locations of the listening events and of the track/artist metadata.
DATA_PATH = "/data/sample264"
META_PATH = "/data/meta"

data = sparkSession.read.parquet(DATA_PATH)
meta = sparkSession.read.parquet(META_PATH)

In [4]:
# Preview the first five listening events.
data.show(n=5)

+------+-------+--------+----------+
|userId|trackId|artistId| timestamp|
+------+-------+--------+----------+
| 13065| 944906|  978428|1501588527|
|101897| 799685|  989262|1501555608|
|215049| 871513|  988199|1501604269|
|309769| 857670|  987809|1501540265|
|397833| 903510|  994595|1501597615|
+------+-------+--------+----------+
only showing top 5 rows



In [5]:
# Preview the first five metadata records.
meta.show(n=5)

+------+--------------------+--------------------+-------+
|  type|                Name|              Artist|     Id|
+------+--------------------+--------------------+-------+
| track|               Smile| Artist: Josh Groban|1223851|
| track|Chuni Ashkharhe Q...|Artist: Razmik Amyan|1215486|
| track|           Dark City|Artist: Machinae ...|1296462|
| track|       Not Sensitive|        Artist: Moby|1249694|
|artist|Artist: Carlos Pu...|Artist: Carlos Pu...|1352221|
+------+--------------------+--------------------+-------+
only showing top 5 rows



## Normalization can be done with the following function

In [6]:
from pyspark.sql import Window
# NOTE: `sum` here is the Spark column aggregate; importing it by this name
# shadows the Python builtin `sum` for the rest of the notebook.
# `col` is imported in this cell so that `norm` does not depend on a later cell.
from pyspark.sql.functions import col, row_number, sum


def norm(df, key1, key2, field, n):
    """Keep the top-`n` rows per `key1` and normalize `field` within each group.

    For every distinct value of `key1`, rows are ordered by `field` in
    descending order and only the first `n` are kept. `field` is then divided
    by the sum of `field` over the kept rows of the same `key1` group.

    Args:
        df: input Spark DataFrame containing the columns `key1` and `field`.
        key1: name of the column to group/partition by.
        key2: unused by this function; kept so existing callers keep working.
        field: name of the numeric column to rank by and normalize.
        n: maximum number of rows to keep per `key1` group.

    Returns:
        A cached DataFrame with the kept rows of `df` plus two extra columns:
        "sum_<field>" (the per-group total) and "norm_<field>"
        (`field` / "sum_<field>").
    """
    # Rank rows inside each key1 group from the largest `field` downwards.
    window = Window.partitionBy(key1).orderBy(col(field).desc())

    # row_number() is only a helper for the top-n cut, so drop it afterwards.
    topsDF = df.withColumn("row_number", row_number().over(window)) \
        .filter(col("row_number") <= n) \
        .drop("row_number")

    # groupBy already emits the grouping key, so only the sum is aggregated;
    # selecting col(key1) again would give tmpDF a duplicate key column.
    tmpDF = topsDF.groupBy(col(key1)).agg(sum(col(field)).alias("sum_" + field))

    # Joining on the column name keeps a single key1 column in the result.
    normalizedDF = topsDF.join(tmpDF, key1, "inner") \
        .withColumn("norm_" + field, col(field) / col("sum_" + field)) \
        .cache()

    return normalizedDF

In [7]:
from pyspark.sql import Window
from pyspark.sql.functions import col, rank

In [8]:
# Number of rows in `data` for every (userId, trackId) pair.
userTrack = data.groupBy(col("userId"), col("trackId")).count()

# Keep up to 1000 highest-count tracks per user, normalize the counts within
# each user, halve the normalized weight, and expose the pair under the generic
# column names (id, id2).
userTrackNorm = norm(userTrack, "userId", "trackId", "count", 1000).select(
    col("userId").alias("id"),
    col("trackId").alias("id2"),
    (col("norm_count") * 0.5).alias("norm_count")
)

userTrackNorm.show(n=5)

+----+------+-------------------+
|  id|   id2|         norm_count|
+----+------+-------------------+
|3175|947718|0.05555555555555555|
|3175|940951|0.05555555555555555|
|3175|845631|0.05555555555555555|
|3175|864690|0.05555555555555555|
|3175|831005|0.05555555555555555|
+----+------+-------------------+
only showing top 5 rows



In [9]:
# No partitionBy: rows are ranked across the whole DataFrame, in ascending
# order of norm_count.
window = Window.orderBy(col("norm_count"))

# rank() gives tied rows the same position, so the `< 50` cut can let through
# more than 49 rows when norm_count values repeat.
# The chain is parenthesised instead of using trailing backslashes; the original
# ended with a dangling `\` that joined the last call to the following blank line.
userTrackList = (
    userTrackNorm
    .withColumn("position", rank().over(window))
    .filter(col("position") < 50)
    .orderBy(col("id"), col("id2"))
    .select(col("id"), col("id2"))
)

userTrackList.show(5)

+------+------+
|    id|   id2|
+------+------+
|415763|853951|
|436158|889948|
|586043|800288|
|586043|800317|
|586043|801522|
+------+------+
only showing top 5 rows

