In [1]:
import os
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession
sparkSession = SparkSession.builder.enableHiveSupport().master("local [2]").getOrCreate()

In [3]:
data = sparkSession.read.parquet("/data/sample264")
meta = sparkSession.read.parquet("/data/meta")

## Explore dataset
+ Location - /data/sample264
+ Fields: trackId, userId, timestamp, artistId
    - trackId - id of the track
    - userId - id of the user
    - artistId - id of the artist
    - timestamp - timestamp of the moment the user starts listening to a track


+ Location - /data/meta
+ Fields: type, Name, Artist, Id
    - Type could be “track” or “artist”
    - Name is the title of the track if the type == “track” and the name of the musician or group if the type == “artist”.
    - Artist states for the creator of the track in case the type == “track” and for the name of the musician or group in case the type == “artist”.
    - Id - id of the item

In [4]:
# data.show(5)

In [5]:
# meta.show(5)

## Normalization could be done by next function

In [6]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, sum

def norm(df, key1, key2, field, n): 
    
    window = Window.partitionBy(key1).orderBy(col(field).desc())
        
    topsDF = df.withColumn("row_number", row_number().over(window)) \
        .filter(col("row_number") <= n) \
        .drop(col("row_number")) 
        
    tmpDF = topsDF.groupBy(col(key1)).agg(col(key1), sum(col(field)).alias("sum_" + field))
   
    normalizedDF = topsDF.join(tmpDF, key1, "inner") \
        .withColumn("norm_" + field, col(field) / col("sum_" + field)) \
        .cache()

    return normalizedDF

#### Examples

In [7]:
# from pyspark.sql import Window
# from pyspark.sql.functions import col, rank

In [8]:
# userTrack = data.groupBy(col("userId"), col("trackId")).count()

# userTrackNorm = norm(userTrack, "userId", "trackId", "count", 1000) \
#         .withColumn("id", col("userId")) \
#         .withColumn("id2", col("trackId")) \
#         .withColumn("norm_count", col("norm_count") * 0.5) \
#         .select(col("id"), col("id2"), col("norm_count"))

# userTrackNorm.show(5)

In [9]:
# window = Window.orderBy(col("norm_count"))
    
# userTrackList = userTrackNorm.withColumn("position", rank().over(window))\
#     .filter(col("position") < 50)\
#     .orderBy(col("id"), col("id2"))\
#     .select(col("id"), col("id2"))\

# userTrackList.show(5)

## Tasks

In [10]:
from pyspark.sql.functions import col, desc, asc, collect_list

## Task 1
- Build the edges of the type “track-track”. To do it you will need to count the collaborative similarity between all the tracks: 
    - if a user has started listening to the tracks A and B together in the limited time interval (equal to 7 minutes), then you should add 1 to the weight of the edge from vertex A to vertex B (initial weight is equal to 0). 
    - For each track choose top 40 tracks ordered by weight similar to it and normalize weights of its edges (divide the weight of each edge on a summary of weights of all edges).

- Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataFrame.

#### |A| <= B
#### -B <= A <= B

In [11]:
SEVEN_MIN = 60 * 7

tracks_tracks = data \
    .alias('data_1').join(data.alias('data_2'), \
        (col('data_1.userId') == col('data_2.userId')) &
        (col('data_1.trackId') != col('data_2.trackId')) &
        (
            (col('data_1.timestamp') - col('data_2.timestamp') <= SEVEN_MIN) &
            (col('data_1.timestamp') - col('data_2.timestamp') >= -SEVEN_MIN)
        ), 'inner') \
    .select(
        col('data_1.trackId').alias('id1'),
        col('data_2.trackId').alias('id2')
    ) \
    .groupBy(col('id1'), col('id2')).count() \
    .orderBy(desc('count'))

# tracks_tracks.show(5) 

In [12]:
tracks_tracks_norm = norm(tracks_tracks, 'id1', 'id2', 'count', 40) \
    .select(col('id1'), col('id2'), col('norm_count')) \
    .orderBy(desc('norm_count'), asc('id1'), asc('id2'))

# tracks_tracks_norm.show(5)

In [13]:
# for row in tracks_tracks_norm.take(40):
#     print row['id1'], row['id2']

## Task 2
- Build the edges of the type “user-track”. Take the amount of times the track was listened by the user as the weight of the edge from the user's vertex to the track’s vertex. For each user take top-1000 and normalize them.

- Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

In [14]:
users_tracks = data \
    .select(
        col('userId').alias('id1'),
        col('trackId').alias('id2')) \
    .groupBy('id1', 'id2').count() \
    .orderBy(desc('count'), asc('id1'), asc('id2'))

# users_tracks.show(5)

In [15]:
users_tracks_norm = norm(users_tracks, 'id1', 'id2', 'count', 1000) \
    .select(col('id1'), col('id2'), col('norm_count')) \
    .orderBy(desc('norm_count'), asc('id1'), asc('id2'))

# users_tracks_norm.show(5)

In [16]:
# for row in users_tracks_norm.take(40):
#     print row['id1'], row['id2']

## Task 3
- Build the edges of the type “user-artist”. Take the amount of times the user has listened to the artist’s tracks as the weight of the edge from the user’s vertex to the artist’s vertex. For each user take top-100 artists and normalize weights.

- Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

In [17]:
users_artists = data \
    .select(
        col('userId').alias('id1'),
        col('artistId').alias('id2')) \
    .groupBy('id1', 'id2').count() \
    .orderBy(desc('count'), asc('id1'), asc('id2'))

# users_artists.show(5) 

In [18]:
users_artists_norm = norm(users_artists, 'id1', 'id2', 'count', 100) \
    .select(col('id1'), col('id2'), col('norm_count')) \
    .orderBy(desc('norm_count'), asc('id1'), asc('id2'))

# users_artists_norm.show(5)

In [19]:
# for row in users_artists_norm.take(40):
#     print row['id1'], row['id2']

## Task 4
- Build the edges of the type “artist-track”. Take the amount of times the track HAS BEEN listened by all users as the weight of the edge from the artist’s vertex to the track’s vertex. For each artist take top-100 tracks and normalize weights.

- Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

In [20]:
artists_tracks = data \
    .select(
        col('artistId').alias('id1'),
        col('trackId').alias('id2')) \
    .groupBy('id1', 'id2').count() \
    .orderBy(desc('count'), asc('id1'), asc('id2'))

# artists_tracks.show(5)

In [21]:
artists_tracks_norm = norm(artists_tracks, 'id1', 'id2', 'count', 100) \
    .select(col('id1'), col('id2'), col('norm_count')) \
    .orderBy(desc('norm_count'), asc('id1'), asc('id2'))

# artists_tracks_norm.show(5)

In [22]:
# for row in artists_tracks_norm.take(40):
#     print row['id1'], row['id2']

## Task 5
- Construct balancing function where the edges of the type “user-track” and the edges of the type “user-artist” influence the final recommendations equally.

- For the user with Id 776748 find all the tracks and artists connected to him. Sort founded items first by artist then by name in ascending order, leave only columns ”Artist” and “Name” and print top-40



In [23]:
my_user = 776748

In [24]:
map_user_track = users_tracks \
    .alias('users_tracks').join(meta.alias('meta'),
        (col('users_tracks.id2') == col('meta.id')) &
        (col('meta.type') == 'track')) \
    .filter(col('users_tracks.id1') == my_user)


# map_user_track.show(5)

In [25]:
map_user_artist = users_artists \
    .alias('users_artists').join(meta.alias('meta'), \
        (col('users_artists.id2') == col('meta.id')) &
        (col('meta.type') == 'artist')) \
    .filter(col('users_artists.id1') == my_user)


# map_user_artist.show(5)

In [26]:
results =  \
    map_user_track.select('Name', 'Artist') \
        .union(
    map_user_artist.select('Name', 'Artist')) \
        .orderBy(asc('Artist'), asc('Name'))

# results.show(5)

In [27]:
# for row in results.take(40):
#     print row['Artist'], row['Name']

## Task 6
- For the user with Id 776748 print top-40 recommended tracks. Build music recommendations with the algorithm described in the lesson 3 of the fifth week. Initialize coordinates of vector x_0 corresponding to the user’s vertex and vertices from the task 5 with ones and all other coordinates with zeros. Do 5 steps:

![](./img/Recommender.jpg)

- You should receive a table with 3 collumns: “name”, “artist” and “rank”. Sort the resulting dataframe in descending order by “rank”, select top 40 recommended tracks, select only the columns “name”, “artist” and “rank”, leave 5 digits after the decimal point in “rank” and print the resulting dataframe.

In [28]:
my_user = 776748
alpha = 0.15
steps = 5

In [29]:
from pyspark.sql.functions import lit, least, sum

# x = (0,...,0,1,1,1,1,0,...,0): 1.0 if user, init Probability
x = \
    data.select(
        col('userId').alias('id'),
        lit(1.0).alias('w')).distinct() \
    .union(
    data.select(
        col('trackId').alias('id'),
        lit(0.0).alias('w')).distinct()) \
    .union(
    data.select(
        col('artistId').alias('id'),
        lit(0.0).alias('w')).distinct()).cache()


x.show(5)

+------+---+
|    id|  w|
+------+---+
|584813|1.0|
|343869|1.0|
|274288|1.0|
|682564|1.0|
|506870|1.0|
+------+---+
only showing top 5 rows



In [30]:
# # u = (0,...,0,0,0,1,0,0,...,0): 1 if user = my user
# u = x \
#     .select(
#         col('id'),
#         col('w').cast('Int')).cache()

# u.show(5)

+------+---+
|    id|  w|
+------+---+
|584813|  1|
|343869|  1|
|274288|  1|
|682564|  1|
|506870|  1|
+------+---+
only showing top 5 rows



In [31]:
# u = (0,...,0,0,0,1,0,0,...,0): 1 if user = my user
u = x \
    .select(
        col('id'),
        lit(0).alias('w')) \
    .filter(col('id') != my_user)

u_my_user = x \
    .filter(col('id') == my_user) \
    .select(
        col('id'),
        lit(1).alias('w'))

u = u.union(u_my_user).cache()
u.show(5)

In [32]:
# Build edges
edges = \
    tracks_tracks_norm \
        .union(
    users_tracks_norm) \
        .union(
    users_artists_norm) \
        .union(
    users_artists_norm) \
        .union(
    artists_tracks_norm) \
        .cache()

edges.show(5)

+------+------+----------+
|   id1|   id2|norm_count|
+------+------+----------+
|798256|923706|       1.0|
|798319|837992|       1.0|
|798322|876562|       1.0|
|798331|827364|       1.0|
|798335|840741|       1.0|
+------+------+----------+
only showing top 5 rows



In [33]:
def calculate_next_x(x, u, edges):
    next_v = \
        edges \
            .join( 
        x, (col('id2') == col('id')), 'left_outer').drop('id') \
            .withColumn('mul', col('norm_count') * col('w')) \
            .groupBy(col('id1')) \
            .agg(sum('mul').alias('sum')) \
            .cache()

    next_x = \
        u.withColumn('a', col('w') * alpha) \
            .join(
        next_v, col('id') == col('id1')) \
            .withColumn('b', col('sum') * (1 - alpha)) \
            .withColumn('res', col('a') + col('b')) \
            .select(
                'id',
                col('res').alias('w')) \
            .cache()
    
    res_x = \
        x.alias('x') \
            .join(
        next_x.alias('next_x'), col('x.id') == col('next_x.id'), 'left_outer') \
            .select(
                col('x.id').alias('id'),
                least(col('next_x.w'), col('x.w')).alias('w')) \
            .cache()
    
    return res_x


for i in range(steps):
    x = calculate_next_x(x, u, edges)

x.orderBy(desc('w')).show(5)

+----+----+
|  id|   w|
+----+----+
|3175|0.15|
|5518|0.15|
|5803|0.15|
|6654|0.15|
|7253|0.15|
+----+----+
only showing top 5 rows



In [34]:
meta.show(5)

+------+--------------------+--------------------+-------+
|  type|                Name|              Artist|     Id|
+------+--------------------+--------------------+-------+
| track|               Smile| Artist: Josh Groban|1223851|
| track|Chuni Ashkharhe Q...|Artist: Razmik Amyan|1215486|
| track|           Dark City|Artist: Machinae ...|1296462|
| track|       Not Sensitive|        Artist: Moby|1249694|
|artist|Artist: Carlos Pu...|Artist: Carlos Pu...|1352221|
+------+--------------------+--------------------+-------+
only showing top 5 rows



In [35]:
results = \
    meta.alias('meta') \
        .join(
    x.alias('x'), col('meta.Id') == col('x.id')) \
        .orderBy(desc('w')) \
        .distinct() \
        .select(
            col('Artist'),
            col('Name'),
            col('w'))

results.show(5)

+--------------------+--------------------+---+
|              Artist|                Name|  w|
+--------------------+--------------------+---+
|        Artist: Blur|        Artist: Blur|0.0|
|Artist: Three Day...|I Hate Everything...|0.0|
|Artist: The Offsp...|   Come Out and Play|0.0|
|        Artist: Blur|      Girls and Boys|0.0|
|    Artist: Iggy Pop|    Artist: Iggy Pop|0.0|
+--------------------+--------------------+---+
only showing top 5 rows



In [37]:
for row in results.take(40):
    print '{r.Name} {r.Artist} {r.w:0.5f}'.format(r=row)

Artist: Blur Artist: Blur 0.00000
I Hate Everything About You Artist: Three Days Grace 0.00000
Come Out and Play Artist: The Offspring 0.00000
Girls and Boys Artist: Blur 0.00000
Artist: Iggy Pop Artist: Iggy Pop 0.00000
Kill The DJ Artist: Green Day 0.00000
Getting Away With Murder Artist: Papa Roach 0.00000
Take It Out On Me Artist: Thousand Foot Krutch 0.00000
Artist: Three Days Grace Artist: Three Days Grace 0.00000
Sky is Over Artist: Serj Tankian 0.00000
Artist: The Offspring Artist: The Offspring 0.00000
Artist: Rise Against Artist: Rise Against 0.00000
In The End Artist: Linkin Park 0.00000
Eagle Artist: Gotthard 0.00000
Sunday Artist: Iggy Pop 0.00000
Artist: Linkin Park Artist: Linkin Park 0.00000
Artist: Lordi Artist: Lordi 0.00000
Artist: 3 Doors Down Artist: 3 Doors Down 0.00000
Beautiful disaster Artist: 311 0.00000
Artist: Clawfinger Artist: Clawfinger 0.00000
Numb Artist: Linkin Park 0.00000
Kryptonite Artist: 3 Doors Down 0.00000
Artist: Slipknot Artist: Slipknot 0.000