In [1]:
import os
# Start PySpark inside the notebook by executing Spark's own shell start-up script
# from $SPARK_HOME; the banner it prints reports the session as available under `spark`.
# NOTE(review): `execfile` exists only in Python 2, so this cell needs a Python 2 kernel.
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession

# Reuse the session that is already running, or create a local two-thread one.
# The master URL has to be spelled "local[2]" with no space: "local [2]" is not a
# valid Spark master URL and is rejected as soon as a new session really has to
# be created (it only went unnoticed because getOrCreate() found an existing one).
sparkSession = SparkSession.builder \
    .enableHiveSupport() \
    .master("local[2]") \
    .getOrCreate()

In [3]:
# Load the listening log (`data`) and the track/artist catalogue (`meta`) from Parquet.
data, meta = (
    sparkSession.read.parquet(location)
    for location in ("/data/sample264", "/data/meta")
)

## Explore dataset
+ Location - /data/sample264
+ Fields: trackId, userId, timestamp, artistId
    - trackId - id of the track
    - userId - id of the user
    - artistId - id of the artist
    - timestamp - timestamp of the moment the user starts listening to a track


+ Location - /data/meta
+ Fields: type, Name, Artist, Id
    - Type could be “track” or “artist”
    - Name is the title of the track if the type == “track” and the name of the musician or group if the type == “artist”.
    - Artist stands for the creator of the track when type == “track”, and for the name of the musician or group when type == “artist”.
    - Id - id of the item

In [4]:
# Preview five rows of the listening log to check the columns described above.
data.show(5)

+------+-------+--------+----------+
|userId|trackId|artistId| timestamp|
+------+-------+--------+----------+
| 13065| 944906|  978428|1501588527|
|101897| 799685|  989262|1501555608|
|215049| 871513|  988199|1501604269|
|309769| 857670|  987809|1501540265|
|397833| 903510|  994595|1501597615|
+------+-------+--------+----------+
only showing top 5 rows



In [5]:
# Preview five rows of the catalogue metadata to check the columns described above.
meta.show(5)

+------+--------------------+--------------------+-------+
|  type|                Name|              Artist|     Id|
+------+--------------------+--------------------+-------+
| track|               Smile| Artist: Josh Groban|1223851|
| track|Chuni Ashkharhe Q...|Artist: Razmik Amyan|1215486|
| track|           Dark City|Artist: Machinae ...|1296462|
| track|       Not Sensitive|        Artist: Moby|1249694|
|artist|Artist: Carlos Pu...|Artist: Carlos Pu...|1352221|
+------+--------------------+--------------------+-------+
only showing top 5 rows



## Normalization can be done with the following function

In [6]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, sum

def norm(df, key1, key2, field, n):
    """Keep the top-``n`` rows per ``key1`` and normalise ``field`` within each group.

    For every distinct value of ``key1`` the rows are ordered by ``field`` in
    descending order and only the first ``n`` are kept. A column
    ``norm_<field>`` is then added, holding ``field`` divided by the sum of
    ``field`` over the kept rows of the same group.

    Args:
        df: DataFrame containing at least the columns ``key1`` and ``field``.
        key1: name of the column that defines the groups.
        key2: name of the second key column. It is not used by the
            computation and is kept only so existing calls keep working.
        field: name of the numeric column to rank by and normalise.
        n: maximum number of rows kept per group.

    Returns:
        A cached DataFrame with the columns of ``df`` plus ``sum_<field>`` and
        ``norm_<field>``.
    """
    # Imported locally so the function does not depend on a later cell having
    # imported `col`, and so Spark's `sum` cannot be confused with the builtin.
    from pyspark.sql import Window, functions as F

    by_weight = Window.partitionBy(key1).orderBy(F.col(field).desc())

    # row_number() keeps at most n rows per group; which rows survive among
    # equal values of `field` is not specified by this ordering.
    topsDF = df.withColumn("row_number", F.row_number().over(by_weight)) \
        .filter(F.col("row_number") <= n) \
        .drop("row_number")

    # groupBy already carries key1 into the result, so only the sum is aggregated.
    sumsDF = topsDF.groupBy(key1).agg(F.sum(F.col(field)).alias("sum_" + field))

    normalizedDF = topsDF.join(sumsDF, key1, "inner") \
        .withColumn("norm_" + field, F.col(field) / F.col("sum_" + field)) \
        .cache()

    return normalizedDF

#### Examples

In [7]:
from pyspark.sql import Window
from pyspark.sql.functions import col, rank

In [8]:
# Example: user -> track edges weighted by play count, top 1000 per user,
# normalised and then multiplied by 0.5.
userTrack = data.groupBy("userId", "trackId").count()

userTrackNorm = norm(userTrack, "userId", "trackId", "count", 1000).select(
    col("userId").alias("id"),
    col("trackId").alias("id2"),
    (col("norm_count") * 0.5).alias("norm_count")
)

userTrackNorm.show(5)

+----+------+-------------------+
|  id|   id2|         norm_count|
+----+------+-------------------+
|3175|947718|0.05555555555555555|
|3175|940951|0.05555555555555555|
|3175|845631|0.05555555555555555|
|3175|864690|0.05555555555555555|
|3175|831005|0.05555555555555555|
+----+------+-------------------+
only showing top 5 rows



In [9]:
# Example: rank all edges by ascending norm_count, keep ranks below 50 and
# list the remaining (id, id2) pairs in id order.
window = Window.orderBy(col("norm_count"))

userTrackList = (
    userTrackNorm
    .withColumn("position", rank().over(window))
    .where(col("position") < 50)
    .orderBy("id", "id2")
    .select("id", "id2")
)

userTrackList.show(5)

+------+------+
|    id|   id2|
+------+------+
|415763|853951|
|436158|889948|
|586043|800288|
|586043|800317|
|586043|801522|
+------+------+
only showing top 5 rows



## Tasks

In [10]:
from pyspark.sql.functions import col, desc, asc, collect_list

## Task 1
- Build the edges of the type “track-track”. To do it you will need to count the collaborative similarity between all the tracks: 
    - if a user has started listening to the tracks A and B together in the limited time interval (equal to 7 minutes), then you should add 1 to the weight of the edge from vertex A to vertex B (initial weight is equal to 0). 
    - For each track, choose the top 40 most similar tracks (ordered by edge weight) and normalize the weights of its edges (divide the weight of each edge by the sum of the weights of all its edges).

- Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataFrame.

#### Note: $|A| \le B$ is equivalent to $-B \le A \le B$

In [11]:
SEVEN_MIN = 60 * 7

# Self-join the log on user: pair two plays of different tracks whose
# timestamps differ by at most SEVEN_MIN in either direction, then count how
# often each ordered (id1, id2) pair occurs.
tracks_tracks = (
    data.alias('data_1')
    .join(
        data.alias('data_2'),
        (col('data_1.userId') == col('data_2.userId'))
        & (col('data_1.trackId') != col('data_2.trackId'))
        & (col('data_1.timestamp') - col('data_2.timestamp')).between(-SEVEN_MIN, SEVEN_MIN),
        'inner'
    )
    .select(col('data_1.trackId').alias('id1'), col('data_2.trackId').alias('id2'))
    .groupBy('id1', 'id2')
    .count()
    .orderBy(col('count').desc())
)

tracks_tracks.show(5)

+------+------+-----+
|   id1|   id2|count|
+------+------+-----+
|870292|939606|  253|
|939606|870292|  253|
|854531|879259|  195|
|879259|854531|  195|
|933030|871513|  159|
+------+------+-----+
only showing top 5 rows



In [12]:
# Top 40 neighbours per track with normalised weights; strongest edges first,
# ties broken by ascending id1 then id2.
results = (
    norm(tracks_tracks, 'id1', 'id2', 'count', 40)
    .select('id1', 'id2', 'norm_count')
    .orderBy(col('norm_count').desc(), 'id1', 'id2')
)

results.show(5)

+------+------+----------+
|   id1|   id2|norm_count|
+------+------+----------+
|798256|923706|       1.0|
|798319|837992|       1.0|
|798322|876562|       1.0|
|798331|827364|       1.0|
|798335|840741|       1.0|
+------+------+----------+
only showing top 5 rows



In [13]:
# Take the 40 strongest edges, then list them in ascending (id1, id2) order as
# the task asks. The ordering of `results` alone only produces that order when
# all 40 rows share the same norm_count, so the sort is made explicit here.
top_edges = sorted(results.take(40), key=lambda edge: (edge['id1'], edge['id2']))
for row in top_edges:
    print("%s %s" % (row['id1'], row['id2']))

798256 923706
798319 837992
798322 876562
798331 827364
798335 840741
798374 816874
798375 810685
798379 812055
798380 840113
798396 817687
798398 926302
798405 867217
798443 905923
798457 918918
798460 891840
798461 940379
798470 840814
798474 963162
798477 883244
798485 955521
798505 905671
798545 949238
798550 936295
798626 845438
798691 818279
798692 898823
798702 811440
798704 937570
798725 933147
798738 894170
798745 799665
798782 956938
798801 950802
798820 890393
798833 916319
798865 962662
798931 893574
798946 946408
799012 809997
799024 935246


## Task 2
- Build the edges of the type “user-track”. Take the number of times the user listened to the track as the weight of the edge from the user's vertex to the track’s vertex. For each user take the top 1000 edges and normalize their weights.

- Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

In [14]:
# user -> track edges: one row per (user, track) with the number of plays,
# most played first, ties broken by ascending ids.
users_tracks = (
    data
    .select(col('userId').alias('id1'), col('trackId').alias('id2'))
    .groupBy('id1', 'id2')
    .count()
    .orderBy(col('count').desc(), 'id1', 'id2')
)

users_tracks.show(5)

+------+------+-----+
|   id1|   id2|count|
+------+------+-----+
|668849|817132|  277|
|560428|950984|   94|
|767478|870292|   94|
|278647|940362|   87|
|770607|830615|   76|
+------+------+-----+
only showing top 5 rows



In [15]:
# Top 1000 tracks per user with normalised weights; strongest edges first,
# ties broken by ascending id1 then id2.
results = (
    norm(users_tracks, 'id1', 'id2', 'count', 1000)
    .select('id1', 'id2', 'norm_count')
    .orderBy(col('norm_count').desc(), 'id1', 'id2')
)

results.show(5)

+---+------+----------+
|id1|   id2|norm_count|
+---+------+----------+
| 66|965774|       1.0|
|116|867268|       1.0|
|128|852564|       1.0|
|131|880170|       1.0|
|195|946408|       1.0|
+---+------+----------+
only showing top 5 rows



In [16]:
# Take the 40 strongest edges, then list them in ascending (id1, id2) order as
# the task asks. The ordering of `results` alone only produces that order when
# all 40 rows share the same norm_count, so the sort is made explicit here.
top_edges = sorted(results.take(40), key=lambda edge: (edge['id1'], edge['id2']))
for row in top_edges:
    print("%s %s" % (row['id1'], row['id2']))

66 965774
116 867268
128 852564
131 880170
195 946408
215 860111
235 897176
300 857973
321 915545
328 943482
333 818202
346 864911
356 961308
428 943572
431 902497
445 831381
488 841340
542 815388
617 946395
649 901672
658 937522
662 881433
698 935934
708 952432
746 879259
747 879259
776 946408
784 806468
806 866581
811 948017
837 799685
901 871513
923 879322
934 940714
957 945183
989 878364
999 967768
1006 962774
1049 849484
1057 920458


## Task 3
- Build the edges of the type “user-artist”. Take the number of times the user has listened to the artist’s tracks as the weight of the edge from the user’s vertex to the artist’s vertex. For each user take the top 100 artists and normalize the weights.

- Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

In [17]:
# user -> artist edges: one row per (user, artist) with the number of plays,
# most played first, ties broken by ascending ids.
users_artists = (
    data
    .select(col('userId').alias('id1'), col('artistId').alias('id2'))
    .groupBy('id1', 'id2')
    .count()
    .orderBy(col('count').desc(), 'id1', 'id2')
)

users_artists.show(5)

+------+-------+-----+
|   id1|    id2|count|
+------+-------+-----+
|668849| 994686|  277|
|436158|1003021|  142|
|442306|1001300|  107|
|560428| 975695|   94|
|767478| 991179|   94|
+------+-------+-----+
only showing top 5 rows



In [18]:
# Top 100 artists per user with normalised weights; strongest edges first,
# ties broken by ascending id1 then id2.
results = (
    norm(users_artists, 'id1', 'id2', 'count', 100)
    .select('id1', 'id2', 'norm_count')
    .orderBy(col('norm_count').desc(), 'id1', 'id2')
)

results.show(5)

+---+-------+----------+
|id1|    id2|norm_count|
+---+-------+----------+
| 66| 993426|       1.0|
|116| 974937|       1.0|
|128|1003021|       1.0|
|131| 983068|       1.0|
|195| 997265|       1.0|
+---+-------+----------+
only showing top 5 rows



In [19]:
# Take the 40 strongest edges, then list them in ascending (id1, id2) order as
# the task asks. The ordering of `results` alone only produces that order when
# all 40 rows share the same norm_count, so the sort is made explicit here.
top_edges = sorted(results.take(40), key=lambda edge: (edge['id1'], edge['id2']))
for row in top_edges:
    print("%s %s" % (row['id1'], row['id2']))

66 993426
116 974937
128 1003021
131 983068
195 997265
215 991696
235 990642
288 1000564
300 1003362
321 986172
328 967986
333 1000416
346 982037
356 974846
374 1003167
428 993161
431 969340
445 970387
488 970525
542 969751
612 987351
617 970240
649 973851
658 973232
662 975279
698 995788
708 968848
746 972032
747 972032
776 997265
784 969853
806 995126
811 996436
837 989262
901 988199
923 977066
934 990860
957 991171
989 975339
999 968823


## Task 4
- Build the edges of the type “artist-track”. Take the number of times the track has been listened to by all users as the weight of the edge from the artist’s vertex to the track’s vertex. For each artist take the top 100 tracks and normalize the weights.

- Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

In [20]:
# artist -> track edges: one row per (artist, track) with the number of plays
# across all users, most played first, ties broken by ascending ids.
artists_tracks = (
    data
    .select(col('artistId').alias('id1'), col('trackId').alias('id2'))
    .groupBy('id1', 'id2')
    .count()
    .orderBy(col('count').desc(), 'id1', 'id2')
)

artists_tracks.show(5)

+------+------+-----+
|   id1|   id2|count|
+------+------+-----+
|987351|886091| 2958|
|988199|871513| 2904|
|997265|946408| 2836|
|981306|864690| 2582|
|974503|858904| 2453|
+------+------+-----+
only showing top 5 rows



In [21]:
# Top 100 tracks per artist with normalised weights; strongest edges first,
# ties broken by ascending id1 then id2.
results = (
    norm(artists_tracks, 'id1', 'id2', 'count', 100)
    .select('id1', 'id2', 'norm_count')
    .orderBy(col('norm_count').desc(), 'id1', 'id2')
)

results.show(5)

+------+------+----------+
|   id1|   id2|norm_count|
+------+------+----------+
|967993|869415|       1.0|
|967998|947428|       1.0|
|968004|927380|       1.0|
|968017|859321|       1.0|
|968022|852786|       1.0|
+------+------+----------+
only showing top 5 rows



In [22]:
# Take the 40 strongest edges, then list them in ascending (id1, id2) order as
# the task asks. The ordering of `results` alone only produces that order when
# all 40 rows share the same norm_count, so the sort is made explicit here.
top_edges = sorted(results.take(40), key=lambda edge: (edge['id1'], edge['id2']))
for row in top_edges:
    print("%s %s" % (row['id1'], row['id2']))

967993 869415
967998 947428
968004 927380
968017 859321
968022 852786
968034 807671
968038 964150
968042 835935
968043 913568
968046 935077
968047 806127
968065 907906
968073 964586
968086 813446
968092 837129
968118 914441
968125 821410
968140 953008
968148 877445
968161 809793
968163 803065
968168 876119
968189 858639
968221 896937
968224 892880
968232 825536
968237 932845
968238 939177
968241 879045
968242 911250
968248 953554
968255 808494
968259 880230
968265 950148
968266 824437
968269 913243
968272 816049
968278 946743
968285 847460
968286 940006


## Task 5
- Construct balancing function where the edges of the type “user-track” and the edges of the type “user-artist” influence the final recommendations equally.

- For the user with Id 776748, find all the tracks and artists connected to them. Sort the found items first by artist, then by name, in ascending order, keep only the columns “Artist” and “Name”, and print the top 40.



In [23]:
# Id of the user whose connected tracks and artists are looked up in this task.
my_user = 776748

In [24]:
# Attach catalogue rows of type 'track' to the user-track edges and keep only
# the edges that start at `my_user`.
map_user_track = (
    users_tracks.alias('users_tracks')
    .join(
        meta.alias('meta'),
        (col('users_tracks.id2') == col('meta.id')) & (col('meta.type') == 'track')
    )
    .where(col('users_tracks.id1') == my_user)
)

map_user_track.show(5)

+------+-------+-----+-----+--------------------+--------------------+-------+
|   id1|    id2|count| type|                Name|              Artist|     Id|
+------+-------+-----+-----+--------------------+--------------------+-------+
|776748|1226775|    3|track|         Kill The DJ|   Artist: Green Day|1226775|
|776748|1197701|    2|track|    Nothing Going On|  Artist: Clawfinger|1197701|
|776748|1238423|    2|track|   Come Out and Play|Artist: The Offsp...|1238423|
|776748|1299891|    2|track|             21 Guns|   Artist: Green Day|1299891|
|776748|1160421|    1|track|Hard Rock Hallelujah|       Artist: Lordi|1160421|
+------+-------+-----+-----+--------------------+--------------------+-------+
only showing top 5 rows



In [25]:
# Attach catalogue rows of type 'artist' to the user-artist edges and keep only
# the edges that start at `my_user`.
map_user_artist = (
    users_artists.alias('users_artists')
    .join(
        meta.alias('meta'),
        (col('users_artists.id2') == col('meta.id')) & (col('meta.type') == 'artist')
    )
    .where(col('users_artists.id1') == my_user)
)

map_user_artist.show(5)

+------+-------+-----+------+--------------------+--------------------+-------+
|   id1|    id2|count|  type|                Name|              Artist|     Id|
+------+-------+-----+------+--------------------+--------------------+-------+
|776748|1358867|    5|artist|   Artist: Green Day|   Artist: Green Day|1358867|
|776748|1330944|    2|artist| Artist: Linkin Park| Artist: Linkin Park|1330944|
|776748|1343667|    2|artist|  Artist: Clawfinger|  Artist: Clawfinger|1343667|
|776748|1358472|    2|artist|Artist: The Offsp...|Artist: The Offsp...|1358472|
|776748|1331090|    1|artist|Artist: Serj Tankian|Artist: Serj Tankian|1331090|
+------+-------+-----+------+--------------------+--------------------+-------+
only showing top 5 rows



In [26]:
# Stack the user's tracks and artists into one (Name, Artist) table, sorted by
# artist and then by name, both ascending.
results = (
    map_user_track.select('Name', 'Artist')
    .union(map_user_artist.select('Name', 'Artist'))
    .orderBy('Artist', 'Name')
)

results.show(5)

+--------------------+--------------------+
|                Name|              Artist|
+--------------------+--------------------+
|Artist: 3 Doors Down|Artist: 3 Doors Down|
|          Kryptonite|Artist: 3 Doors Down|
|         Artist: 311|         Artist: 311|
|  Beautiful disaster|         Artist: 311|
|        Artist: Blur|        Artist: Blur|
+--------------------+--------------------+
only showing top 5 rows



In [27]:
# Print the first 40 items as "<Artist> <Name>", one per line.
for row in results.take(40):
    print("%s %s" % (row['Artist'], row['Name']))

Artist: 3 Doors Down Artist: 3 Doors Down
Artist: 3 Doors Down Kryptonite
Artist: 311 Artist: 311
Artist: 311 Beautiful disaster
Artist: Blur Artist: Blur
Artist: Blur Girls and Boys
Artist: Clawfinger Artist: Clawfinger
Artist: Clawfinger Nothing Going On
Artist: Disturbed Artist: Disturbed
Artist: Disturbed The Vengeful One
Artist: Gotthard Artist: Gotthard
Artist: Gotthard Eagle
Artist: Green Day 21 Guns
Artist: Green Day Artist: Green Day
Artist: Green Day Kill The DJ
Artist: Iggy Pop Artist: Iggy Pop
Artist: Iggy Pop Sunday
Artist: Korn Artist: Korn
Artist: Korn Here To Stay
Artist: Linkin Park Artist: Linkin Park
Artist: Linkin Park In The End
Artist: Linkin Park Numb
Artist: Lordi Artist: Lordi
Artist: Lordi Hard Rock Hallelujah
Artist: Nickelback Artist: Nickelback
Artist: Nickelback She Keeps Me Up
Artist: Nomy Artist: Nomy
Artist: Nomy Cocaine
Artist: Papa Roach Artist: Papa Roach
Artist: Papa Roach Getting Away With Murder
Artist: Rise Against Artist: Rise Against
Artist: Ri

## Task 6
- For the user with Id 776748 print top-40 recommended tracks. Build music recommendations with the algorithm described in the lesson 3 of the fifth week. Initialize coordinates of vector x_0 corresponding to the user’s vertex and vertices from the task 5 with ones and all other coordinates with zeros. Do 5 steps:
![](./img/Recommender.jpg)
- You should receive a table with 3 columns: “name”, “artist” and “rank”. Sort the resulting dataframe in descending order by “rank”, select the top 40 recommended tracks, select only the columns “name”, “artist” and “rank”, keep 5 digits after the decimal point in “rank” and print the resulting dataframe.

In [28]:
my_user = 776748