# Building a song recommender


## Fire up GraphLab Create

In [6]:
import graphlab
# Select the 'ipynb' Canvas target so the .show() visualizations used in this
# notebook are rendered in the notebook itself.
graphlab.canvas.set_target('ipynb')

# Load music data

In [8]:
# Load the listening data from the saved SFrame directory 'song_data.gl/'
# (a path relative to the notebook's working directory).
song_data = graphlab.SFrame('song_data.gl/')

# Explore data

The music data records how many times each user listened to a song, along with the details of that song.

In [19]:
# Visual overview of the whole song_data SFrame.
song_data.show()

## Showing the most popular songs in the dataset

In [18]:
# Visualize only the 'song' column to see which titles appear most often.
song_data['song'].show()

In [26]:
# Dataset size summary.
# print() is called as a function so this cell runs on both Python 2 and
# Python 3; the bare `print x` statement form is a SyntaxError on Python 3.
print(len(song_data))             # total rows in song_data
users = song_data['user_id'].unique()
print(len(users))                 # distinct user ids
songs = song_data['song_id'].unique()
print(len(songs))                 # distinct song ids
songs_byName = song_data['song'].unique()
print(len(songs_byName))          # distinct values of the 'song' column

1116609
66346
10000
9952


In [28]:
# Distinct users that have at least one row whose artist is 'Kanye West'.
kanye_west_rows = song_data[song_data['artist'] == 'Kanye West']
users_Kanye_West = kanye_west_rows['user_id'].unique()
len(users_Kanye_West)

2522

In [29]:
# Distinct users that have at least one row whose artist is 'Foo Fighters'.
foo_fighters_rows = song_data[song_data['artist'] == 'Foo Fighters']
users_Foo_Fighters = foo_fighters_rows['user_id'].unique()
len(users_Foo_Fighters)

2055

In [32]:
# Distinct users that have at least one row whose artist is 'Taylor Swift'.
taylor_swift_rows = song_data[song_data['artist'] == 'Taylor Swift']
users_Taylor_Swift = taylor_swift_rows['user_id'].unique()
len(users_Taylor_Swift)

3246

In [33]:
# Distinct users that have at least one row whose artist is 'Lady GaGa'
# (the capitalisation must match the dataset's spelling exactly).
lady_gaga_rows = song_data[song_data['artist'] == 'Lady GaGa']
users_Lady_GaGa = lady_gaga_rows['user_id'].unique()
len(users_Lady_GaGa)

2928

# Total listen counts per artist (groupby)

In [41]:
# One row per artist; 'total_count' is the sum of that artist's 'listen_count' values.
artist_groupby = song_data.groupby(
    key_columns='artist',
    operations={'total_count': graphlab.aggregate.SUM('listen_count')}
)

In [49]:
# Visual overview of the per-artist totals.
artist_groupby.show()

In [50]:
# Artists ordered from the highest to the lowest total listen count.
artist_groupby.sort('total_count', ascending=False)

artist,total_count
Kings Of Leon,43218
Dwight Yoakam,40619
Björk,38889
Coldplay,35362
Florence + The Machine,33387
Justin Bieber,29715
Alliance Ethnik,26689
OneRepublic,25754
Train,25402
The Black Keys,22184


In [51]:
# Artists ordered from the lowest to the highest total listen count.
artist_groupby.sort('total_count', ascending=True)

artist,total_count
William Tabbert,14
Reel Feelings,24
Beyoncé feat. Bun B and Slim Thug ...,26
Boggle Karaoke,30
Diplo,30
harvey summers,31
Nâdiya,36
Jody Bernal,38
Aneta Langerova,38
Kanye West / Talib Kweli / Q-Tip / Common / ...,38


In [83]:
# Total listen count for the artist 'Kanye West'.
artist_groupby[artist_groupby['artist'] == 'Kanye West'].head()

artist,total_count
Kanye West,9992


In [84]:
# Total listen count for the artist 'Taylor Swift'.
artist_groupby[artist_groupby['artist'] == 'Taylor Swift'].head()

artist,total_count
Taylor Swift,19376


In [85]:
# Total listen count for the artist 'Foo Fighters'.
artist_groupby[artist_groupby['artist'] == 'Foo Fighters'].head()

artist,total_count
Foo Fighters,9504


In [86]:
# Total listen count for the artist 'Lady GaGa'.
artist_groupby[artist_groupby['artist'] == 'Lady GaGa'].head()

artist,total_count
Lady GaGa,12224


## Count number of unique users in the dataset

# Create a song recommender

In [52]:
# Hold out a test set: a random split with fraction .8 going to train_data and
# the remainder to test_data. The fixed seed keeps the split repeatable.
train_data,test_data = song_data.random_split(.8,seed=0)

## Simple popularity-based recommender

In [53]:
# Baseline recommender trained on the training split, using 'user_id' as the
# user column and the 'song' column as the item identifier.
popularity_model = graphlab.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song'
)

PROGRESS: Recsys training: model = popularity
PROGRESS:     To use one of these as a target column, set target = <column_name>
PROGRESS:     and use a method that allows the use of a target.
PROGRESS: Preparing data set.
PROGRESS:     Data has 893580 observations with 66085 users and 9952 items.
PROGRESS:     Data prepared in: 0.731567s
PROGRESS: 893580 observations to process; with 9952 unique items.


### Use the popularity model to make some predictions

A popularity model makes the same prediction for all users, so it provides no personalization.

In [54]:
# Recommendations from the popularity model for the first entry of `users`.
popularity_model.recommend(users=[users[0]])

user_id,song,score,rank
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Sehr kosmisch - Harmonia,4754.0,1
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Undo - Björk,4227.0,2
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,You're The One - Dwight Yoakam ...,3781.0,3
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Dog Days Are Over (Radio Edit) - Florence + The ...,3633.0,4
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Revelry - Kings Of Leon,3527.0,5
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Horn Concerto No. 4 in E flat K495: II. Romance ...,3161.0,6
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Secrets - OneRepublic,3148.0,7
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Hey_ Soul Sister - Train,2538.0,8
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Fireflies - Charttraxx Karaoke ...,2532.0,9
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Tive Sim - Cartola,2521.0,10


## Build a song recommender with personalization

We now create a model that allows us to make personalized recommendations to each user. 

In [55]:
# Item-similarity recommender trained on the same split and columns as the
# popularity model, so the two can be compared directly.
personalized_model = graphlab.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song'
)

PROGRESS: Recsys training: model = item_similarity
PROGRESS:     To use one of these as a target column, set target = <column_name>
PROGRESS:     and use a method that allows the use of a target.
PROGRESS: Preparing data set.
PROGRESS:     Data has 893580 observations with 66085 users and 9952 items.
PROGRESS:     Data prepared in: 0.757767s
PROGRESS: Computing item similarity statistics:
PROGRESS: Computing most similar items for 9952 items:
PROGRESS: +-----------------+-----------------+
PROGRESS: | Number of items | Elapsed Time    |
PROGRESS: +-----------------+-----------------+
PROGRESS: | 1000            | 1.39556         |
PROGRESS: | 2000            | 1.42954         |
PROGRESS: | 3000            | 1.46382         |
PROGRESS: | 4000            | 1.49692         |
PROGRESS: | 5000            | 1.52993         |
PROGRESS: | 6000            | 1.56179         |
PROGRESS: | 7000            | 1.59472         |
PROGRESS: | 8000            | 1.63277         |
PROGRESS: | 9000         

### Applying the personalized model to make song recommendations

As you can see, different users get different recommendations now.

In [56]:
# Recommendations from the personalized model for the first entry of `users`.
personalized_model.recommend(users=[users[0]])

user_id,song,score,rank
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Riot In Cell Block Number Nine - Dr Feelgood ...,0.0375,1
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Sei Lá Mangueira - Elizeth Cardoso ...,0.0331632653061,2
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,The Stallion - Ween,0.0322580645161,3
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Rain - Subhumans,0.0314159292035,4
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,West One (Shine On Me) - The Ruts ...,0.0307390385081,5
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Back Against The Wall - Cage The Elephant ...,0.0301204819277,6
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Life Less Frightening - Rise Against ...,0.0284431137725,7
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,A Beggar On A Beach Of Gold - Mike And The ...,0.0230024907156,8
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Audience Of One - Rise Against ...,0.0193938442211,9
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Blame It On The Boogie - The Jacksons ...,0.0190677966102,10


### We can also apply the model to find similar songs to any song in the dataset

In [57]:
# Songs the item-similarity model ranks as most similar to the given title.
# The string must match a value of the 'song' column exactly.
personalized_model.get_similar_items(['With Or Without You - U2'])

PROGRESS: Getting similar items completed in 0.009797


song,similar,score,rank
With Or Without You - U2,I Still Haven't Found What I'm Looking For ...,0.0428571428571,1
With Or Without You - U2,Hold Me_ Thrill Me_ Kiss Me_ Kill Me - U2 ...,0.033734939759,2
With Or Without You - U2,Window In The Skies - U2,0.0328358208955,3
With Or Without You - U2,Vertigo - U2,0.0300751879699,4
With Or Without You - U2,Sunday Bloody Sunday - U2,0.0271317829457,5
With Or Without You - U2,Bad - U2,0.0251798561151,6
With Or Without You - U2,A Day Without Me - U2,0.0237154150198,7
With Or Without You - U2,Another Time Another Place - U2 ...,0.020325203252,8
With Or Without You - U2,Walk On - U2,0.020202020202,9
With Or Without You - U2,Get On Your Boots - U2,0.0196850393701,10


In [58]:
# Songs the item-similarity model ranks as most similar to the given title.
# The string must match a value of the 'song' column exactly.
personalized_model.get_similar_items(['Chan Chan (Live) - Buena Vista Social Club'])

PROGRESS: Getting similar items completed in 0.003


song,similar,score,rank
Chan Chan (Live) - Buena Vista Social Club ...,Murmullo - Buena Vista Social Club ...,0.188118811881,1
Chan Chan (Live) - Buena Vista Social Club ...,La Bayamesa - Buena Vista Social Club ...,0.187192118227,2
Chan Chan (Live) - Buena Vista Social Club ...,Amor de Loca Juventud - Buena Vista Social Club ...,0.184834123223,3
Chan Chan (Live) - Buena Vista Social Club ...,Diferente - Gotan Project,0.0214592274678,4
Chan Chan (Live) - Buena Vista Social Club ...,Mistica - Orishas,0.0205761316872,5
Chan Chan (Live) - Buena Vista Social Club ...,Hotel California - Gipsy Kings ...,0.019305019305,6
Chan Chan (Live) - Buena Vista Social Club ...,Nací Orishas - Orishas,0.0191570881226,7
Chan Chan (Live) - Buena Vista Social Club ...,Le Moulin - Yann Tiersen,0.0187969924812,8
Chan Chan (Live) - Buena Vista Social Club ...,Gitana - Willie Colon,0.0187969924812,9
Chan Chan (Live) - Buena Vista Social Club ...,Criminal - Gotan Project,0.018779342723,10


# Quantitative comparison between the models

We now formally compare the popularity and the personalized models using precision-recall curves. 

In [59]:
%matplotlib inline
# Evaluate both models on the held-out test data. user_sample=0.05 limits the
# evaluation to a sample of the test users to keep the run time short.
model_performance = graphlab.recommender.util.compare_models(
    test_data,
    [popularity_model, personalized_model],
    user_sample=0.05
)

compare_models: using 2931 users to estimate model performance
PROGRESS: Evaluate model M0
PROGRESS: recommendations finished on 1000/2931 queries. users per second: 17972
PROGRESS: recommendations finished on 2000/2931 queries. users per second: 23228.3

Precision and recall summary statistics by cutoff
+--------+-----------------+------------------+
| cutoff |  mean_precision |   mean_recall    |
+--------+-----------------+------------------+
|   1    | 0.0266120777892 | 0.00677255096395 |
|   2    | 0.0255885363357 | 0.0136618366659  |
|   3    | 0.0237689070852 | 0.0191374018907  |
|   4    | 0.0213237802798 |  0.022423171809  |
|   5    | 0.0195837598089 | 0.0247524301772  |
|   6    | 0.0184806095758 | 0.0274010836959  |
|   7    | 0.0175951649851 |  0.03085839158   |
|   8    | 0.0167178437393 | 0.0332739827366  |
|   9    | 0.0156563933432 | 0.0348637336355  |
|   10   | 0.0150801774139 | 0.0382748615348  |
+--------+-----------------+------------------+
[10 rows x 3 columns]





PROGRESS: Evaluate model M1
PROGRESS: recommendations finished on 1000/2931 queries. users per second: 1609.33
PROGRESS: recommendations finished on 2000/2931 queries. users per second: 1648.49

Precision and recall summary statistics by cutoff
+--------+-----------------+-----------------+
| cutoff |  mean_precision |   mean_recall   |
+--------+-----------------+-----------------+
|   1    |  0.199931763903 |  0.06392228502  |
|   2    |  0.164790174002 | 0.0988530141407 |
|   3    |  0.14329580348  |  0.123730932954 |
|   4    |  0.129819174343 |  0.145074688331 |
|   5    |  0.11736608666  |  0.161249834879 |
|   6    |  0.107528716024 |  0.176215673847 |
|   7    | 0.0995272213287 |  0.190054760523 |
|   8    | 0.0930996247015 |  0.203053579816 |
|   9    | 0.0882899275939 |  0.215708205508 |
|   10   |  0.084680996247 |  0.227440758656 |
+--------+-----------------+-----------------+
[10 rows x 3 columns]







The curve shows that the personalized model provides much better performance. 

In [75]:
# Take at most 10000 distinct test users and ask the personalized model for
# its single top recommendation (k=1) for each of them.
subset_test_users = test_data['user_id'].unique()[:10000]
subset_testuser_recommend_song = personalized_model.recommend(users=subset_test_users, k=1)

PROGRESS: recommendations finished on 1000/10000 queries. users per second: 1636.35
PROGRESS: recommendations finished on 2000/10000 queries. users per second: 1676.91
PROGRESS: recommendations finished on 3000/10000 queries. users per second: 1683
PROGRESS: recommendations finished on 4000/10000 queries. users per second: 1698.37
PROGRESS: recommendations finished on 5000/10000 queries. users per second: 1698.67
PROGRESS: recommendations finished on 6000/10000 queries. users per second: 1703.17
PROGRESS: recommendations finished on 7000/10000 queries. users per second: 1698.23
PROGRESS: recommendations finished on 8000/10000 queries. users per second: 1699.81
PROGRESS: recommendations finished on 9000/10000 queries. users per second: 1702.11
PROGRESS: recommendations finished on 10000/10000 queries. users per second: 1700.17


In [76]:
# Visual overview of the top-1 recommendations computed for the user subset.
subset_testuser_recommend_song.show()

# Get the most recommended song from the personalized recommender

In [78]:
# One row per recommended song; 'total_count' is the number of rows (users)
# for which that song was the top recommendation.
most_recommended_song_by_personization_model = subset_testuser_recommend_song.groupby(
    key_columns='song',
    operations={'total_count': graphlab.aggregate.COUNT()}
)

In [80]:
# Songs ordered from most often to least often recommended.
most_recommended_song_by_personization_model.sort('total_count', ascending=False)

song,total_count
Undo - Björk,432
Secrets - OneRepublic,373
Revelry - Kings Of Leon,235
You're The One - Dwight Yoakam ...,163
Fireflies - Charttraxx Karaoke ...,118
Hey_ Soul Sister - Train,106
Horn Concerto No. 4 in E flat K495: II. Romance ...,92
Sehr kosmisch - Harmonia,86
OMG - Usher featuring will.i.am ...,64
The Scientist - Coldplay,47


In [82]:
# Songs ordered from least often to most often recommended.
most_recommended_song_by_personization_model.sort('total_count', ascending=True)

song,total_count
Arco Arena - Cake,1
The Warrior's Code - Dropkick Murphys ...,1
Anything New - Digitalism,1
Fast Forward To 2012 (Album Version) - A Day ...,1
Wish You Were Here - Incubus ...,1
Change - Blind Melon,1
Get:On - Moguai,1
Dream About Flying - Alexi Murdoch ...,1
Get Me Away From Here_ I'm Dying - Belle & ...,1
Elysium - Portishead,1
