In [1]:
import graphlab as gl

In [2]:
data = gl.SFrame('song_data.gl')

This non-commercial license of GraphLab Create for academic use is assigned to eric.leung@alumni.utoronto.ca and will expire on September 03, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1475427503.log


In [3]:
data.head(3)

user_id,song_id,listen_count,title,artist
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOAKIMP12A8C130995,1,The Cove,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBXHDL12A81C204C0,1,Stronger,Kanye West

song
The Cove - Jack Johnson
Entre Dos Aguas - Paco De Lucia ...
Stronger - Kanye West


## Counting unique users who have listened to songs by various artists

### Compute the number of unique users for each of these artists: 'Kanye West', 'Foo Fighters', 'Taylor Swift' and 'Lady GaGa'

In [4]:
kanye_entries = data[data['artist'] == 'Kanye West']

In [7]:
len(kanye_entries['user_id'].unique())

2522

In [8]:
foofighters_entries = data[data['artist'] == 'Foo Fighters']
len(foofighters_entries.unique())

3429

In [9]:
taylorswift_entries = data[data['artist'] == 'Taylor Swift']
len(taylorswift_entries.unique())

6227

In [10]:
ladygaga_entries = data[data['artist'] == 'Lady GaGa']
len(ladygaga_entries.unique())

4129

## Most popular and least popular artist using groupby

In [13]:
listen_count = data.groupby(key_columns='artist', operations={'total_count': gl.aggregate.SUM('listen_count')})

In [14]:
listen_count.head(3)

artist,total_count
The Dells,274
Lil Jon / The East Side Boyz ...,197
Tom Petty And The Heartbreakers ...,2867


In [17]:
listen_count = listen_count.sort('total_count', ascending=False)

In [18]:
listen_count.head(5)

artist,total_count
Kings Of Leon,43218
Dwight Yoakam,40619
Björk,38889
Coldplay,35362
Florence + The Machine,33387


In [19]:
# Most popular
listen_count[0]

{'artist': 'Kings Of Leon', 'total_count': 43218}

In [20]:
# Least popular
listen_count[-1]

{'artist': 'William Tabbert', 'total_count': 14}

## Create similiarity recommender

In [22]:
# 80% training, 20% testing
train_data,test_data = data.random_split(.8,seed=0)

In [25]:
# train the model
similarity_model = gl.item_similarity_recommender.create(train_data,
                                                                user_id='user_id',
                                                                item_id='song')

## Make recommendations for the first 10,000 users

In [26]:
subset_test_users = test_data['user_id'].unique()[0:10000]

### Compute one recommended song for each of these test users

In [27]:
recommendations = similarity_model.recommend(subset_test_users,k=1)

In [29]:
recommendations.head(5)

user_id,song,score,rank
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Cuando Pase El Temblor - Soda Stereo ...,0.0194504536115,1
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Grind With Me (Explicit Version) - Pretty Ricky ...,0.0459424376488,1
f6c596a519698c97f1591ad89 f540d76f6a04f1a ...,Hey_ Soul Sister - Train,0.0238929539919,1
696787172dd3f5169dc94deef 97e427cee86147d ...,Senza Una Donna (Without A Woman) - Zucchero / ...,0.017026577677,1
3a7111f4cdf3c5a85fd4053e3 cc2333562e1e0cb ...,Heartbreak Warfare - John Mayer ...,0.0298416515191,1


## Use groupby to find most recommended song

In [30]:
# use the song titles as the key to the aggregator
# use count() aggregator and store result in 'count'
recommended_counts = recommendations.groupby(key_columns='song', 
                                             operations={'count': gl.aggregate.COUNT()})

In [32]:
recommended_counts.sort('count', ascending=False)

song,count
Undo - Björk,438
Secrets - OneRepublic,386
Revelry - Kings Of Leon,226
You're The One - Dwight Yoakam ...,162
Fireflies - Charttraxx Karaoke ...,117
Sehr kosmisch - Harmonia,99
Horn Concerto No. 4 in E flat K495: II. Romance ...,96
Hey_ Soul Sister - Train,94
OMG - Usher featuring will.i.am ...,59
Dog Days Are Over (Radio Edit) - Florence + The ...,46
