# LastFM: Music Recommendation

Music Recommendation Datasets for Research:

- Last.fm Dataset - 360K users (user top artists)
- Last.fm Dataset - 1K users (user full listening history)

Links:

- Dataset home page: http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/
- Related music recommender write-up: https://beckernick.github.io/music_recommender/

In [13]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [14]:
# Directory containing the Last.fm 360K dataset files. Both reads below use it,
# so pointing the notebook at another location means changing only this line.
DATA_DIR = '/home/jovyan/work/lastfm/lastfm-dataset-360K'

# Play counts per user and artist. The file has no header row, so column names
# are supplied here; `artist_id` is named but excluded from the result by `usecols`.
user_data = pd.read_table(DATA_DIR + '/usersha1-artmbid-artname-plays-1m.tsv',
                          header = None,
                          names = ['users', 'artist_id', 'artist_name', 'plays'],
                          usecols = ['users', 'artist_name', 'plays'])

# User profiles, reduced to the user id and country columns.
# NOTE(review): `user_profiles` is not referenced by the cells visible in this notebook.
user_profiles = pd.read_table(DATA_DIR + '/usersha1-profile.tsv',
                          header = None,
                          names = ['users', 'gender', 'age', 'country', 'signup'],
                          usecols = ['users', 'country'])

In [15]:
def describe_user_data(user_data):
    """Print a short summary of a play-count frame.

    Prints the frame's shape, then the number of distinct values in its
    `users` and `artist_name` columns (a missing value counts as one
    distinct value).

    Parameters
    ----------
    user_data : pandas.DataFrame
        Frame with at least the columns `users` and `artist_name`.

    Returns
    -------
    None
    """
    print('Shape:', user_data.shape)
    for label, column in (('Unique users:', 'users'),
                          ('Unique artists:', 'artist_name')):
        print(label, len(user_data[column].unique()))

In [16]:
# Summarise the raw play counts, then preview the first five rows.
describe_user_data(user_data = user_data)
user_data.head(5)

Shape: (1000000, 3)
Unique users: 20465
Unique artists: 80410


Unnamed: 0,users,artist_name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [17]:
# Discard rows that have no artist name; this only runs when at least one
# such row exists.
if user_data['artist_name'].isnull().any():
    user_data = user_data.dropna(axis = 0, subset = ['artist_name'])

# Sum the plays of each artist across all users and expose the result as a
# two-column frame: `artist_name`, `total_artist_plays`.
artist_plays = (
    user_data
    .groupby('artist_name')['plays']
    .sum()
    .rename('total_artist_plays')
    .reset_index()
)
artist_plays.head()

Unnamed: 0,artist_name,total_artist_plays
0,cours de la somme,9
1,oliver shanti & friends,3
2,!!!,19596
3,!5:b>@ 3070,33
4,!action pact!,143


In [18]:
# Attach each artist's overall play total to every user/artist row. The left
# join keeps all rows of `user_data`.
user_data_with_artist_plays = pd.merge(user_data, artist_plays,
                                       how = 'left', on = 'artist_name')
user_data_with_artist_plays.head()

Unnamed: 0,users,artist_name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,4242
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,180530
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,11425
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,20182
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,7042


In [19]:
# Summary statistics of the per-artist play totals.
artist_plays.total_artist_plays.describe()

count    8.040900e+04
mean     2.693809e+03
std      2.029754e+04
min      1.000000e+00
25%      4.800000e+01
50%      1.790000e+02
75%      7.160000e+02
max      1.803365e+06
Name: total_artist_plays, dtype: float64

In [40]:
# Upper quantiles of the per-artist totals, starting at the 95th percentile in
# steps of one percentage point. The first entry (the 95th percentile) is the
# cut-off for a "popular" artist.
q = artist_plays['total_artist_plays'].quantile(np.arange(.95, 1, .01))
popularity_threshold = q.iat[0]
print('Popularity Threshold:', popularity_threshold)

# Keep only the rows whose artist reaches the cut-off.
user_data_popular_artists = user_data_with_artist_plays.loc[
    lambda frame: frame['total_artist_plays'] >= popularity_threshold
]
describe_user_data(user_data_popular_artists)

Popularity Threshold: 7737.6
Shape: (684354, 4)
Unique users: 20439
Unique artists: 4021


In [34]:
# Preview the first five rows that survived the popularity filter.
user_data_popular_artists.head(5)

Unnamed: 0,users,artist_name,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,180530
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,11425
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,20182
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,755358
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507,55422


In [41]:
# Keep a single row per (users, artist_name) pair; the shape is printed
# before and after so the number of removed rows is visible.
print(user_data_popular_artists.shape)
user_data_popular_artists = user_data_popular_artists.drop_duplicates(
    subset = ['users', 'artist_name']
)
print(user_data_popular_artists.shape)

(684354, 4)
(684352, 4)


## KNN

In [45]:
# Reshape to one row per artist and one column per user, holding play counts;
# user/artist pairs with no recorded plays become 0.
wide_artist_data = user_data_popular_artists.pivot(index = 'artist_name', columns = 'users', values = 'plays').fillna(0)

# Sparse copy of the same matrix, used to fit the nearest-neighbour model.
wide_artist_data_sparse = csr_matrix(wide_artist_data.values)

wide_artist_data.shape

(4021, 20439)

In [43]:
# Brute-force nearest-neighbour search using cosine distance between the
# artists' play-count rows.
model_knn = NearestNeighbors(algorithm = 'brute', metric = 'cosine')
model_knn.fit(X = wide_artist_data_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [50]:
# Look up the artists whose play-count rows are closest (cosine distance) to
# the chosen artist's row.
artist = 'red hot chili peppers'
n_neighbors = 10

artist_vector = wide_artist_data.loc[artist].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(artist_vector, n_neighbors = n_neighbors)

# The query artist is itself a row of the fitted matrix, so it is normally
# returned as its own nearest neighbour. It is excluded by name rather than by
# assuming it sits at position 0: with tied distances it may appear elsewhere,
# and skipping position 0 would then drop a real neighbour and list the query
# artist as a recommendation.
neighbours = [
    (wide_artist_data.index[row], distance)
    for distance, row in zip(distances.flatten(), indices.flatten())
    if wide_artist_data.index[row] != artist
]

print('Recommendations for "{0}":'.format(artist))
# At most n_neighbors - 1 entries are shown, matching the usual case where one
# of the returned neighbours is the query artist.
for rank, (name, distance) in enumerate(neighbours[:n_neighbors - 1], start = 1):
    print('{0}: "{1}", distance {2}'.format(rank, name, distance))

Recommendations for "red hot chili peppers":
1: "muse", distance 0.7369592793207145
2: "john frusciante", distance 0.7645946561665107
3: "delirious?", distance 0.7877275267756514
4: "foo fighters", distance 0.7978945280271925
5: "coldplay", distance 0.8143969813426645
6: "jimi hendrix", distance 0.8180878163801883
7: "metallica", distance 0.8304035219515764
8: "audioslave", distance 0.8347777746095514
9: "the killers", distance 0.8357133863789841
