In [128]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix
import pickle

## Import data

In [2]:
user_profiles = pd.read_table("/home/dennis/Documents/lastfm-prediction/lastfm-dataset-360K/profiles.tsv", 
                         header=None, names=['user_id', 'gender', 'age', 'country', 'signup'])
print(user_profiles.head(5))
print(len(user_profiles[user_profiles['country'] == 'Germany']))

                                    user_id gender   age        country  \
0  00000c289a1829a808ac09c00daf10bc3c4e223b      f  22.0        Germany   
1  00001411dc427966b17297bf4d69e7e193135d89      f   NaN         Canada   
2  00004d2ac9316e22dc007ab2243d6fcb239e707d    NaN   NaN        Germany   
3  000063d3fe1cf2ba248b9e3c3f0334845a27a6bf      m  19.0         Mexico   
4  00007a47085b9aab8af55f52ec8846ac479ac4fe      m  28.0  United States   

         signup  
0   Feb 1, 2007  
1   Dec 4, 2007  
2   Sep 1, 2006  
3  Apr 28, 2008  
4  Jan 27, 2006  
31651


In [3]:
user_data = pd.read_table("/home/dennis/Documents/lastfm-prediction/lastfm-dataset-360K/plays.tsv", 
                         header=None, names=['user_id', 'artist_id', 'artist_name', 'plays'])
user_data.drop('artist_id', axis=1, inplace=True)
if user_data['artist_name'].isnull().sum() > 0:
    user_data = user_data.dropna(axis = 0, subset = ['artist_name'])

## Only German Data
To reduce the size of the data, only german users will be considered

In [12]:
german_profiles = user_profiles[user_profiles['country'] == 'Germany']
user_data_ger_profiles = german_profiles.merge(user_data, on="user_id", how='left')
print(user_data_ger_profiles.head(5))
print(len(user_data_ger_profiles))

                                    user_id gender   age  country  \
0  00000c289a1829a808ac09c00daf10bc3c4e223b      f  22.0  Germany   
1  00000c289a1829a808ac09c00daf10bc3c4e223b      f  22.0  Germany   
2  00000c289a1829a808ac09c00daf10bc3c4e223b      f  22.0  Germany   
3  00000c289a1829a808ac09c00daf10bc3c4e223b      f  22.0  Germany   
4  00000c289a1829a808ac09c00daf10bc3c4e223b      f  22.0  Germany   

        signup           artist_name   plays  
0  Feb 1, 2007       betty blowtorch  2137.0  
1  Feb 1, 2007             die Ärzte  1099.0  
2  Feb 1, 2007     melissa etheridge   897.0  
3  Feb 1, 2007             elvenking   717.0  
4  Feb 1, 2007  juliette & the licks   706.0  
1555720


In [14]:
artist_plays = user_data_ger_profiles.groupby('artist_name')['plays'].sum().reset_index()
artist_plays.columns = ['artist_name', 'artist_total_plays']
print(artist_plays.head(5))


     artist_name  artist_total_plays
0            !!!             14362.0
1  !action pact!                85.0
2          !cube                40.0
3       !deladap              1148.0
4       !distain               379.0


## Reduce number of artists
Only the most popular artsists will be considered for the recommender in order to avoid distorted data and to reduce the total size of the set.

In [39]:
print(artist_plays['artist_total_plays'].quantile(np.arange(.9, 1, .01)))
threshold = 90000
popular_artist_plays = artist_plays[artist_plays['artist_total_plays'] > threshold]

print(len(artist_plays))
print(len(popular_artist_plays['artist_name']))

0.90     2956.00
0.91     3544.00
0.92     4271.00
0.93     5291.95
0.94     6717.10
0.95     8752.25
0.96    11937.20
0.97    17719.80
0.98    29364.40
0.99    69368.70
Name: artist_total_plays, dtype: float64
82816
645


In [42]:
user_with_artist_plays = user_data_ger_profiles.merge(popular_artist_plays, on='artist_name', how='inner')
user_with_artist_plays = user_with_artist_plays.sort_values('artist_total_plays', ascending=False)
print(user_with_artist_plays.head(5))

                                       user_id gender   age  country  \
0     00000c289a1829a808ac09c00daf10bc3c4e223b      f  22.0  Germany   
4507  aba29c45c5067cba15e191da456a130ed84bcb14      f  26.0  Germany   
4479  aadcd8781ea372f3164b726ff10011d4ac73b9cc      m  32.0  Germany   
4478  aad472ed3b7ca0df1e4efc7c9b2436f52e221519      m  18.0  Germany   
4477  aad0cddae6587e92c7069b22d202adb99d53624e      f  15.0  Germany   

            signup artist_name   plays  artist_total_plays  
0      Feb 1, 2007   die Ärzte  1099.0           2955844.0  
4507   Sep 6, 2007   die Ärzte  2245.0           2955844.0  
4479  Nov 27, 2006   die Ärzte   802.0           2955844.0  
4478  Nov 24, 2006   die Ärzte   498.0           2955844.0  
4477  Oct 25, 2007   die Ärzte   648.0           2955844.0  


In [89]:
user_with_artist_plays = user_with_artist_plays.drop_duplicates(['user_id', 'artist_name'])
print(len(user_with_artist_plays))
wide_artist_data = user_with_artist_plays.pivot(index = 'artist_name', columns = 'user_id', values = 'artist_total_plays').fillna(0)
wide_artist_data_sparse = csr_matrix(wide_artist_data.values)


647469


## Train model

In [126]:
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'auto')
model_knn.fit(wide_artist_data_sparse)
#model_knn.fit(wide_artist_data_sparse)


NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

## Make recommendations

In [127]:
query_index = np.random.choice(wide_artist_data.shape[0])
print(query_index)
#print(wide_artist_data.iloc[query_index, :])
distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)

print(distances.flatten())
distance_len = len(distances.flatten())
for i in range(1, min(distance_len, distance_len + 1)):
    if i == 1:
        print("Artist: %s" % wide_artist_data.index[query_index])
    print("Recommendation %s: %s - %s " % (i, wide_artist_data.index[indices[0][i]], distances.flatten()[i]))

480
[7.68274333e-14 6.35549329e-01 6.76927353e-01 6.78346854e-01
 6.79643963e-01 6.81157071e-01]
Artist: red hot chili peppers
Recommendation 1: die Ärzte - 0.6355493293879839 
Recommendation 2: nirvana - 0.6769273529191766 
Recommendation 3: system of a down - 0.678346854034508 
Recommendation 4: foo fighters - 0.6796439625801229 
Recommendation 5: beatsteaks - 0.6811570706879895 


## Save model

In [134]:
pickle.dump(model_knn, open('nn_recommender.sav', 'wb'))

In [135]:
os.listdir()

['lastfm-dataset-360K',
 '.ipynb_checkpoints',
 'lastfm-dataset-360K.tar.gz',
 'LastFM-Exploration.ipynb',
 'nn_recommender',
 'nn_recommender.sav']

In [143]:
nn_loaded = pickle.load(open('nn_recommender.sav', 'rb'))
distances, indices = nn_loaded.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
print(distances.flatten())

[7.68274333e-14 6.35549329e-01 6.76927353e-01 6.78346854e-01
 6.79643963e-01 6.81157071e-01]


## Save available artist-names

In [159]:
artists = user_with_artist_plays['artist_name'].unique()
artists = pd.DataFrame(artists)
artists.to_csv("artists.csv")

In [160]:
os.listdir()

['k.csv',
 'lastfm-dataset-360K',
 '.ipynb_checkpoints',
 'artists.csv',
 'lastfm-dataset-360K.tar.gz',
 'LastFM-Exploration.ipynb',
 'nn_recommender',
 'nn_recommender.sav',
 'artists']