In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import fbeta_score , make_scorer
from sklearn.model_selection import ParameterGrid# Create the parameter grid based on the results of random search 


In [3]:
plays = pd.read_csv('../data/user_artists.dat', sep='\t')
artists = pd.read_csv('../data/artists.dat', sep='\t', usecols=['id','name'])

# Merge artist and user pref data
ap = pd.merge(artists, plays, how="inner", left_on="id", right_on="artistID")
ap = ap.rename(columns={"weight": "playCount"})

# Group artist by name
artist_rank = ap.groupby(['name']) \
    .agg({'userID' : 'count', 'playCount' : 'sum'}) \
    .rename(columns={"userID" : 'totalUsers', "playCount" : "totalPlays"}) \
    .sort_values(['totalPlays'], ascending=False)

artist_rank['avgPlays'] = artist_rank['totalPlays'] / artist_rank['totalUsers']
print(artist_rank)

                    totalUsers  totalPlays     avgPlays
name                                                   
Britney Spears             522     2393140  4584.559387
Depeche Mode               282     1301308  4614.567376
Lady Gaga                  611     1291387  2113.563011
Christina Aguilera         407     1058405  2600.503686
Paramore                   399      963449  2414.659148
...                        ...         ...          ...
Morris                       1           1     1.000000
Eddie Kendricks              1           1     1.000000
Excess Pressure              1           1     1.000000
My Mine                      1           1     1.000000
A.M. Architect               1           1     1.000000

[17632 rows x 3 columns]


In [4]:
# Merge into ap matrix
ap = ap.join(artist_rank, on="name", how="inner") \
    .sort_values(['playCount'], ascending=False)

# Preprocessing
pc = ap.playCount
play_count_scaled = (pc - pc.min()) / (pc.max() - pc.min())
ap = ap.assign(playCountScaled=play_count_scaled)
#print(ap)

# Build a user-artist rating matrix 
ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')
ratings = ratings_df.fillna(0).values

# Show sparsity
sparsity = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100
print("sparsity: %.2f" % sparsity)

sparsity: 0.28


In [5]:
from scipy.sparse import csr_matrix

# Build a sparse matrix
X = csr_matrix(ratings)

n_users, n_items = ratings_df.shape
print("rating matrix shape", ratings_df.shape)

user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()

rating matrix shape (1892, 17632)


In [6]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset

# Build data references + train test
Xcoo = X.tocoo()
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data)) 
train, test = random_train_test_split(interactions)

# Ignore that (weight seems to be ignored...)
#train = train_.tocsr()
#test = test_.tocsr()
#train[train==1] = X[train==1]
#test[test==1] = X[test==1]

# To be completed...

In [7]:
# Train
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f1305d7bdc0>

In [8]:
# Evaluate
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
res_warp = {'name':'warp','train_precision':train_precision,'test_precision':test_precision,'train_auc':test_auc,'test_auc':test_auc}


Precision: train 0.37, test 0.13.
AUC: train 0.96, test 0.85.


In [9]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

['Depeche Mode' 'The Beatles' 'Coldplay' ... 'Robert Caldeira Jr'
 'The Beat Daddys' 'Outbreak']


In [10]:
model = LightFM(learning_rate=0.05, loss='bpr')
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f12da812f40>

In [11]:
# Evaluate
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
res_bpr = {'name':'bpr','train_precision':train_precision,'test_precision':test_precision,'train_auc':test_auc,'test_auc':test_auc}

Precision: train 0.36, test 0.12.
AUC: train 0.85, test 0.78.


In [12]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

['Radiohead' 'Depeche Mode' 'New Order' ... 'Katy Perry' 'Ke$ha' 'Rihanna']


In [13]:
model = LightFM(learning_rate=0.05, loss='logistic')
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f12da812e20>

In [14]:
# Evaluate
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
res_logistic = {'name':'logistic','train_precision':train_precision,'test_precision':test_precision,'train_auc':test_auc,'test_auc':test_auc}

Precision: train 0.20, test 0.07.
AUC: train 0.89, test 0.81.


In [15]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

['Lady Gaga' 'Britney Spears' 'Rihanna' ... 'Jennifer Rostock'
 'Elvis Crespo' 'Sad Lovers and Giants']


In [16]:
model = LightFM(learning_rate=0.05, loss='warp-kos')
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f12da4a7d00>

In [17]:
# Evaluate
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
res_warp_kos = {'name':'warp_kos','train_precision':train_precision,'test_precision':test_precision,'train_auc':test_auc,'test_auc':test_auc}

Precision: train 0.35, test 0.13.
AUC: train 0.89, test 0.82.


In [18]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

['Depeche Mode' 'Pet Shop Boys' 'Moby' ... 'Nokturnal Mortum' 'Luctus'
 'Amžius']


In [19]:
print("res_warp : ",res_warp)
print("res_bpr : ",res_bpr)
print("res_warp_kos : ",res_warp_kos)
data = [res_warp,res_bpr,res_warp_kos]

res_dataframe = pd.DataFrame(data = data , columns=['name','train_precision','test_precision','train_auc','test_auc'])
res_dataframe

res_warp :  {'name': 'warp', 'train_precision': 0.37208685, 'test_precision': 0.12747194, 'train_auc': 0.852904, 'test_auc': 0.852904}
res_bpr :  {'name': 'bpr', 'train_precision': 0.359375, 'test_precision': 0.123196155, 'train_auc': 0.7837616, 'test_auc': 0.7837616}
res_warp_kos :  {'name': 'warp_kos', 'train_precision': 0.34724575, 'test_precision': 0.12677713, 'train_auc': 0.82247126, 'test_auc': 0.82247126}


Unnamed: 0,name,train_precision,test_precision,train_auc,test_auc
0,warp,0.372087,0.127472,0.852904,0.852904
1,bpr,0.359375,0.123196,0.783762,0.783762
2,warp_kos,0.347246,0.126777,0.822471,0.822471


In [19]:
df = pd.DataFrame(columns=["WRAP", "LOGISTIC", "BRP" ,"KOS-WRAP" ])

loss = ['warp', 'logistic', 'bpr', 'warp-kos' ]

for i,j in enumerate(loss):

    model = LightFM(learning_rate=0.05, loss= j )
    model.fit(train, epochs=10, num_threads=2)
    a = precision_at_k(model, train, k=5).mean()
    b = precision_at_k(model, test, k=5, train_interactions=train).mean()
    c = auc_score(model, train).mean()
    d = auc_score(model, test, train_interactions=train).mean()

    this_column = df.columns[i]
    df[this_column] = [a,b,c,d]

print(df)

       WRAP  LOGISTIC       BRP  KOS-WRAP
0  0.436055  0.219512  0.422906  0.381972
1  0.171444  0.086417  0.158075  0.157647
2  0.965172  0.887431  0.840421  0.888529
3  0.858710  0.808861  0.770340  0.823018


In [24]:
sparse_ratings = csr_matrix(ratings) ### sparsing the matrix

svd = TruncatedSVD(n_components=2000,n_iter=15,random_state=8)
sparse_matrix_svd = svd.fit_transform(sparse_ratings)

In [63]:
def get_recommendations_svd(userID,n_artists=30):
    # Get the index of the movie that matches the title
    
    cosine_sim = linear_kernel(sparse_matrix_svd, sparse_matrix_svd[userID].reshape(1,-1))
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n_artists + 1]
    
    # Get the movie indices
    users_indices = [i[0] for i in sim_scores]
    artist_top = []
    for i in users_indices:
        idx_i = np.argmax(ratings[i]) # take the most important artist for this user
        if ap['name'][ap['artistID']==idx_i].unique()[0] not in artist_top:
            artist_top.append(ap['name'][ap['artistID']==idx_i].unique()[0])
    
    # Return the top 10 most similar movies
    return artist_top

In [64]:
artist_top = get_recommendations_svd(0)
artist_top

['Mindless Self Indulgence',
 'Faithless',
 'Gothminister',
 'Reaper',
 'Funeral for a Friend',
 'James Blunt',
 'Talk Talk',
 'Cock Robin',
 'Thievery Corporation',
 'Ana Carolina',
 'Zornik',
 'Dope']

In [65]:
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items[0:10])

['a-ha' 'Orchestral Manoeuvres in the Dark' 'Duran Duran' 'Depeche Mode'
 'Robbie Williams' 'The Human League' 'Ultravox' 'Yazoo' 'George Michael'
 'Japan']


In [68]:
# Optimisation de paramètres avec GridSearch

from sklearn.model_selection import ParameterGrid# Create the parameter grid based on the results of random search 

param_grid = {
    'learning_rate': [0.05 , 0.08],
    'learning_schedule':['adagrad','adadelta'],
    'loss': ['warp','bpr','logistic','warp-kos']
    
}
# definition liste score auc
auc_score_values = []

for grid in ParameterGrid(param_grid):
    model = LightFM(**grid)
    pred = model.fit(train)
    auc_score_values.append(round(auc_score(model, test, train_interactions=train).mean(),3))
    
max_value = max(auc_score_values) 
max_index = np.argmax(auc_score_values)
ParameterGrid(param_grid)[max_index ].items()

dict_items([('loss', 'logistic'), ('learning_schedule', 'adadelta'), ('learning_rate', 0.05)])