In [1]:
from recommenders.datasets import movielens

In [3]:
import numpy as np
import pandas as pd
import scipy.sparse as sps
import seaborn as sns
import implicit
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold, train_test_split
from implicit.evaluation import train_test_split, mean_average_precision_at_k, precision_at_k
from lightfm.evaluation import precision_at_k as precision_at_k_light, recall_at_k as recall_at_k_light, auc_score, reciprocal_rank
from lightfm.cross_validation import random_train_test_split
from lightfm import LightFM
from sklearn.model_selection import ParameterGrid

In [4]:
import scipy.sparse as sparse

In [7]:
data = movielens.load_pandas_df(
    size='100k',
    genres_col='genre',
    header=["userID", "itemID", "rating"]
)
# quick look at the data
data.sample(5)

100%|██████████| 4.81k/4.81k [00:00<00:00, 7.19kKB/s]


Unnamed: 0,userID,itemID,rating,genre
76391,669,300,4.0,Action|Thriller
13304,622,111,4.0,Comedy|Romance
48350,795,47,3.0,Comedy|Drama
277,853,302,4.0,Crime|Film-Noir|Mystery|Thriller
45080,268,71,3.0,Animation|Children's|Musical


In [7]:
data.head()

Unnamed: 0,userID,itemID,rating,genre
0,196,242,3.0,Comedy
1,63,242,3.0,Comedy
2,226,242,5.0,Comedy
3,154,242,3.0,Comedy
4,306,242,5.0,Comedy


In [15]:
def encode_column(column):
    """ Encodes a pandas column with continous IDs"""
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [16]:
def encode_df(anime_df):
    """Encodes rating data with continuous user and anime ids"""
    
    anime_ids, anime_df['itemID'], num_anime = encode_column(anime_df['itemID'])
    user_ids, anime_df['userID'], num_users = encode_column(anime_df['userID'])
    return anime_df, num_users, num_anime, user_ids, anime_ids

In [17]:
anime_df, num_users, num_anime, user_ids, anime_ids = encode_df(data)

In [18]:
sparse_item_user = sparse.csr_matrix((anime_df['rating'].values,(anime_df['userID'].values, anime_df['itemID'].values)),shape=(num_users, num_anime))

In [19]:
sparse_user_item = sparse.csr_matrix((anime_df['rating'].values,(anime_df['itemID'].values, anime_df['userID'].values)),shape=(num_anime, num_users))

In [20]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

In [21]:
alpha_val = 40
data_conf = (sparse_item_user * alpha_val).astype('double')

In [22]:
train_matrix, test_matrix = train_test_split(data_conf)

In [23]:
model.fit(train_matrix)

  0%|          | 0/20 [00:00<?, ?it/s]

In [28]:
from implicit.evaluation import ndcg_at_k

In [25]:
ndcg_at_k(model, train_matrix, test_matrix)

  0%|          | 0/940 [00:00<?, ?it/s]

0.15766334006337054

In [29]:
from implicit.evaluation import mean_average_precision_at_k

In [27]:
mean_average_precision_at_k(model, train_matrix, test_matrix)

  0%|          | 0/940 [00:00<?, ?it/s]

0.07085375238550905

In [30]:
from implicit.evaluation import precision_at_k

In [29]:
precision_at_k(model, train_matrix, test_matrix)

  0%|          | 0/940 [00:00<?, ?it/s]

0.16437492115554433

In [31]:
import surprise

In [32]:
from recommenders.datasets.python_splitters import python_random_split
import surprise

In [33]:
train, test = python_random_split(data[['userID', 'itemID', 'rating']], 0.75)

In [34]:
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

In [38]:
train_set = surprise.Dataset.load_from_df(train, reader=surprise.Reader('ml-100k')).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x7f1435520310>

In [None]:
trainset, testset = train_test_split(data, test_size=.25)

In [40]:
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)

In [41]:
svd.fit(train_set)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f14355b2580>

In [43]:
predictions = predict(svd, test, usercol='userID', itemcol='itemID')
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,181,646,3.442847
1,354,709,3.044291
2,477,118,4.49529
3,589,655,3.663062
4,605,1016,3.816549


In [44]:
all_predictions = compute_ranking_predictions(svd, train, remove_seen=True)

In [45]:
k = 10 
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)

In [46]:
k = 10
eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.014182
NDCG:	0.105142
Precision@K:	0.097879
Recall@K:	0.034634


In [47]:
svdpp = surprise.SVDpp(random_state=0, n_factors=20, n_epochs=10, verbose=True)

In [48]:
svdpp.fit(train_set)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f14368b4190>

In [49]:
all_predictions_svdpp = compute_ranking_predictions(svdpp, train, remove_seen=True)

In [51]:
k = 10
eval_map_pp = map_at_k(test, all_predictions_svdpp,col_prediction='prediction', k=k)
eval_ndcg_pp = ndcg_at_k(test, all_predictions_svdpp, col_prediction='prediction', k=k)
eval_precision_pp = precision_at_k(test, all_predictions_svdpp, col_prediction='prediction', k=k)
eval_recall_pp = recall_at_k(test, all_predictions_svdpp , col_prediction='prediction', k=k)

print("MAP:\t%f" % eval_map_pp,
      "NDCG:\t%f" % eval_ndcg_pp,
      "Precision@K:\t%f" % eval_precision_pp,
      "Recall@K:\t%f" % eval_recall_pp, sep='\n')

MAP:	0.012679
NDCG:	0.099899
Precision@K:	0.094592
Recall@K:	0.031989


In [24]:
import implicit

In [25]:
model2 = implicit.bpr.BayesianPersonalizedRanking(factors = 20, regularization=0.1, iterations=20)

In [7]:
from implicit.evaluation import train_test_split, mean_average_precision_at_k, precision_at_k

In [27]:
model2.fit(train_matrix)

  0%|          | 0/20 [00:00<?, ?it/s]

In [31]:
ndcg_at_k(model2, train_matrix, test_matrix)

  0%|          | 0/942 [00:00<?, ?it/s]

0.20335337125966102

In [32]:
mean_average_precision_at_k(model2, train_matrix, test_matrix)

  0%|          | 0/942 [00:00<?, ?it/s]

0.11001686069961512

In [33]:
precision_at_k(model2, train_matrix, test_matrix)

  0%|          | 0/942 [00:00<?, ?it/s]

0.21178102926337034