In [2]:
import pandas as pd
import numpy as np

In [76]:
from surprise import Reader, Dataset, Trainset, accuracy, SVD, SVDpp

from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, \
                                            KNNBaseline, SVDpp, SlopeOne, \
                                            NMF, NormalPredictor, KNNWithZScore, \
                                            BaselineOnly, CoClustering, SVD
from surprise.model_selection import cross_validate, train_test_split, \
                                    GridSearchCV, RandomizedSearchCV, KFold

In [3]:
reviews_df = pd.read_csv('../data/small_dataset/reviews.csv')

In [72]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(reviews_df[[ 'UserID', 'ISBN', 'Rating']], reader)

In [73]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), \
                  KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), \
                  BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.986131,0.163936,0.007644
KNNBaseline,0.989089,0.173621,0.026491
SVDpp,0.989337,0.289333,0.009395
BaselineOnly,0.990057,0.007375,0.007544
KNNBasic,1.034947,0.12497,0.014594
CoClustering,1.045563,0.305577,0.006406
KNNWithZScore,1.046234,0.208522,0.015774
KNNWithMeans,1.048405,0.15,0.01558
SlopeOne,1.05566,0.023571,0.00819
NMF,1.062024,0.288634,0.007501


In [91]:
reviews_df

Unnamed: 0,ISBN,UserID,Rating
20,802714625,7241,5.0
21,802714625,835,4.0
23,802714625,1502,4.0
26,802714625,3855,4.0
28,802714625,8156,5.0
...,...,...,...
26,399184414,3267,2.0
28,399184414,6287,4.0
38,399184414,6536,2.0
39,399184414,11584,5.0


In [112]:
trainset, testset = train_test_split(data, test_size=0.2)

In [115]:
algorithm = SVD
param_grid = {'n_factors':[50,100,150], 'n_epochs':[20, 30, 50]} #
gs = GridSearchCV(algorithm, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.9875596805026805
{'n_factors': 150, 'n_epochs': 30}


In [None]:
gs.cv_results

In [None]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

In [117]:
algorythm = SVD
param_grid = {'n_epochs': [20], 'n_factors':[100]}
gs = GridSearchCV(algorythm, param_grid, measures=['rmse', 'mae'], cv=2)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.9890153771571732
{'n_epochs': 20, 'n_factors': 100}


In [118]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), \
                  BaselineOnly()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.985841,0.008486,0.020324
SVD,0.987952,0.175878,0.008394


In [126]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(random_state=42), \
                  BaselineOnly()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'],verbose=False, cv=None)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.986511,0.011768,0.004819
SVD,0.987126,0.194321,0.004853


In [125]:
alg = SVD()

In [None]:
from tqdm import tqdm
tqdm(range(500))

In [141]:
benchmark = []
from collections import Counter
# Iterate over all algorithms
res = []
for i in range(200):
    for algorithm in [SVD(), SVD(biased=False),BaselineOnly()]:
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False,n_jobs=1)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
    res.append(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed