In [5]:
import pandas as pd
import numpy as np

In [108]:
from surprise import Reader, Dataset, Trainset, accuracy, SVD, SVDpp

from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, \
                                            KNNBaseline, SVDpp, SlopeOne, \
                                            NMF, NormalPredictor, KNNWithZScore, \
                                            BaselineOnly, CoClustering, SVD
from surprise.model_selection import cross_validate, train_test_split, \
                                    GridSearchCV, RandomizedSearchCV, KFold, \
                                    PredefinedKFold, KFold
from sklearn.metrics import mean_squared_error

### Small subset - pipeline 

In [3]:
reviews_df = pd.read_csv('../data/small_dataset/reviews.csv')

In [72]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(reviews_df[[ 'UserID', 'ISBN', 'Rating']], reader)

In [73]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), \
                  KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), \
                  BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.986131,0.163936,0.007644
KNNBaseline,0.989089,0.173621,0.026491
SVDpp,0.989337,0.289333,0.009395
BaselineOnly,0.990057,0.007375,0.007544
KNNBasic,1.034947,0.12497,0.014594
CoClustering,1.045563,0.305577,0.006406
KNNWithZScore,1.046234,0.208522,0.015774
KNNWithMeans,1.048405,0.15,0.01558
SlopeOne,1.05566,0.023571,0.00819
NMF,1.062024,0.288634,0.007501


In [91]:
reviews_df

Unnamed: 0,ISBN,UserID,Rating
20,802714625,7241,5.0
21,802714625,835,4.0
23,802714625,1502,4.0
26,802714625,3855,4.0
28,802714625,8156,5.0
...,...,...,...
26,399184414,3267,2.0
28,399184414,6287,4.0
38,399184414,6536,2.0
39,399184414,11584,5.0


In [112]:
trainset, testset = train_test_split(data, test_size=0.2)

In [115]:
algorithm = SVD
param_grid = {'n_factors':[50,100,150], 'n_epochs':[20, 30, 50]} #
gs = GridSearchCV(algorithm, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.9875596805026805
{'n_factors': 150, 'n_epochs': 30}


In [None]:
gs.cv_results

In [None]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

In [117]:
algorythm = SVD
param_grid = {'n_epochs': [20], 'n_factors':[100]}
gs = GridSearchCV(algorythm, param_grid, measures=['rmse', 'mae'], cv=2)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.9890153771571732
{'n_epochs': 20, 'n_factors': 100}


In [118]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), \
                  BaselineOnly()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.985841,0.008486,0.020324
SVD,0.987952,0.175878,0.008394


In [126]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(random_state=42), \
                  BaselineOnly()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'],verbose=False, cv=None)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.986511,0.011768,0.004819
SVD,0.987126,0.194321,0.004853


In [125]:
alg = SVD()

In [None]:
from tqdm import tqdm
tqdm(range(500))

In [141]:
benchmark = []
from collections import Counter
# Iterate over all algorithms
res = []
for i in range(200):
    for algorithm in [SVD(), SVD(biased=False),BaselineOnly()]:
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False,n_jobs=1)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
    res.append(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed

# Collaborative filtering with big_subset_temp

In [19]:
rated_df = pd.read_csv('../data/big_data_temp/rated_subset.csv')
rated_df = rated_df.drop(columns=['Unnamed: 0'])
rated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15964 entries, 0 to 15963
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   UserID  15964 non-null  int64  
 1   ISBN    15964 non-null  object 
 2   Rating  15964 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 374.3+ KB


In [20]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(rated_df[[ 'UserID', 'ISBN', 'Rating']], reader)

In [21]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), \
                  KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), \
                  BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,0.429138,0.188632,0.910873
NMF,0.431252,0.702704,0.039523
SVD,0.433805,0.549834,0.032656
SlopeOne,0.43793,0.027318,0.054716
KNNBasic,0.453964,0.15667,0.742809
SVDpp,0.478705,1.644053,0.091906
KNNWithZScore,0.480272,0.223333,0.834798
KNNWithMeans,0.482895,0.194819,0.789969
CoClustering,0.586723,0.393034,0.026035
BaselineOnly,0.845195,0.014169,0.037108


In [23]:
rated_df.head()

Unnamed: 0,UserID,ISBN,Rating
0,11051,1101984597,5.0
1,11051,1984819194,5.0
2,11051,1771642483,4.0
3,11051,1473637465,1.0
4,11051,076790818X,5.0


In [112]:
trainset, testset = train_test_split(data, test_size=0.2)

In [26]:
algorithm = SVD
param_grid = {'n_factors':[50,100,150], 'n_epochs':[20, 30, 50], 'reg_all':[0.02, 0.05]} #
gs = GridSearchCV(algorithm, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.3849518192567487
{'n_factors': 150, 'n_epochs': 50, 'reg_all': 0.02}


In [27]:
algorithm = SVD
param_grid = {'n_factors':[150, 200, 250], 'n_epochs':[50, 100], 'reg_all':[0.02]} #
gs = GridSearchCV(algorithm, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.38399501241434547
{'n_factors': 250, 'n_epochs': 100, 'reg_all': 0.02}


In [28]:
gs.cv_results

{'split0_test_rmse': array([0.38168126, 0.37803144, 0.37957745, 0.37603797, 0.38073918,
        0.37769602]),
 'split1_test_rmse': array([0.39697538, 0.39386702, 0.39900305, 0.39399337, 0.39773791,
        0.3930048 ]),
 'split2_test_rmse': array([0.38717092, 0.38034833, 0.38595682, 0.3819897 , 0.38486393,
        0.38128421]),
 'mean_test_rmse': array([0.38860919, 0.38408227, 0.38817911, 0.38400701, 0.38778034,
        0.38399501]),
 'std_test_rmse': array([0.00632608, 0.00698322, 0.00808465, 0.00746777, 0.00723963,
        0.00653713]),
 'rank_test_rmse': array([6, 3, 5, 2, 4, 1]),
 'split0_test_mae': array([0.14083571, 0.13190817, 0.13852581, 0.13126411, 0.13854284,
        0.13228698]),
 'split1_test_mae': array([0.15409551, 0.14424226, 0.15463209, 0.14448194, 0.15272265,
        0.14495462]),
 'split2_test_mae': array([0.14220542, 0.13193631, 0.1399277 , 0.13244258, 0.13982091,
        0.13251937]),
 'mean_test_mae': array([0.14571221, 0.13602891, 0.14436187, 0.13606288, 0.1436954

In [29]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd117141250>

# Collaborative filtering with big_subset_temp content based results.

In [34]:
all_ratings_df = pd.read_csv('../data/big_data_temp/predicted_ratings.csv')
all_ratings_df = all_ratings_df.drop(columns=["Unnamed: 0"])
all_ratings_df = all_ratings_df.dropna()

In [57]:
all_ratings_df['Rating'] = all_ratings_df['Rating'].apply(round)

In [58]:
X = np.array(all_ratings_df[['UserID','Rating']])
tup = np.where(np.isnan(X))

In [59]:
# for i,j in zip(tup[0], tup[1]):
#     print(X_tr_arr[i,j])
np.unique(tup[0])

array([], dtype=int64)

In [60]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(all_ratings_df[[ 'UserID', 'ISBN', 'Rating']], reader)

In [63]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SlopeOne(), NMF(), BaselineOnly()]:
    # SVDpp(),  NormalPredictor(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), CoClustering(), KNNBaseline()
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.436413,7.745279,0.653812
NMF,0.49638,7.657392,0.560803
SlopeOne,0.540203,0.272285,3.484848
BaselineOnly,0.548161,0.292718,0.436429


In [64]:
benchmark = []
# Iterate over all algorithms
for algorithm in [KNNBaseline()]:
    # SVDpp(),  NormalPredictor(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), 
    # CoClustering(), SVD(), SlopeOne(), NMF(), BaselineOnly()
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,0.414009,11.956725,140.519702


In [None]:
all_ratings_df.head()

In [112]:
trainset, testset = train_test_split(data, test_size=0.2)

In [65]:
algorithm = SVD
param_grid = {'n_factors':[100,150,250], 'n_epochs':[50, 100], 'reg_all':[0.02]} #
gs = GridSearchCV(algorithm, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.37082483714416714
{'n_factors': 250, 'n_epochs': 100, 'reg_all': 0.02}


In [28]:
gs.cv_results

{'split0_test_rmse': array([0.38168126, 0.37803144, 0.37957745, 0.37603797, 0.38073918,
        0.37769602]),
 'split1_test_rmse': array([0.39697538, 0.39386702, 0.39900305, 0.39399337, 0.39773791,
        0.3930048 ]),
 'split2_test_rmse': array([0.38717092, 0.38034833, 0.38595682, 0.3819897 , 0.38486393,
        0.38128421]),
 'mean_test_rmse': array([0.38860919, 0.38408227, 0.38817911, 0.38400701, 0.38778034,
        0.38399501]),
 'std_test_rmse': array([0.00632608, 0.00698322, 0.00808465, 0.00746777, 0.00723963,
        0.00653713]),
 'rank_test_rmse': array([6, 3, 5, 2, 4, 1]),
 'split0_test_mae': array([0.14083571, 0.13190817, 0.13852581, 0.13126411, 0.13854284,
        0.13228698]),
 'split1_test_mae': array([0.15409551, 0.14424226, 0.15463209, 0.14448194, 0.15272265,
        0.14495462]),
 'split2_test_mae': array([0.14220542, 0.13193631, 0.1399277 , 0.13244258, 0.13982091,
        0.13251937]),
 'mean_test_mae': array([0.14571221, 0.13602891, 0.14436187, 0.13606288, 0.1436954

In [67]:
# We can now use the algorithm that yields the best rmse:
algo_n = gs.best_estimator['rmse']
algo_n.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd0e2adbe50>

### Let's compare those algorythms.

In [92]:
test_df = pd.read_csv('../data/big_data_temp/test_reviews.csv')

In [86]:
reader = Reader(rating_scale=(1, 5))
test_data = Dataset.load_from_df(test_df[[ 'UserID', 'ISBN', 'Rating']], reader)

In [83]:
# test_file = '../data/big_data_temp/test_reviews.csv'
# train_file = '../data/big_data_temp/rated_subset.csv'
# folds_files = [(train_file, test_file)]
# test_data = Dataset.load_from_folds(folds_files, reader=reader)

In [87]:
# pkf = PredefinedKFold()

# for trainset, testset in pkf.split(data):
#     predictions = algo.test(testset)
#     accuracy.rmse(predictions, verbose=True)

In [89]:
test_data

<surprise.dataset.DatasetAutoFolds at 0x7fd0e5e2a090>

In [93]:
test_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ISBN,UserID,Rating
0,4226,103,1101984597,3243,5
1,54272,207,1101870532,5549,5
2,84172,141,60845503,1881,4
3,80749,203,1416588256,2380,2
4,20505,169,345409469,2110,3
...,...,...,...,...,...
17030,55482,227,393354113,1021,5
17031,23849,269,081298840X,9645,4
17032,7431,107,125006953X,7336,4
17033,16296,173,60935588,1732,3


In [90]:
# predictions = algo.test(test_data)
# accuracy.rmse(predictions, verbose=True)

In [101]:
algo_n.predict(9645, '60845503', 4)

Prediction(uid=9645, iid='60845503', r_ui=4, est=3.7419118813801373, details={'was_impossible': False})

In [None]:
def predictions(uid, iid, r_ui)
    pred = algo.predict(uid, iid, r_ui, verbose=True)
    return pred

In [116]:
algo.predict(3243, 60845503, 5)

Prediction(uid=3243, iid=60845503, r_ui=5, est=4.124889345911292, details={'was_impossible': False})

In [119]:
algo_n.predict(3243, 393354113)

Prediction(uid=3243, iid=393354113, r_ui=None, est=3.605578214911806, details={'was_impossible': False})

In [105]:
test_df['Rating_pred_algo'] = 3.9159358556752695
test_df['Rating_pred_algo_n'] = 3.7419118813801373

In [152]:
test_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ISBN,UserID,Rating,Rating_pred_algo,Rating_pred_algo_n
0,4226,103,1101984597,3243,5,4.977582,3.741912
1,54272,207,1101870532,5549,5,3.915936,3.741912
2,84172,141,60845503,1881,4,3.915936,3.741912
3,80749,203,1416588256,2380,2,3.915936,3.741912
4,20505,169,345409469,2110,3,3.915936,3.741912
...,...,...,...,...,...,...,...
17030,55482,227,393354113,1021,5,3.915936,3.741912
17031,23849,269,081298840X,9645,4,3.915936,3.741912
17032,7431,107,125006953X,7336,4,3.915936,3.741912
17033,16296,173,60935588,1732,3,3.915936,3.741912


In [156]:
for user, isbn in zip(list(test_df['UserID']), list(test_df['ISBN'])):
    prediction = algo_n.predict(user, isbn).est
    test_df.loc[(test_df['UserID']==user) & (test_df['ISBN']==isbn), ['Rating_pred_algo_n']] = prediction

In [139]:
user = test_df['UserID'][0]
print(type(user))
isbn = test_df['ISBN'][0]
print(type(isbn))
prediction = algo.predict(user, isbn).est
# print(prediction.est)
test_df.loc[(test_df['UserID']==3243) & (test_df['ISBN']=='1101984597'),['Rating_pred_algo']] = prediction
(test_df['UserID']==3243) & (test_df['ISBN']=='1101984597') 
test_df.loc[:,['Rating_pred_algo']]

In [158]:
mse = mean_squared_error(np.array(test_df['Rating']), np.array(test_df['Rating_pred_algo']))

In [159]:
mse_n = mean_squared_error(np.array(test_df['Rating']), np.array(test_df['Rating_pred_algo_n']))

In [161]:
mse** (0.5)

0.9305592398483955

In [162]:
mse_n** (0.5)

0.9707834554193514