In [4]:
import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import Reader

In [14]:
user = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
user.columns = ['userID', 'Location', 'Age']
rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
rating.columns = ['userID', 'ISBN', 'bookRating']
df = pd.merge(user, rating, on='userID', how='inner')
df.drop(['Location', 'Age'], axis=1, inplace=True)
df.head()

Unnamed: 0,userID,ISBN,bookRating
0,2,195153448,0
1,7,34542252,0
2,8,2005018,5
3,8,60973129,0
4,8,374157065,0


In [15]:
df.groupby('ISBN')['bookRating'].count().reset_index().sort_values('bookRating', ascending=False)[:10]

Unnamed: 0,ISBN,bookRating
247408,0971880107,2502
47371,0316666343,1295
83359,0385504209,883
9637,0060928336,732
41007,0312195516,723
101670,044023722X,647
166705,0679781587,639
28153,0142001740,615
166434,067976402X,614
153620,0671027360,586


In [16]:
df.groupby('userID')['bookRating'].count().reset_index().sort_values('bookRating', ascending=False)[:10]

Unnamed: 0,userID,bookRating
4213,11676,13602
74815,198711,7550
58113,153662,6109
37356,98391,5891
13576,35859,5850
80185,212898,4785
105111,278418,4533
28884,76352,3367
42037,110973,3100
88584,235105,3067


In [17]:
min_book_ratings = 50
filter_books = df['ISBN'].value_counts() > min_book_ratings
filter_books = filter_books[filter_books].index.tolist()

min_user_ratings = 50
filter_users = df['userID'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

df_new = df[(df['ISBN'].isin(filter_books)) & (df['userID'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(df.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(1149780, 3)
The new data frame shape:	(140516, 3)


In [18]:
df_new

Unnamed: 0,userID,ISBN,bookRating
394,243,0060915544,10
395,243,0060977493,7
397,243,0156006529,0
400,243,0316096199,0
401,243,0316601950,9
...,...,...,...
1149714,278843,0679412956,8
1149716,278843,0684874350,0
1149722,278843,0767902890,9
1149723,278843,0786881852,8


### Surprise

In [19]:
reader = Reader(rating_scale=(0, 9))

In [10]:
data = Dataset.load_from_df(df_new, reader)

In [32]:
from surprise.prediction_algorithms.matrix_factorization import SVDpp
import surprise.prediction_algorithms as spa
#from surprise.prediction_algorithms import SlopeOne
#from surprise.prediction_algorithms import NMF
#from surprise.prediction_algorithms.matrix_factorization import NormalPredictor

In [33]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), spa.SlopeOne(), spa.NMF(), spa.NormalPredictor(), spa.KNNBaseline(), spa.KNNBasic(), spa.KNNWithMeans(), spa.KNNWithZScore(), spa.BaselineOnly(), spa.CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SlopeOne,0.502765,0.000108,4e-05
KNNWithMeans,0.596635,0.000304,4.6e-05
KNNWithZScore,1.239942,0.000388,5.6e-05
KNNBasic,1.287702,0.00011,5e-05
CoClustering,1.329494,0.001069,2.8e-05
NMF,1.366617,0.000781,3.8e-05
SVDpp,1.375959,0.000797,0.000182
BaselineOnly,1.382317,0.000378,1.9e-05
KNNBaseline,1.508633,0.000234,4.4e-05
NormalPredictor,1.562811,4e-05,2.7e-05


In [34]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

NameError: name 'BaselineOnly' is not defined