In [73]:
import pandas as pd
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.prediction_algorithms import knns, SVD, SVDpp, SlopeOne, NMF, BaselineOnly, NormalPredictor
from surprise.similarities import cosine, msd, pearson, pearson_baseline

In [2]:
df_rev5 = pd.read_csv('Data/df_rev5.csv')

In [3]:
df_rev5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1302158 entries, 0 to 1302157
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1302158 non-null  int64 
 1   overall     1302158 non-null  int64 
 2   reviewTime  1302158 non-null  object
 3   reviewerID  1302158 non-null  object
 4   asin        1302158 non-null  object
 5   reviewText  1302158 non-null  object
 6   summary     1302158 non-null  object
dtypes: int64(2), object(5)
memory usage: 69.5+ MB


In [74]:
reader = Reader(rating_scale=(1, 5))
user_data = Dataset.load_from_df(df_rev5[['reviewerID', 'asin', 'overall']], reader)

trainset, testset = train_test_split(user_data, test_size=0.2, random_state=42)

In [75]:
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  97580 

Number of items:  92260 



In [6]:
baseline = NormalPredictor()
baseline.fit(trainset)
predictions = baseline.test(testset)
print(accuracy.rmse(predictions))

RMSE: 1.2320
1.2319696287657125


In [7]:
baseline2 = BaselineOnly()
baseline2.fit(trainset)
predictions = baseline2.test(testset)
print(accuracy.rmse(predictions))

Estimating biases using als...
RMSE: 0.8144
0.8143691015224391


In [None]:
#sim_cos = {'name':'cosine', 'user_based':True}
#basic = knns.KNNBasic(k=137, sim_options=sim_cos)
#basic.fit(trainset)
#predictions = basic.test(testset)
#print(accuracy.rmse(predictions))

In [None]:
#sim_pearson = {'name':'pearson', 'user_based':True}
#basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
#basic_pearson.fit(trainset)
#predictions = basic_pearson.test(testset)
#print(accuracy.rmse(predictions))

In [None]:
#sim_pearson = {'name':'pearson', 'user_based':False}
#basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
#basic_pearson.fit(trainset)
#predictions = basic_pearson.test(testset)
#print(accuracy.rmse(predictions))

In [None]:
#sim_pearson = {'name':'pearson', 'user_based':True}
#knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
#knn_baseline.fit(trainset)
#predictions = knn_baseline.test(testset)
#print(accuracy.rmse(predictions))

In [None]:
#sim_pearson = {'name':'pearson', 'user_based':False}
#knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
#knn_baseline.fit(trainset)
#predictions = knn_baseline.test(testset)
#print(accuracy.rmse(predictions))

In [8]:
svd_basic = SVD(random_state=42)
results = cross_validate(svd_basic, user_data, measures=['RMSE'], cv=3, n_jobs = -1, verbose=True)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8136  0.8143  0.8130  0.8136  0.0005  
Fit time          46.28   46.18   46.19   46.21   0.05    
Test time         4.16    4.29    4.19    4.22    0.06    


In [10]:
results

{'test_rmse': array([0.81580095, 0.81235495, 0.81329003]),
 'fit_time': (46.68087387084961, 46.96303391456604, 46.75060248374939),
 'test_time': (4.285319566726685, 4.280826807022095, 4.181398868560791)}

In [9]:
svd_basic.fit(trainset)
predictions = svd_basic.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.8031
0.8031025653579473


In [10]:
svd_param_grid = {'n_factors':[20, 40],
                  'n_epochs': [10, 20], 
                  'lr_all': [0.002, 0.005],
                  'reg_all': [0.2 ,0.4, 0.6],
                  'biased': [True, False]}
svd_gs_model = GridSearchCV(SVD,param_grid=svd_param_grid,joblib_verbose=5,cv=3)
svd_gs_model.fit(user_data)
svd_gs_model.best_params['rmse']

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   29.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   43.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   56.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 49.8min finished


{'n_factors': 20,
 'n_epochs': 20,
 'lr_all': 0.005,
 'reg_all': 0.2,
 'biased': True}

In [11]:
svd_model = SVD(n_factors=20, n_epochs=20, lr_all=.005, reg_all=0.2)
svd_model.fit(trainset)
predictions = svd_model.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.7970
0.7970269869132567


In [13]:
svd_param_grid2 = {'n_factors':[5, 10, 20],
                  'n_epochs': [20, 30, 40], 
                  'lr_all': [0.5, 0.05, .005],
                  'reg_all': [0.1, 0.2]}
svd_gs2_model = GridSearchCV(SVD,param_grid=SVD_param_grid2,joblib_verbose=5, cv=3)

In [14]:
svd_gs2_model.fit(user_data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   39.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   58.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done 162 out of 162 | elapsed: 83.0min finished


In [15]:
svd_gs2_model.best_params['rmse']

{'n_factors': 5, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.1}

In [72]:
svd2_model = SVD(n_factors=5, n_epochs=40, lr_all=0.005, reg_all=0.1, random_state=42)
svd2_model.fit(trainset)
predictions = svd2_model.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.7849
0.784931149849157


In [69]:
trainset_full = user_data.build_full_trainset()
svd2_model.fit(trainset_full)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1268d569160>

In [12]:
df_meta = pd.read_csv('Data/meta.csv', index_col='asin')
df_meta.drop(columns =['Unnamed: 0'], inplace=True)

In [27]:
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93547 entries, B0012GTZCK to B01HJCNGZ2
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         93547 non-null  object 
 1   brand         93547 non-null  object 
 2   genre         93547 non-null  object 
 3   print_length  93547 non-null  float64
 4   word_wise     93547 non-null  int64  
 5   lending       93547 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 7.5+ MB


In [16]:
df_meta.head()

Unnamed: 0_level_0,title,brand,genre,print_length,word_wise,lending
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0012GTZCK,God in the White House,Randall Balmer,History,260.0,1,0
B0012LHGJ4,Vingt Mille Lieues sous les mers (French Editi...,Jules Verne,Reference,606.0,0,1
B0012RMVH0,Wood-Carving Design and Workmanship - Kindle e...,George Jack,Arts & Photography,310.0,0,0
B0012TAD1O,Souls Night (The Vampire Pacts) - Kindle edition,Kallysten,Romance,74.0,1,1
B0012U0NKE,The Misplaced Horse (1) eBook,Constance Downes,"Comics, Manga & Graphic Novels",444.0,0,1


In [10]:
df_user = df_rev5.set_index('reviewerID')
df_user.drop(columns=['Unnamed: 0', 'reviewTime', 'reviewText', 'summary'], inplace=True)

In [11]:
df_user

Unnamed: 0_level_0,overall,asin
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AYKIGSPZ8TLR7,5,B0012LHGJ4
AV4TJUAJJL3IV,1,B0012RMVH0
AQR816QPFU5JE,5,B0012RMVH0
A1KURTCUDMSNF4,3,B0012RMVH0
A3GDLS2QG56059,1,B0012RMVH0
...,...,...
A1M2UIBQGKEUAU,5,B01HJCNGZ2
A2GCEOF7KVQK1U,4,B01HJCNGZ2
A3ETWQJ8WRQFHX,5,B01HJCNGZ2
ARKWPH17ITFGS,5,B01HJCNGZ2


In [None]:
df_meta = pd.read_csv('Data/meta.csv', index_col='asin')
df_meta.drop(columns =['Unnamed: 0'], inplace=True)

In [23]:
have_reviewed = list(df_user.loc['A9C16P3CZXWS5', 'asin'])

In [24]:
have_reviewed

['B00G5Q6MI2', 'B00KVOCLXE', 'B00MQ999AK', 'B00U4F067I', 'B005G3ZB9Q']

In [40]:
not_reviewed = df_meta.copy()
not_reviewed.drop(have_reviewed, inplace=True)
not_reviewed.reset_index(inplace=True)

In [41]:
not_reviewed['Est_rating'] = not_reviewed['asin'].apply(lambda x: svd2_model.predict('A9C16P3CZXWS5', x).est)

In [42]:
not_reviewed

Unnamed: 0,asin,title,brand,genre,print_length,word_wise,lending,Est_rating
0,B0012GTZCK,God in the White House,Randall Balmer,History,260.0,1,0,4.719201
1,B0012LHGJ4,Vingt Mille Lieues sous les mers (French Editi...,Jules Verne,Reference,606.0,0,1,4.957987
2,B0012RMVH0,Wood-Carving Design and Workmanship - Kindle e...,George Jack,Arts & Photography,310.0,0,0,3.379050
3,B0012TAD1O,Souls Night (The Vampire Pacts) - Kindle edition,Kallysten,Romance,74.0,1,1,4.841767
4,B0012U0NKE,The Misplaced Horse (1) eBook,Constance Downes,"Comics, Manga & Graphic Novels",444.0,0,1,4.822369
...,...,...,...,...,...,...,...,...
93537,B01HJA2D80,B.Oar Guests (Caverns and Creatures) - Kindle ...,Robert Bevan,Humor & Entertainment,38.0,1,1,4.611595
93538,B01HJBPMQY,Lord of the Jungle: An Erotic Adventure (Jungl...,Sheri Fredricks,Literature & Fiction,56.0,1,1,5.000000
93539,B01HJBPUWA,Chronicle Worlds,Samuel Peralta,Science Fiction & Fantasy,383.0,1,1,4.939319
93540,B01HJC63YI,The Gardella Vampire Hunters Starter Set: Vict...,Colleen Gleason,Romance,852.0,1,1,4.947419


In [26]:
not_reviewed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93542 entries, B0012GTZCK to B01HJCNGZ2
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         93542 non-null  object 
 1   brand         93542 non-null  object 
 2   genre         93542 non-null  object 
 3   print_length  93542 non-null  float64
 4   word_wise     93542 non-null  int64  
 5   lending       93542 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 5.0+ MB


In [None]:
titles['Estimate_Score'] = titles['Movie_Id'].apply(lambda x: svd.predict(785314, x).est)

In [70]:
def User_recommend_books(reviewerID):
    have_reviewed = list(df_user.loc[reviewerID, 'asin'])
    not_reviewed = df_meta.copy()
    not_reviewed.drop(have_reviewed, inplace=True)
    not_reviewed.reset_index(inplace=True)
    not_reviewed['est_rating'] = not_reviewed['asin'].apply(lambda x: svd2_model.predict('A9C16P3CZXWS5', x).est)
    not_reviewed.sort_values(by='est_rating', ascending=False, inplace=True)
    return not_reviewed.head(10)

In [71]:
User_recommend_books('A3IQ0P3M39IY8U')

Unnamed: 0,asin,title,brand,genre,print_length,word_wise,lending,est_rating
55159,B00QEGRSL8,A Knight&#39;s Christmas Wish: A Medieval Nove...,Lana Williams,Romance,116.0,1,1,5.0
26535,B00EWSKX8K,"The Essential Dickens: A Tale of Two Cities, A...",Charles Dickens,Literature & Fiction,3486.0,1,1,5.0
23312,B00DLJPM9G,Killing Bliss: The Bliss Legacy - Book 1 - Kin...,EC Sheedy,"Mystery, Thriller & Suspense",272.0,1,1,5.0
32611,B00HE37QX0,"Blood, Fire, and Thorn (Harbinger of Doom - V...",Glenn G. Thater,Literature & Fiction,290.0,1,1,5.0
72109,B013PWN2Q8,"Amish Country Tours 2 (Amish Country Tours, Am...",Rachel Stoltzfus,Religion & Spirituality,187.0,1,1,5.0
21048,B00CNQ7HLO,Paladins of Shannara: The Black Irix (Short St...,Terry Brooks,Literature & Fiction,44.0,1,0,5.0
32606,B00HE0FPFE,Santa and the Border Collie - Kindle edition,Angelo Dirks,Literature & Fiction,20.0,1,1,5.0
89064,B01ENNZCQQ,Loose Lips: Dusty Deals Mystery Series: Book 5...,Rae Davies,"Mystery, Thriller & Suspense",220.0,1,1,5.0
3848,B005ES9MHU,Unruly Magic (Stella Mayweather Series Book 2...,Camilla Chafer,Science Fiction & Fantasy,354.0,1,1,5.0
38382,B00J9KEBFC,Trumpet&#39;s Song (Book 3 of The Eden Projec...,DP Fitzsimons,Science Fiction & Fantasy,268.0,1,1,5.0
