In [1]:
import pandas as pd
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.prediction_algorithms import knns, SVD, SVDpp, SlopeOne, NMF, BaselineOnly, NormalPredictor
from surprise.similarities import cosine, msd, pearson, pearson_baseline

In [2]:
# Load the reviews table from a CSV under the relative `Data/` directory.
df_rev5 = pd.read_csv('Data/df_rev5.csv')

In [3]:
df_rev5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1302158 entries, 0 to 1302157
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1302158 non-null  int64 
 1   overall     1302158 non-null  int64 
 2   reviewTime  1302158 non-null  object
 3   reviewerID  1302158 non-null  object
 4   asin        1302158 non-null  object
 5   reviewText  1302158 non-null  object
 6   summary     1302158 non-null  object
dtypes: int64(2), object(5)
memory usage: 69.5+ MB


In [4]:
# Ratings run from 1 to 5; Surprise is handed the (user, item, rating) columns in that order.
reader = Reader(rating_scale=(1, 5))
user_data = Dataset.load_from_df(
    df=df_rev5.loc[:, ['reviewerID', 'asin', 'overall']],
    reader=reader,
)

# Keep 20% of the ratings aside for evaluation; the fixed seed makes the split repeatable.
trainset, testset = train_test_split(data=user_data, test_size=0.2, random_state=42)

In [5]:
# Report how many users and items the training split contains.
for label, count in (
    ('Number of users: ', trainset.n_users),
    ('Number of items: ', trainset.n_items),
):
    print(label, count, '\n')

Number of users:  97580 

Number of items:  92260 



In [12]:
# Reference point: score Surprise's NormalPredictor on the held-out ratings.
baseline = NormalPredictor()
# fit() hands back the fitted algorithm, so training and scoring can be chained.
predictions = baseline.fit(trainset).test(testset)
print(accuracy.rmse(predictions))

RMSE: 1.2377
1.2377436216451305


In [11]:
# Second reference point: BaselineOnly, scored on the same held-out ratings.
baseline2 = BaselineOnly()
# fit() hands back the fitted algorithm, so training and scoring can be chained.
predictions = baseline2.fit(trainset).test(testset)
print(accuracy.rmse(predictions))

Estimating biases using als...
RMSE: 0.8201
0.8201106073043494


In [None]:
#sim_cos = {'name':'cosine', 'user_based':True}
#basic = knns.KNNBasic(k=137, sim_options=sim_cos)
#basic.fit(trainset)
#predictions = basic.test(testset)
#print(accuracy.rmse(predictions))

In [None]:
#sim_pearson = {'name':'pearson', 'user_based':True}
#basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
#basic_pearson.fit(trainset)
#predictions = basic_pearson.test(testset)
#print(accuracy.rmse(predictions))

In [None]:
#sim_pearson = {'name':'pearson', 'user_based':False}
#basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
#basic_pearson.fit(trainset)
#predictions = basic_pearson.test(testset)
#print(accuracy.rmse(predictions))

In [None]:
#sim_pearson = {'name':'pearson', 'user_based':True}
#knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
#knn_baseline.fit(trainset)
#predictions = knn_baseline.test(testset)
#print(accuracy.rmse(predictions))

In [None]:
#sim_pearson = {'name':'pearson', 'user_based':False}
#knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
#knn_baseline.fit(trainset)
#predictions = knn_baseline.test(testset)
#print(accuracy.rmse(predictions))

In [13]:
# SVD with library-default hyperparameters (only the seed is set), 3-fold cross-validated on RMSE.
svd_basic = SVD(random_state=42)
results = cross_validate(
    algo=svd_basic,
    data=user_data,
    measures=['RMSE'],
    cv=3,
    n_jobs=-1,
    verbose=True,
)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8132  0.8134  0.8136  0.8134  0.0002  
Fit time          46.46   46.93   46.52   46.63   0.21    
Test time         4.22    4.30    4.18    4.23    0.05    


In [14]:
results

{'test_rmse': array([0.81315846, 0.81341402, 0.81357441]),
 'fit_time': (46.45669507980347, 46.92526960372925, 46.51591205596924),
 'test_time': (4.216390609741211, 4.303755044937134, 4.177614212036133)}

In [15]:
# Score the same default SVD on the fixed hold-out split so it is comparable with the baselines.
predictions = svd_basic.fit(trainset).test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.8084
0.8084230438494829


In [10]:
# First SVD search: 2 * 2 * 2 * 3 * 2 = 48 parameter combinations, each evaluated on 3 folds.
svd_param_grid = dict(
    n_factors=[20, 40],
    n_epochs=[10, 20],
    lr_all=[0.002, 0.005],
    reg_all=[0.2, 0.4, 0.6],
    biased=[True, False],
)
svd_gs_model = GridSearchCV(SVD, param_grid=svd_param_grid, cv=3, joblib_verbose=5)
svd_gs_model.fit(user_data)

# Parameter set with the lowest cross-validated RMSE.
svd_gs_model.best_params['rmse']

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   29.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   43.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   56.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 49.8min finished


{'n_factors': 20,
 'n_epochs': 20,
 'lr_all': 0.005,
 'reg_all': 0.2,
 'biased': True}

In [16]:
# SVD configured with the values reported by the first grid search, scored on the hold-out split.
svd_model = SVD(
    n_factors=20,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.2,
    random_state=42,
)
predictions = svd_model.fit(trainset).test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.8026
0.8025831221469396


In [13]:
# Second SVD search over smaller factor counts and more epochs:
# 3 * 3 * 3 * 2 = 54 parameter combinations, each evaluated on 3 folds.
svd_param_grid2 = {'n_factors': [5, 10, 20],
                   'n_epochs': [20, 30, 40],
                   'lr_all': [0.5, 0.05, .005],
                   'reg_all': [0.1, 0.2]}

# The grid variable is lower-case `svd_param_grid2`; the former `SVD_param_grid2`
# raised NameError on a fresh kernel, so this cell never produced a search object.
svd_gs2_model = GridSearchCV(SVD, param_grid=svd_param_grid2, joblib_verbose=5, cv=3)

# The search itself is run by the `svd_gs2_model.fit(user_data)` cell that follows and
# its winner is read in the cell after that; repeating both here would double a
# search that already takes over an hour.

In [14]:
svd_gs2_model.fit(user_data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   39.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   58.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done 162 out of 162 | elapsed: 83.0min finished


In [15]:
svd_gs2_model.best_params['rmse']

{'n_factors': 5, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.1}

In [17]:
# SVD configured with the values reported by the second grid search, scored on the hold-out split.
svd2_model = SVD(
    n_factors=5,
    n_epochs=40,
    lr_all=0.005,
    reg_all=0.1,
    random_state=42,
)
predictions = svd2_model.fit(trainset).test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.7868
0.7867689448520152


In [18]:
# Refit the second tuned SVD on every rating in `user_data` (no hold-out); this refitted
# model is the one the recommendation cells below call `predict` on.
trainset_full = user_data.build_full_trainset()
svd2_model.fit(trainset_full)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1dbacd251f0>

In [19]:
# Book metadata indexed by `asin`; the leftover 'Unnamed: 0' column from the CSV is discarded.
df_meta5 = (
    pd.read_csv('Data/meta5.csv', index_col='asin')
    .drop(columns=['Unnamed: 0'])
)

In [20]:
df_meta5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93547 entries, B0012GTZCK to B01HJCNGZ2
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         93547 non-null  object 
 1   brand         93547 non-null  object 
 2   genre         93547 non-null  object 
 3   print_length  93547 non-null  float64
 4   word_wise     93547 non-null  int64  
 5   lending       93547 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 5.0+ MB


In [21]:
df_meta5.head()

Unnamed: 0_level_0,title,brand,genre,print_length,word_wise,lending
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0012GTZCK,God in the White House,Randall Balmer,History,260.0,1,0
B0012LHGJ4,Vingt Mille Lieues sous les mers (French Editi...,Jules Verne,Reference,606.0,0,1
B0012RMVH0,Wood-Carving Design and Workmanship - Kindle e...,George Jack,Arts & Photography,310.0,0,0
B0012TAD1O,Souls Night (The Vampire Pacts) - Kindle edition,Kallysten,Romance,74.0,1,1
B0012U0NKE,The Misplaced Horse (1) eBook,Constance Downes,"Comics, Manga & Graphic Novels",444.0,0,1


In [22]:
# Ratings indexed by reviewer, keeping only the rating (`overall`) and the item id (`asin`).
df_user = (
    df_rev5
    .set_index('reviewerID')
    .drop(columns=['Unnamed: 0', 'reviewTime', 'reviewText', 'summary'])
)

In [23]:
df_user

Unnamed: 0_level_0,overall,asin
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AYKIGSPZ8TLR7,5,B0012LHGJ4
AV4TJUAJJL3IV,1,B0012RMVH0
AQR816QPFU5JE,5,B0012RMVH0
A1KURTCUDMSNF4,3,B0012RMVH0
A3GDLS2QG56059,1,B0012RMVH0
...,...,...
A1M2UIBQGKEUAU,5,B01HJCNGZ2
A2GCEOF7KVQK1U,4,B01HJCNGZ2
A3ETWQJ8WRQFHX,5,B01HJCNGZ2
ARKWPH17ITFGS,5,B01HJCNGZ2


In [24]:
# ASINs already rated by one sample reviewer (the ID is hard-coded for this walkthrough).
have_reviewed = list(df_user.loc['A9C16P3CZXWS5', 'asin'])

In [25]:
have_reviewed

['B00G5Q6MI2', 'B00KVOCLXE', 'B00MQ999AK', 'B00U4F067I', 'B005G3ZB9Q']

In [26]:
# Candidate books: the catalogue minus everything in `have_reviewed`,
# with `asin` moved from the index back to a regular column.
not_reviewed = df_meta5.drop(have_reviewed).reset_index()

In [27]:
# Predict a rating for every candidate book for the same hard-coded reviewer,
# keeping only the `.est` field of each Surprise prediction.
not_reviewed['Est_rating'] = not_reviewed['asin'].apply(lambda x: svd2_model.predict('A9C16P3CZXWS5', x).est)

In [28]:
not_reviewed

Unnamed: 0,asin,title,brand,genre,print_length,word_wise,lending,Est_rating
0,B0012GTZCK,God in the White House,Randall Balmer,History,260.0,1,0,4.719201
1,B0012LHGJ4,Vingt Mille Lieues sous les mers (French Editi...,Jules Verne,Reference,606.0,0,1,4.957987
2,B0012RMVH0,Wood-Carving Design and Workmanship - Kindle e...,George Jack,Arts & Photography,310.0,0,0,3.379050
3,B0012TAD1O,Souls Night (The Vampire Pacts) - Kindle edition,Kallysten,Romance,74.0,1,1,4.841767
4,B0012U0NKE,The Misplaced Horse (1) eBook,Constance Downes,"Comics, Manga & Graphic Novels",444.0,0,1,4.822369
...,...,...,...,...,...,...,...,...
93537,B01HJA2D80,B.Oar Guests (Caverns and Creatures) - Kindle ...,Robert Bevan,Humor & Entertainment,38.0,1,1,4.611595
93538,B01HJBPMQY,Lord of the Jungle: An Erotic Adventure (Jungl...,Sheri Fredricks,Literature & Fiction,56.0,1,1,5.000000
93539,B01HJBPUWA,Chronicle Worlds,Samuel Peralta,Science Fiction & Fantasy,383.0,1,1,4.939319
93540,B01HJC63YI,The Gardella Vampire Hunters Starter Set: Vict...,Colleen Gleason,Romance,852.0,1,1,4.947419


In [26]:
not_reviewed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93542 entries, B0012GTZCK to B01HJCNGZ2
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         93542 non-null  object 
 1   brand         93542 non-null  object 
 2   genre         93542 non-null  object 
 3   print_length  93542 non-null  float64
 4   word_wise     93542 non-null  int64  
 5   lending       93542 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 5.0+ MB


In [33]:
def user_recommend_books(user=None, n_recs=None, *, model=None, reviews=None, catalog=None):
    """Return the books with the highest predicted rating that a reviewer has not rated yet.

    Parameters
    ----------
    user : str, optional
        Reviewer ID. Prompted for with ``input`` when omitted, as before.
    n_recs : int, optional
        Number of rows to return. Prompted for with ``input`` when omitted.
    model : optional
        Fitted object exposing ``predict(user, item).est``; defaults to the
        notebook-level ``svd2_model``.
    reviews : pandas.DataFrame, optional
        Ratings indexed by reviewer ID with an ``asin`` column; defaults to the
        notebook-level ``df_user``.
    catalog : pandas.DataFrame, optional
        Book metadata indexed by ``asin``; defaults to the notebook-level
        ``df_meta5``. It is never modified.

    Returns
    -------
    pandas.DataFrame
        The ``n_recs`` unrated catalogue rows with the largest ``est_rating``,
        sorted in descending order of that column.

    Raises
    ------
    KeyError
        If ``user`` does not appear in ``reviews``.
    ValueError
        If ``n_recs`` is negative (or the typed count is not an integer).
    """
    # Resolve the notebook-level defaults lazily so explicit arguments never touch globals.
    model = svd2_model if model is None else model
    reviews = df_user if reviews is None else reviews
    catalog = df_meta5 if catalog is None else catalog

    # Interactive fallback keeps the original zero-argument call working unchanged.
    if user is None:
        user = input('UserId: ')
    if n_recs is None:
        n_recs = int(input('How many recommendations? '))
    if n_recs < 0:
        # head() with a negative count means "all but the last n", which is never wanted here.
        raise ValueError(f'n_recs must be non-negative, got {n_recs}')

    if user not in reviews.index:
        raise KeyError(f'Unknown reviewerID: {user!r}')

    # A list selector always yields a Series, even for a reviewer with a single
    # rating (a scalar selector would return one string and split it into characters).
    have_reviewed = reviews.loc[[user], 'asin'].unique()

    # errors='ignore' tolerates rated ASINs that are missing from the catalogue.
    not_reviewed = catalog.drop(index=have_reviewed, errors='ignore').reset_index()
    not_reviewed['est_rating'] = not_reviewed['asin'].map(
        lambda asin: model.predict(user, asin).est
    )
    return not_reviewed.sort_values(by='est_rating', ascending=False).head(n_recs)

In [36]:
user_recommend_books()

UserId: A2J8ASQMHD2TH
How many recommendations? 10


Unnamed: 0,asin,title,brand,genre,print_length,word_wise,lending,est_rating
68655,B00Z7KJUCW,Home and Away - Kindle edition,Samantha Wayland,Literature & Fiction,402.0,1,1,5.0
51092,B00O2ZX1LE,When You Were Pixels (Syntax Book 1) - Kindle ...,Julio Alexi Genao,Literature & Fiction,41.0,1,1,5.0
50982,B00O28EJPI,Explicit Memory - Kindle edition,Scarlett Finn,Romance,376.0,1,1,5.0
59507,B00SUPS1OY,Entropy&#39;s End (Targon Tales - Sethran Boo...,Chris Reher,Science Fiction & Fantasy,247.0,1,1,5.0
41606,B00KB1E4WI,The Fight for Us (Bristol Island Standalone Bo...,Elizabeth Finn,Literature & Fiction,345.0,1,1,5.0
6795,B006MV3AQ2,Weekend Homesteader: January - Kindle edition,Anna Hess,"Crafts, Hobbies & Home",44.0,1,1,5.0
76547,B016DZ4T5C,Calm &amp; Storm (The Night Horde SoCal Book 6...,Susan Fanetti,Literature & Fiction,440.0,1,1,5.0
74433,B0150GDJN8,Training to Love It: A Hotwife Romance - Kindl...,Kenny Wright,Literature & Fiction,224.0,1,1,5.0
14920,B00APEDT9K,Santa Reads Romance - Kindle edition,Dara Joy,Romance,78.0,1,1,5.0
39430,B00JIUR2TA,The Sacketts Volume Two 12-Book Bundle - Kindl...,Louis L'Amour,Literature & Fiction,2496.0,1,0,5.0


In [37]:
user_recommend_books()

UserId: A2UNMDJYXPEQZ3
How many recommendations? 10


Unnamed: 0,asin,title,brand,genre,print_length,word_wise,lending,est_rating
46057,B00M64QIWW,John &amp; Jackie - Kindle edition,TJ Klune,Literature & Fiction,95.0,1,1,4.854224
50432,B00O0AG24Y,Inspector Hobbes and the Gold Diggers: Comedy ...,Wilkie Martin,"Mystery, Thriller & Suspense",329.0,1,1,4.790772
2993,B0053YAZ58,Hornblower Addendum - Five Stories (Hornblower...,C. S. Forester,Literature & Fiction,79.0,1,1,4.756611
58319,B00SEKT3M4,The Player (Rockliffe Book 3) - Kindle edition,Stella Riley,Literature & Fiction,327.0,1,1,4.75608
6115,B006BG7Q1I,Love Intertwined Vol. 1 - Kindle edition,Pepper Pace,Literature & Fiction,222.0,1,1,4.747956
75200,B015WXSNFI,Holy Spy: A John Shakespeare Mystery - Kindle ...,Rory Clements,"Mystery, Thriller & Suspense",485.0,1,0,4.747524
18378,B00BR70E2E,Captain Lacey Regency Mysteries Volume Two - K...,Ashley Gardner,"Mystery, Thriller & Suspense",899.0,1,1,4.740167
14566,B00AMNT24K,Princess Ahira,K. M. Shea,Children's eBooks,262.0,1,1,4.738927
72325,B013WZRWMS,Ships and Stings and Wedding Rings (The Chron...,Jodi Taylor,Science Fiction & Fantasy,51.0,1,0,4.735658
13840,B00AEEBDRG,Expectations - Kindle edition,Frances Murray,Literature & Fiction,369.0,1,1,4.728528


In [38]:
user_recommend_books()

UserId: A3IQ0P3M39IY8U
How many recommendations? 10


Unnamed: 0,asin,title,brand,genre,print_length,word_wise,lending,est_rating
46063,B00M64QIWW,John &amp; Jackie - Kindle edition,TJ Klune,Literature & Fiction,95.0,1,1,4.910841
50438,B00O0AG24Y,Inspector Hobbes and the Gold Diggers: Comedy ...,Wilkie Martin,"Mystery, Thriller & Suspense",329.0,1,1,4.873136
72331,B013WZRWMS,Ships and Stings and Wedding Rings (The Chron...,Jodi Taylor,Science Fiction & Fantasy,51.0,1,0,4.870298
6096,B006BG7Q1I,Love Intertwined Vol. 1 - Kindle edition,Pepper Pace,Literature & Fiction,222.0,1,1,4.869776
18343,B00BR70E2E,Captain Lacey Regency Mysteries Volume Two - K...,Ashley Gardner,"Mystery, Thriller & Suspense",899.0,1,1,4.864052
14523,B00AMNT24K,Princess Ahira,K. M. Shea,Children's eBooks,262.0,1,1,4.86054
71364,B012WCOYQ4,The Rise of Ren Crown eBook,Anne Zoelle,Children's eBooks,449.0,1,1,4.852885
75204,B015WXSNFI,Holy Spy: A John Shakespeare Mystery - Kindle ...,Rory Clements,"Mystery, Thriller & Suspense",485.0,1,0,4.852798
58332,B00SEKT3M4,The Player (Rockliffe Book 3) - Kindle edition,Stella Riley,Literature & Fiction,327.0,1,1,4.849859
13800,B00AEEBDRG,Expectations - Kindle edition,Frances Murray,Literature & Fiction,369.0,1,1,4.820101


In [49]:
user_recommend_books()

UserId: A2VXSQHJWZAQGY
How many recommendations? 10


Unnamed: 0,asin,title,brand,genre,print_length,word_wise,lending,est_rating
45991,B00M64QIWW,John &amp; Jackie - Kindle edition,TJ Klune,Literature & Fiction,95.0,1,1,4.112198
50386,B00O0AG24Y,Inspector Hobbes and the Gold Diggers: Comedy ...,Wilkie Martin,"Mystery, Thriller & Suspense",329.0,1,1,4.083174
18322,B00BR70E2E,Captain Lacey Regency Mysteries Volume Two - K...,Ashley Gardner,"Mystery, Thriller & Suspense",899.0,1,1,4.052512
6102,B006BG7Q1I,Love Intertwined Vol. 1 - Kindle edition,Pepper Pace,Literature & Fiction,222.0,1,1,4.044863
75250,B015WXSNFI,Holy Spy: A John Shakespeare Mystery - Kindle ...,Rory Clements,"Mystery, Thriller & Suspense",485.0,1,0,4.04091
14514,B00AMNT24K,Princess Ahira,K. M. Shea,Children's eBooks,262.0,1,1,4.039301
72363,B013WZRWMS,Ships and Stings and Wedding Rings (The Chron...,Jodi Taylor,Science Fiction & Fantasy,51.0,1,0,4.035321
71392,B012WCOYQ4,The Rise of Ren Crown eBook,Anne Zoelle,Children's eBooks,449.0,1,1,4.034815
2980,B0053YAZ58,Hornblower Addendum - Five Stories (Hornblower...,C. S. Forester,Literature & Fiction,79.0,1,1,4.031554
13790,B00AEEBDRG,Expectations - Kindle edition,Frances Murray,Literature & Fiction,369.0,1,1,4.024689
