## Import Module

In [18]:
import pandas as pd
from surprise import Reader

## Examine Dataset

In [19]:
# read dataset
book_ratings = pd.read_csv('goodreads_ratings.csv')

# check first 5 rows
print(book_ratings.head())

                            user_id   book_id  \
0  d089c9b670c0b0b339353aebbace46a1   7686667   
1  6dcb2c16e12a41ae0c6c38e9d46f3292  18073066   
2  244e0ce681148a7586d7746676093ce9  13610986   
3  73fcc25ff29f8b73b3a7578aec846394  27274343   
4  f8880e158a163388a990b64fec7df300  11614718   

                          review_id  rating  \
0  3337e0e75701f7f682de11638ccdc60c       3   
1  7201aa3c1161f2bad81258b6d4686c16       5   
2  07a203f87bfe1b65ff58774667f6f80d       5   
3  8be2d87b07098c16f9742020ec459383       1   
4  a29c4ba03e33ad073a414ac775266c5f       4   

                                         review_text  \
0  Like Matched, this book felt like it was echoi...   
1  WOW again! 4,5 Stars \r\n So i wont forget to ...   
2  The second novel was hot & heavy. Not only in ...   
3  What a maddening waste of time. And I unfortun...   
4  4.5 stars! \r\n This was an awesome read! \r\n...   

                       date_added                    date_updated  \
0  Fri Apr 29 14

## Check Dataset Info

In [20]:
# check dataset info 
print(book_ratings.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       3500 non-null   object
 1   book_id       3500 non-null   int64 
 2   review_id     3500 non-null   object
 3   rating        3500 non-null   int64 
 4   review_text   3500 non-null   object
 5   date_added    3500 non-null   object
 6   date_updated  3500 non-null   object
 7   read_at       3167 non-null   object
 8   started_at    2395 non-null   object
 9   n_votes       3500 non-null   int64 
 10  n_comments    3500 non-null   int64 
dtypes: int64(4), object(7)
memory usage: 300.9+ KB
None


## Check Target Column (Rating)

In [21]:
# check distinct value and count
print(book_ratings['rating'].value_counts())

# filter ratings that are 0 
book_ratings = book_ratings[book_ratings['rating']!=0]

4    1278
5    1001
3     707
2     269
1     125
0     120
Name: rating, dtype: int64


## Load Dataset into Surprise

In [22]:
# load reader
from surprise import Reader
reader = Reader(rating_scale=(1, 5))

# load df
from surprise import Dataset
df = Dataset.load_from_df(book_ratings[['user_id', 'book_id', 'rating']], reader)

## Split into Train/Test 

In [23]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(df, test_size=.2) #80% Train, 20% Test

## Model Training

In [24]:
from surprise import KNNBasic

model = KNNBasic()
model.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x217c579ae80>

## Model Evaluation

In [25]:
from surprise import accuracy
predictions = model.test(testset)
accuracy.rmse(predictions)

# Model is not that good as the variation of prediction is around 1.07

RMSE: 1.0913


1.0913110633816907

## Check Prediction

In [26]:
print(model.predict('8842281e1d1347389f2ab93d60773d4d', '18007564').est)

#model predict that this user will give 3.82 rating to the book 18007564

3.8191568047337277
