In [1]:
# Import Necessary Libraries

import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
#from surprise import SVD
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV


In [2]:
# Import and Preview 'clean_df.csv'

df = pd.read_csv('cleaned_df.csv')
df.head()

Unnamed: 0,user_id,book_id,rating,authors,title,language_code,average_rating,tag
0,1,258,5,"Carlos Ruiz Zafón, Lucia Graves",The Shadow of the Wind (The Cemetery of Forgot...,eng,4.24,wish-list
1,2,4081,4,Tom Wolfe,I am Charlotte Simmons,en-US,3.4,young-adult
2,2,260,5,Dale Carnegie,How to Win Friends and Influence People,eng,4.13,تنمية-بشرية
3,2,9296,5,"Alice Miller, Ruth Ward",The Drama of the Gifted Child: The Search for ...,en-GB,4.09,wish-list
4,2,2318,3,"Thomas J. Stanley, William D. Danko",The Millionaire Next Door: The Surprising Secr...,eng,4.0,wish-list


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5976479 entries, 0 to 5976478
Data columns (total 8 columns):
user_id           int64
book_id           int64
rating            int64
authors           object
title             object
language_code     object
average_rating    float64
tag               object
dtypes: float64(1), int64(3), object(4)
memory usage: 364.8+ MB


# Collaborative Methods

In [4]:
# Collaborative Methods only utilize three features (user id, item id and rating) so we will only load in
# 'user_id', 'book_id' and 'rating' into the dataset
# Read in values as Surprise Dataset

reader = Reader()
data = Dataset.load_from_df(df[['user_id','book_id','rating']],reader)

In [5]:
# Split data into training and testing sets

train, test = train_test_split(data, test_size = .2)

In [6]:
# View the size of the test set and what it contains

print(len(test))
print(test[0]) # Format is (user_id, book_id, rating)

1195296
(7383, 30, 5.0)


## Memory-Based/Neighborhood-Based Collaborative Filtering

### Similarity Metrics

In [7]:
# When modeling we need to use either user-user similarity or item-tem similarity
# Since there are less items than there are users we will use item-item similarity to save computation time

print('Number of users: ',train.n_users)
print('Number of items: ',train.n_items)

Number of users:  53424
Number of items:  10000


In [8]:
# Create parameters for models with different similarity metrics, cosine and pearson
# Since we are using item-item similarity, 'user_based' is set to False

cosine_sim = {'name':'cosine','user_based':False}
pearson_sim = {'name':'pearson','user_based':False}

In [9]:
# Create function to run models

def run_model(model_type, sim_metric):
    model = model_type(sim_options=sim_metric)
    model.fit(train)
    model.summary()
    predictions = model.test(test)
    accuracy.rmse(predictions)

### Model: Basic KNN

#### Basic KNN with Cosine Similarity

In [11]:
# Run the Basic KNN model with Cosine Similarity

run_model(knns.KNNBasic, cosine_sim)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8820


#### KNN Basic with Pearson Similarity

In [12]:
# Run the Basic KNN model with Pearson Similarity

run_model(knns.KNNBasic, pearson_sim)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8723


### Model: KNN with Means

#### KNN with Means with Cosine Similarity

In [13]:
# Run the KNN with Means model with Cosine Similarity

run_model(knns.KNNWithMeans, cosine_sim)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8480


#### KNN with Means with Pearson Similarity

In [14]:
# Run the KNN with Means model with Pearson Similarity

run_model(knns.KNNWithMeans, pearson_sim)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8404


### Model: KNN Baseline

#### KNN Baseline with Cosine Similarity

In [15]:
# Run the KNN Baseline model with Cosine Similarity

run_model(knns.KNNBaseline, cosine_sim)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8442


#### KNN Baseline with Pearson Similarity

In [16]:
# Run the KNN Baseline model with Cosine Similarity

run_model(knns.KNNBaseline, pearson_sim)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8369


## Model-Based (Matrix Factorization) Collaborative Filtering

### Model: Singular Value Decomposition (SVD)

In [17]:
# Run a baseline SVD Model

svd = SVD()
svd.fit(train)
predictions = svd.test(test)
accuracy.rmse(predictions)

RMSE: 0.8301


0.8301182069151432

In [18]:
#making a prediction for user 34 and item 25 
user_34_prediction = svd.predict('34','25')
user_34_prediction

Prediction(uid='34', iid='25', r_ui=None, est=3.9193218498434383, details={'was_impossible': False})

In [19]:
user_34_prediction[3]

3.9193218498434383

In [20]:
# View an example prediction

uid = str(196)  # user_id 196
iid = str(302)  # book_id 302

# Predict how user 196 (user_id=196) would rate book 302 (book_id=302)

pred = svd.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 3.92   {'was_impossible': False}
