In [1]:
!pip install numpy==1.23.5 -q
!pip uninstall scikit-surprise -y -q
!pip install scikit-surprise -q

import pandas as pd
from surprise import SVD, Dataset, Reader, BaselineOnly
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from collections import defaultdict
import numpy as np # Ensure numpy is imported

## loading the dataset
you can get the latest data from [keggle](https://www.kaggle.com/datasets/zygmunt/goodbooks-10k?select=books.csv)

In [3]:
# Load the datasets
try:
    ratings = pd.read_csv('ratings.csv')
    books   = pd.read_csv('books.csv')
    print("Datasets loaded successfully.")
except FileNotFoundError:
    print("Error: 'ratings.csv' or 'books.csv' not found. Please upload them.")

Datasets loaded successfully.


In [5]:
# Merge ratings with book titles for easier interpretation later
data = pd.merge(ratings, books[['book_id', 'title']], on='book_id', how='left')
# print("Merged Data Head:")
# print(data.head())

## Basic statstic and filtering

In [7]:
print("Initial Stats:")
print(f"Total ratings: {data.shape[0]}")
print(f"Unique users: {data['user_id'].nunique()}")
print(f"Unique books: {data['book_id'].nunique()}")

# Filtering thresholds
min_user_ratings = 10
min_book_ratings = 5

# Filter users
user_counts = data['user_id'].value_counts()
data = data[data['user_id'].isin(user_counts[user_counts >= min_user_ratings].index)]

# Filter books
book_counts = data['book_id'].value_counts()
data = data[data['book_id'].isin(book_counts[book_counts >= min_book_ratings].index)]

print("\nStats After Filtering:")
print(f"Total ratings: {data.shape[0]}")
print(f"Unique users: {data['user_id'].nunique()}")
print(f"Unique books: {data['book_id'].nunique()}")

Initial Stats:
Total ratings: 981756
Unique users: 53424
Unique books: 10000

Stats After Filtering:
Total ratings: 857533
Unique users: 24405
Unique books: 9998


In [7]:
# filtering users and books with less than a set threshold of interactions
min_user_ratings = 10
min_book_ratings = 5

user_counts = data['user_id'].value_counts()
book_counts = data['book_id'].value_counts()

data = data[data['user_id'].isin(user_counts[user_counts >= min_user_ratings].index)]
data = data[data['book_id'].isin(book_counts[book_counts >= min_book_ratings].index)]

## preparing data for surprise

In [8]:
reader = Reader(rating_scale=(1, 5))

surprise_data = Dataset.load_from_df(data[['user_id', 'book_id', 'rating']], reader)
print("Data prepared for Surprise library.")

Data prepared for Surprise library.


## Splitting

In [10]:
trainset, testset = surprise_train_test_split(surprise_data, test_size=0.2, random_state=42)
print(f"Trainset size: {trainset.n_ratings}")
print(f"Testset size: {len(testset)}")

Trainset size: 686026
Testset size: 171507


## hyperparameter tuning using grid search

In [12]:
param_grid_svd = {
    'n_factors': [50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}

# Set up GridSearchCV
gs_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse', 'mae'], cv=3, n_jobs=-1, joblib_verbose=2)

In [13]:
gs_svd.fit(surprise_data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  6.7min finished


## Print the best score and parameters

In [14]:
print(f"\nBest SVD RMSE score (on CV): {gs_svd.best_score['rmse']:.4f}")
print(f"Best SVD MAE score (on CV): {gs_svd.best_score['mae']:.4f}")
print("Best SVD parameters found:", gs_svd.best_params['rmse'])


Best SVD RMSE score (on CV): 0.8250
Best SVD MAE score (on CV): 0.6426
Best SVD parameters found: {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


In [15]:
# Get the best SVD estimator model found by GridSearchCV
best_svd_model = gs_svd.best_estimator['rmse']

## train and evaluate the best svd model

In [16]:
# Train the best SVD model
best_svd_model.fit(trainset)

# Evaluate the best SVD model
svd_predictions = best_svd_model.test(testset)

svd_rmse = accuracy.rmse(svd_predictions)
svd_mae = accuracy.mae(svd_predictions)
print(f"Best SVD Test RMSE: {svd_rmse:.4f}")
print(f"Best SVD Test MAE: {svd_mae:.4f}")

RMSE: 0.8181
MAE:  0.6358
Best SVD Test RMSE: 0.8181
Best SVD Test MAE: 0.6358


## train and evaluate baseline model

In [17]:
bsl_options = {'method': 'als', 'n_epochs': 10} # Using ALS optimization

baseline_model = BaselineOnly(bsl_options=bsl_options)
baseline_model.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7e8a6e449190>

In [18]:
baseline_predictions = baseline_model.test(testset)

baseline_rmse = accuracy.rmse(baseline_predictions)
baseline_mae = accuracy.mae(baseline_predictions)
print(f"Baseline Test RMSE: {baseline_rmse:.4f}")
print(f"Baseline Test MAE: {baseline_mae:.4f}")

RMSE: 0.8377
MAE:  0.6615
Baseline Test RMSE: 0.8377
Baseline Test MAE: 0.6615


In [19]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1.0

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1.0

    # Average precision and recall over all users
    average_precision = sum(prec for prec in precisions.values()) / len(precisions)
    average_recall = sum(rec for rec in recalls.values()) / len(recalls)

    return average_precision, average_recall

## ranking matrices for best svd model

In [20]:
k_value = 10
relevance_threshold = 3.5

print(f"\nCalculating Precision and Recall @ k={k_value} for Best SVD Model (threshold={relevance_threshold})...")
svd_avg_precision, svd_avg_recall = precision_recall_at_k(svd_predictions, k=k_value, threshold=relevance_threshold)

print(f"Average Precision@{k_value}: {svd_avg_precision:.4f}")
print(f"Average Recall@{k_value}: {svd_avg_recall:.4f}")


Calculating Precision and Recall @ k=10 for Best SVD Model (threshold=3.5)...
Average Precision@10: 0.7679
Average Recall@10: 0.7928


## top n recommendations

In [21]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True) # Sort by estimated rating
        top_n[uid] = user_ratings[:n]

    return top_n