In [1]:
import pandas as pd
import numpy as np
import time
from surprise import Dataset, Reader
from surprise import SVD,KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

##  Load and Preprocess the Ratings Data

In [2]:
df_ratings = pd.read_csv("ratings.csv")

In [3]:
df_ratings

Unnamed: 0.1,Unnamed: 0,timestamp,user_id,movie_id,rating
0,0,2025-01-02T02:32:57,102833,the+thin+red+line+1998,5
1,1,2025-01-02T22:28:58,58490,life+of+brian+1979,4
2,2,2025-01-02T22:30:05,24529,jurassic+park+1993,5
3,3,2025-01-03T02:17:50,54544,the+spy+who+loved+me+1977,4
4,4,2025-01-04T08:10:32,54544,goodfellas+1990,4
...,...,...,...,...,...
9996,9996,2025-02-02T05:28:30,32356,as+good+as+it+gets+1997,3
9997,9997,2025-02-02T13:42:14,5205,miracle+on+34th+street+1994,1
9998,9998,2025-02-02T15:26:08,185,the+silence+of+the+lambs+1991,4
9999,9999,2025-02-02T18:32:19,16862,dances+with+wolves+1990,4


In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['user_id', 'movie_id', 'rating']], reader)
train,test = train_test_split(data,test_size=0.2,shuffle=True)

In [5]:
train_df = pd.DataFrame(train.all_ratings(), columns=["user_id", "movie_id", "rating"])
train = Dataset.load_from_df(train_df, reader)

## Train an SVD and KNNBasic Model

In [7]:
param_grid_knn = {
    'k': [20, 30, 40],  # Number of neighbors
    'sim_options': {
        'name': ['cosine', 'pearson'],  # Similarity metrics
        'user_based': [True, False]  # User-based or item-based filtering
    }
}

param_grid_svd = {
    'n_factors': [50, 100, 150],   # Number of latent factors
    'n_epochs': [10, 20, 30],      # Number of iterations
    'lr_all': [0.002, 0.005, 0.01],# Learning rate
    'reg_all': [0.02, 0.05, 0.1]   # Regularization parameter
}



In [8]:
knnbasic = GridSearchCV(KNNBasic, param_grid_knn, measures=['rmse', 'mae'], cv=4, n_jobs=-1)
knnbasic.fit(train)

Computing the cosine similarity matrix...Computing the cosine similarity matrix...

Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similari

In [9]:
svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse','mae'], cv=4, n_jobs=-1)
svd.fit(train)

In [10]:
# Best model & parameters
best_params_knn = knnbasic.best_params['rmse']
print("Best Parameters:", best_params_knn)

Best Parameters: {'k': 20, 'sim_options': {'name': 'pearson', 'user_based': True}}


In [11]:
# Best model & parameters
best_params_svd = svd.best_params['rmse']
print("Best Parameters:", best_params_svd)

Best Parameters: {'n_factors': 50, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.1}


In [12]:
# Train final model with best parameters
best_svd = SVD(**best_params_svd)
train = train.build_full_trainset()
start_time = time.perf_counter()
best_svd.fit(train)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed Train time for SVD: {elapsed_time} seconds")

Elapsed Train time for SVD: 0.03667660500001091 seconds


In [13]:
# Train final model with best parameters
best_knnbasic = KNNBasic(**best_params_knn)
#train = train.build_full_trainset()
start_time = time.perf_counter()
best_knnbasic.fit(train)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed Train time for KNNBasic: {elapsed_time} seconds")

Computing the pearson similarity matrix...
Done computing similarity matrix.
Elapsed Train time for KNNBasic: 0.040104511999999204 seconds


In [15]:
print('SVD Metrics:')
start_time = time.perf_counter()
preds = best_svd.test(test)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
accuracy.rmse(preds)
accuracy.mae(preds)
print(f"Elapsed Inference time for SVD: {elapsed_time} seconds")

SVD Metrics:
RMSE: 1.0957
MAE:  0.9109
Elapsed Inference time for SVD: 0.06594786999998803 seconds


In [16]:
print('KNNBasic Metrics:')
start_time = time.perf_counter()
preds = best_knnbasic.test(test)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
accuracy.rmse(preds)
accuracy.mae(preds)
print(f"Elapsed Inference time for KNNBasic: {elapsed_time} seconds")

KNNBasic Metrics:
RMSE: 1.0959
MAE:  0.9113
Elapsed Inference time for KNNBasic: 0.022046977000002244 seconds


In [17]:
# Save the optimized model
import joblib
joblib.dump(best_params_svd, "cf_svd_model.pkl")

['cf_svd_model.pkl']

In [18]:
import joblib
joblib.dump(best_params_knn, "cf_knnbasic_model.pkl")

['cf_knnbasic_model.pkl']