# Collaborative Filtering-Based Recommender System Using Surprise


Import Libraries

In [22]:
# ------------- General ------------- #
import pandas as pd

# ------------- Collaborative Filtering ------------- #
from surprise import KNNBasic, NMF
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

Load Data

In [23]:
rating_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-ML0321EN-Coursera/labs/v2/module_3/ratings.csv"
rating_df = pd.read_csv(rating_url)

In [24]:
rating_df.head()

Unnamed: 0,user,item,rating
0,1889878,CC0101EN,5
1,1342067,CL0101EN,3
2,1990814,ML0120ENv3,5
3,380098,BD0211EN,5
4,779563,DS0101EN,3


Transform to Sparse Dataset

In [25]:
rating_sparse_df = rating_df.pivot(index='user', columns='item', values='rating').fillna(0).reset_index().rename_axis(index=None, columns=None)
rating_sparse_df.head()

Unnamed: 0,user,AI0111EN,BC0101EN,BC0201EN,BC0202EN,BD0101EN,BD0111EN,BD0115EN,BD0121EN,BD0123EN,...,SW0201EN,TA0105,TA0105EN,TA0106EN,TMP0101EN,TMP0105EN,TMP0106,TMP107,WA0101EN,WA0103EN
0,2,0.0,4.0,0.0,0.0,5.0,4.0,0.0,5.0,3.0,...,0.0,5.0,0.0,4.0,0.0,3.0,3.0,0.0,5.0,0.0
1,4,0.0,0.0,0.0,0.0,5.0,3.0,4.0,5.0,3.0,...,0.0,4.0,0.0,0.0,0.0,3.0,3.0,0.0,3.0,3.0
2,5,3.0,5.0,5.0,0.0,4.0,0.0,0.0,0.0,3.0,...,0.0,0.0,4.0,4.0,4.0,4.0,4.0,5.0,0.0,3.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# save ratings
rating_df.to_csv("course_ratings.csv", index=False)

# read course ratings
reader = Reader(
    line_format='user item rating', sep=',', skip_lines=1, rating_scale=(2, 3))

# load from csv course_ratings
course_dataset = Dataset.load_from_file("course_ratings.csv", reader=reader)

Splitting train and test sets

In [27]:
trainset, testset = train_test_split(course_dataset, test_size=.3)

In [28]:
print(f"Total {trainset.n_users} users and {trainset.n_items} items in the training set")

Total 31275 users and 122 items in the training set


## KNN-based Collaborative Filtering


User Based KNN

In [37]:
# param grid
param_grid_knn_user = {
    # similarity options
    'sim_options': {
        'name': ['msd'],  # similarity metric
        'user_based': [True]  
    },
    'k': [10],  # neighbour 
}

# grid search
grid_search_knn_user = GridSearchCV(
    KNNBasic, 
    param_grid_knn_user,
    measures=['rmse'],  
    cv=3,  
    n_jobs=1,  
)

# fit grid
grid_search_knn_user.fit(course_dataset)

# results
print("Best RMSE Score:", grid_search_knn_user.best_score['rmse'])
print("Best Parameters:", grid_search_knn_user.best_params['rmse'])

# create a new model with the best parameters
best_params = grid_search_knn_user.best_params['rmse']
best_model_knn_user = KNNBasic(**best_params)

# fit on training set
best_model_knn_user.fit(trainset)

# predict on test set
predictions_knn_user = best_model_knn_user.test(testset)

# rmse
rmse_knn_user = accuracy.rmse(predictions_knn_user)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Best RMSE Score: 1.289513370864603
Best Parameters: {'sim_options': {'name': 'msd', 'user_based': True}, 'k': 10}
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2866


Item Based KNN

In [38]:
# param grid
param_grid_knn_item = {
    # similarity options
    'sim_options': {
        'name': ['msd'],  
        'user_based': [False]  
    },
    'k': [10],  
}

# grid search
grid_search_knn_item = GridSearchCV(
    KNNBasic, 
    param_grid_knn_item,
    measures=['rmse'],  
    cv=3,  
    n_jobs=1,  
)

# fit grid
grid_search_knn_item.fit(course_dataset)

# results
print("Best RMSE Score:", grid_search_knn_item.best_score['rmse'])
print("Best Parameters:", grid_search_knn_item.best_params['rmse'])

# create a new model with the best parameters
best_params = grid_search_knn_item.best_params['rmse']
best_model_knn_item = KNNBasic(**best_params)

# fit on training set
best_model_knn_item.fit(trainset)

# predict on test set
predictions_knn_item = best_model_knn_item.test(testset)

# rmse
rmse_knn_item = accuracy.rmse(predictions_knn_item)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Best RMSE Score: 1.289513128591097
Best Parameters: {'sim_options': {'name': 'msd', 'user_based': False}, 'k': 10}
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2866


With KNN, it seems like a user-based collaborative filtering system is the most effective.

#### Non-Negative Matrix Factorisation (NMF) Based Collaborative Filtering

In [39]:
# param grid for NMF
param_grid_nmf = {
    'n_factors': [35, 40],  # factors
    'n_epochs': [50],  # iterations
    'reg_pu': [0.1],  # regularisation users
    'reg_qi': [0.1],  # regularisation items
    'random_state': [123],
    'init_low': [0.5],
    'init_high': [5.0]
}

# grid search
grid_search_nmf = GridSearchCV(
    NMF,
    param_grid_nmf,
    measures=['rmse'],
    cv=3,
    n_jobs=2,
)

# fit grid search
grid_search_nmf.fit(course_dataset)

# results
print("\nBest RMSE Score:", grid_search_nmf.best_score['rmse'])
print("Best Parameters:", grid_search_nmf.best_params['rmse'])

# best model nmf
best_model_nmf = grid_search_nmf.best_estimator['rmse']

# fit best model
best_model_nmf.fit(trainset)

# predictions
predictions_nmf = best_model_nmf.test(testset)

# rmse on test set
rmse_nmf = accuracy.rmse(predictions_nmf)


Best RMSE Score: 1.2901388595105105
Best Parameters: {'n_factors': 40, 'n_epochs': 50, 'reg_pu': 0.1, 'reg_qi': 0.1, 'random_state': 123, 'init_low': 0.5, 'init_high': 5.0}
RMSE: 1.2871


Both methods returned similar RMSE scores on the test set, suggesting they are both equally good models to implement collaborative filtering.