In [9]:
import pandas as pd
import numpy as np
from surprise import SVD, SVDpp, KNNBasic, BaselineOnly, NormalPredictor
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV, KFold, cross_validate

# Read and load data

In [2]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
pda_df = pd.read_csv("train-PDA2018.csv", sep=",")
print(movielens_df.head())
print("\n\n")
print(pda_df.head())

   userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



   userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919


In [3]:
# Create the training datasets using Surprise's reader class
reader = Reader(rating_scale=(1,5)) # We have ratings from 1 to 5 so we create the rating scale

# Load the data from the dataframes
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:,0:3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:,0:3], reader)

# Build the training sets from the data loaded above. What we are doing here is using the whole dataset
# and the build_full_trainset method is essentially converting our data into userxitem matrices
movielens_trainset = movielens_dataset.build_full_trainset()
pda_trainset = pda_dataset.build_full_trainset()

# Print out some basic information about the datasets
print("1) Number of items in each dataset", " ML100k:", movielens_trainset.n_items, "PDA:", pda_trainset.n_items)
print("2) Number of users in each dataset", " ML100k:", movielens_trainset.n_users, "PDA:", pda_trainset.n_users)
print("3) Number of ratings in each dataset", " ML100k:", movielens_trainset.n_ratings, "PDA:", pda_trainset.n_ratings)
print("4) Mean rating", " ML100k:", movielens_trainset.global_mean, "PDA:", pda_trainset.global_mean)
print("4) Dataset rating", " ML100k:", movielens_trainset.global_mean, "PDA:", pda_trainset.global_mean)

1) Number of items in each dataset  ML100k: 1682 PDA: 1824
2) Number of users in each dataset  ML100k: 943 PDA: 5690
3) Number of ratings in each dataset  ML100k: 100000 PDA: 470711
4) Mean rating  ML100k: 3.52986 PDA: 3.638361967321775
4) Dataset rating  ML100k: 3.52986 PDA: 3.638361967321775


# Perform Grid Search

In [4]:
# Define a dict of recommender algorithms we wish to do the grid search for
algorithms = {"SVD": SVD, "KNN": KNNBasic}

# Run grid search in order to check for parameters that will allow for optimal RMSE and MAE
# Define the parameters and their respective ranges
param_grid = {
    "n_epochs": [5,20],
    "lr_all": [0.001, 0.05],
    "reg_all": [0.01, 0.5]
}

best_params = {}

for algo in algorithms:
    gs = GridSearchCV(algorithms[algo], param_grid, measures=["rmse", "mae"], cv=5)
    print("Running grid search for", algo)
    gs.fit(movielens_dataset)
    print("Results:")
    print("Best RMSE:", gs.best_score["rmse"])
    best_params[algo+"RMSE"] = gs.best_params["rmse"]
    print("Best params for RMSE", gs.best_params["rmse"])
    print("Best MAE:", gs.best_score["mae"])
    print("Best params for MAE", gs.best_params["mae"])
    best_params[algo+"MAE"] = gs.best_params["mae"]
    print("\n")

Running grid search for SVD
Results:
Best RMSE: 0.9638859056171045
Best params for RMSE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}
Best MAE: 0.7653808770918966
Best params for MAE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}


Running grid search for KNN
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
C

# Cross Validation

In [5]:
# variable declarations: the 5 fold split and a list that will contain the results of the cross validation procedure
kf = KFold(n_splits=5, random_state=0) # equal folds for all algorithms (hence random_state = 0)
table = []

## Basic algorithms

In [7]:
# Run 5 fold cross validation for the following recommender algorithms
basic_algos = (BaselineOnly, NormalPredictor)
basic_titles = ("Baseline", "Random")

fold_n = 0
# Run 5 fold cross validation
for algo in basic_algos:
    out = cross_validate(algo(), movielens_dataset, ["rmse", "mae"], kf)
    mean_rmse = '{:.3f}.'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}.'.format(np.mean(out['test_mae']))
    new_line = [basic_titles[fold_n], mean_rmse, mean_mae]
    table.append(new_line)
    fold_n += 1
    
print(table)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
[[<class 'surprise.prediction_algorithms.baseline_only.BaselineOnly'>, '0.944.', '0.748.'], [<class 'surprise.prediction_algorithms.random_pred.NormalPredictor'>, '1.522.', '1.221.']]


## KNN Collaborative Filtering

In [8]:
# Run 5 fold cross validation for the following recommender algorithms
knn_titles = ("ubKNN", "ibKNN")
ub_options = {'name': 'cosine', 'user_based': True}
ib_options = {'name': 'cosine', 'user_based': False}
knn_options = {"ubKNN": ub_options , "ibKNN": ib_options}

fold_n = 0
# Run 5 fold cross validation
for title in KNN_titles:    
    out = cross_validate(KNNBasic(sim_options=knn_options[title]), movielens_dataset, ["rmse", "mae"], kf)
    mean_rmse = '{:.3f}.'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}.'.format(np.mean(out['test_mae']))
    new_line = [title, mean_rmse, mean_mae]
    table.append(new_line)
    fold_n += 1
    
print(table)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
[[<class 'surprise.prediction_algorithms.baseline_only.BaselineOnly'>, '0.944.', '0.748.'], [<class 'surprise.prediction_algorithms.random_pred.NormalPredictor'>, '1.522.', '1.221.'], ['ubKNN', '1.017.', '0.804.'], ['ibKNN', '1.027.', '0.81

In [None]:
# Run 5 fold cross validation for the following recommender algorithms
svd_algos = (SVD, SVDpp)
svd_titles = ("SVD", "SVD++")
svd_options = {'name': 'cosine', 'user_based': True}

fold_n = 0
# Run 5 fold cross validation
for algo in svd_algos:    
    out = cross_validate(algo(n_epochs= 20, lr_all=0.001, reg_all=0.01), movielens_dataset, ["rmse", "mae"], kf)
    mean_rmse = '{:.3f}.'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}.'.format(np.mean(out['test_mae']))
    new_line = [svd_titles[fold_n], mean_rmse, mean_mae]
    table.append(new_line)
    fold_n += 1
    
print(table)