In [1]:
import pandas as pd
import numpy as np
from surprise import SVD, SVDpp, KNNBasic, BaselineOnly, NormalPredictor
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV, KFold, cross_validate, train_test_split
from tabulate import tabulate

# Read and load data

In [2]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
pda_df = pd.read_csv("data/train-PDA2018.csv", sep=",")
print(movielens_df.head())
print("\n\n")
print(pda_df.head())

   userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



   userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919


In [3]:
# Create the training datasets using Surprise's reader class
reader = Reader(rating_scale=(1,5)) # We have ratings from 1 to 5 so we create the rating scale

# Load the data from the dataframes
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:,0:3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:,0:3], reader)

# Build train-test sets from the data loaded above
mls_train, mls_test = train_test_split(movielens_dataset, test_size=.20, random_state=0)
pda_train, pda_test = train_test_split(pda_dataset, test_size=.20, random_state=0)

# Print out some basic information about the datasets
print("General information on the training sets we will be using \n")
print("1) Number of items in each dataset", " ML100k:", mls_train.n_items, "PDA:", pda_train.n_items)
print("2) Number of users in each dataset", " ML100k:", mls_train.n_users, "PDA:", pda_train.n_users)
print("3) Number of ratings in each dataset", " ML100k:", mls_train.n_ratings, "PDA:", pda_train.n_ratings)
print("4) Mean rating", " ML100k:", mls_train.global_mean, "PDA:", pda_train.global_mean)

General information on the training sets we will be using 

1) Number of items in each dataset  ML100k: 1651 PDA: 1819
2) Number of users in each dataset  ML100k: 943 PDA: 5684
3) Number of ratings in each dataset  ML100k: 80000 PDA: 376568
4) Mean rating  ML100k: 3.5290875 PDA: 3.637212933653417


# Grid Search and 5-fold cross validation

In [4]:
# variable declarations: the 5 fold split and a list that will contain the results of the cross validation procedure
results_table = []
best_params = {}
kf = KFold(n_splits=5)

## Random predictor

In [5]:
print("Running grid search for the Random (NormalPredictor) algorithm...")
out = cross_validate(NormalPredictor(), movielens_dataset, ["rmse", "mae"], kf)
mean_rmse = '{:.3f}.'.format(np.mean(out['test_rmse']))
mean_mae = '{:.3f}.'.format(np.mean(out['test_mae']))
new_line = ["Random", mean_rmse, mean_mae]
results_table.append(new_line)

Running grid search for the Random (NormalPredictor) algorithm...


## Baseline

In [6]:
# Run grid search in order to check for parameters that will allow for optimal RMSE and MAE
# Define the parameters and their respective ranges
param_grid = {
    "bsl_options": {
        "method": ["als", "sgd"],
        "reg": [0.01, 0.5], 
        "n_epochs": [5,20]
    }
}

# Run grid search for the specified algorithm and the parameter grid
gs = GridSearchCV(BaselineOnly, param_grid, measures=["rmse", "mae"], cv=5) # 5-fold CV
print("Running grid search for the BaselineOnly algorithm...")
# Fit the data to the model using the parameters of the grid search
gs.fit(movielens_dataset)
# Save the best parameters of the models and the best scores
best_params["BaselineRMSE"] = gs.best_params["rmse"]
best_params["BaselineMAE"] = gs.best_params["mae"]
results_table.append(["Baseline", '{:.3f}.'.format(gs.best_score["rmse"]), '{:.3f}.'.format(gs.best_score["mae"])])
# Print out the the best RMSE, MAE and the respective model parameters
print("\nResults:")
print("Best RMSE:", gs.best_score["rmse"])
print("Best params for RMSE", gs.best_params["rmse"])
print("Best MAE:", gs.best_score["mae"])
print("Best params for MAE", gs.best_params["mae"])
print("\n")

Running grid search for the BaselineOnly algorithm...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biase

## KNN CF algorithms

In [7]:
# Define a dict of recommender algorithms we wish to do the grid search for
ub_options = {'name': 'cosine', 'user_based': True}
ib_options = {'name': 'cosine', 'user_based': False}
knn_options = {"ubKNN": ub_options , "ibKNN": ib_options}
knn_algorithms = {"ubKNN": KNNBasic(sim_options=ub_options) , "ibKNN": KNNBasic(sim_options=ib_options)}

param_grid = {
    "k": [5,50],
    "min_k": [1, 5],
}

gs = GridSearchCV(KNNBasic, param_grid, measures=["rmse", "mae"], cv=5) # 5-fold CV
print("Running grid search for KNN")
gs.fit(movielens_dataset)
# Save the best parameters of the models and the best scores
best_params["knnRMSE"] = gs.best_params["rmse"]
best_params["knnMAE"] = gs.best_params["mae"]
# Print out the the best RMSE, MAE and the respective model parameters
print("\nResults:")
print("Best RMSE:", gs.best_score["rmse"])
print("Best params for RMSE", gs.best_params["rmse"])
print("Best MAE:", gs.best_score["mae"])
print("Best params for MAE", gs.best_params["mae"])
print("\n")

Running grid search for KNN
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd simil

In [8]:
# Run 5 fold cross validation to see how the user-based and item-based KNN cf algorithms will perform
# We need to do this because we can't separate the two algorithms when using the GridSearchCv function
# Hence we run the cross validation for both algorithms manually and then save the respective results
knn_titles = ("ubKNN", "ibKNN")
ub_options = {'name': 'cosine', 'user_based': True}
ib_options = {'name': 'cosine', 'user_based': False}
knn_sim_options = {"ubKNN": ub_options , "ibKNN": ib_options}

fold_n = 0
# Run 5 fold cross validation
for title in knn_titles:    
    out = cross_validate(KNNBasic(k=50, min_k=1, sim_options=knn_sim_options[title]), movielens_dataset, ["rmse", "mae"], kf)
    mean_rmse = '{:.3f}.'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}.'.format(np.mean(out['test_mae']))
    new_line = [title, mean_rmse, mean_mae]
    results_table.append(new_line)
    fold_n += 1

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


## SVD

In [11]:
# Run grid search in order to check for parameters that will allow for optimal RMSE and MAE
# Define the parameters and their respective ranges
param_grid = {
    "n_epochs": [5,20],
    "lr_all": [0.001, 0.05],
    "reg_all": [0.01, 0.5]
}

gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5) # 5-fold CV
print("Running grid search for SVD")
gs.fit(movielens_dataset)
# Save the best parameters of the models and the best scores
best_params["svdRMSE"] = gs.best_params["rmse"]
best_params["svdMAE"] = gs.best_params["mae"]
results_table.append(["SVD", '{:.3f}.'.format(gs.best_score["rmse"]), '{:.3f}.'.format(gs.best_score["mae"])])
# Print out the the best RMSE, MAE and the respective model parameters
print("\nResults:")
print("Best RMSE:", gs.best_score["rmse"])
print("Best params for RMSE", gs.best_params["rmse"])
print("Best MAE:", gs.best_score["mae"])
print("Best params for MAE", gs.best_params["mae"])
print("\n")

Running grid search for SVD

Results:
Best RMSE: 0.9637806074745058
Best params for RMSE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}
Best MAE: 0.7650417309708972
Best params for MAE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}




In [13]:
results_table_headers = ["Recommender", "RMSE", "MAE"]
print(tabulate(results_table, results_table_headers, tablefmt="pipe"))

| Recommender   | RMSE   | MAE    |
|:--------------|:-------|:-------|
| Random        | 1.520. | 1.221. |
| Baseline      | 0.941. | 0.743. |
| ubKNN         | 1.017. | 0.804. |
| ibKNN         | 1.023. | 0.809. |
| SVD           | 0.964. | 0.765. |
