In [1]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV, train_test_split
from tabulate import tabulate

# Read and load data

In [2]:
# Load the two rating datasets into pandas dataframes.
# u.data (MovieLens 100k) is tab-separated and has no header row, so the
# column names are supplied explicitly; the PDA2018 file ships its own header.
movielens_df = pd.read_csv(
    "../data/u.data",
    sep="\t",
    header=None,
    names=["userID", "itemID", "rating", "timestamp"],
)
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")

# Preview the first rows of each frame, separated by a few blank lines
for preview in (movielens_df.head(), "\n\n", pda_df.head()):
    print(preview)

   userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



   userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919


In [3]:
# Wrap both dataframes in Surprise datasets.
# Ratings range from 1 to 5, which the reader needs to know up front.
reader = Reader(rating_scale=(1, 5))

# Surprise expects exactly three columns in the order user, item, rating,
# so only the first three columns of each frame are passed on.
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:, :3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:, :3], reader)

# Full trainsets are built here only to report summary statistics
mls_train = movielens_dataset.build_full_trainset()
pda_train = pda_dataset.build_full_trainset()

# One row per statistic: (label, MovieLens value, PDA value)
dataset_summary = [
    ("1) Number of items in each dataset", mls_train.n_items, pda_train.n_items),
    ("2) Number of users in each dataset", mls_train.n_users, pda_train.n_users),
    ("3) Number of ratings in each dataset", mls_train.n_ratings, pda_train.n_ratings),
    ("4) Mean rating", mls_train.global_mean, pda_train.global_mean),
]

print("General information on the training sets we will be using \n")
for label, ml_value, pda_value in dataset_summary:
    print(label, " ML100k:", ml_value, "PDA:", pda_value)

General information on the training sets we will be using 

1) Number of items in each dataset  ML100k: 1682 PDA: 1824
2) Number of users in each dataset  ML100k: 943 PDA: 5690
3) Number of ratings in each dataset  ML100k: 100000 PDA: 470711
4) Mean rating  ML100k: 3.52986 PDA: 3.638361967321775


# SVD Matrix Factorization 

In [4]:
# Accumulates one [name, RMSE, MAE] row per cross-validated recommender
results_table = list()

# Hyper-parameter candidates explored by the grid search
# (every combination of the values below is evaluated)
param_grid = dict(
    n_epochs=[5, 20],
    lr_all=[0.001, 0.05],
    reg_all=[0.01, 0.5],
)

In [5]:
# Run grid search in order to check for parameters that will allow for optimal RMSE and MAE

## RUN 1: MOVIELENS
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5) # 5-fold CV
print("Running grid search for SVD")
gs.fit(movielens_dataset)
# Record the best scores with three decimals. The format string must not
# carry a trailing period: '{:.3f}.' produced values such as "0.963." which
# show up in the results table and cannot be parsed as numbers from the CSV.
results_table.append(["ML100-SVD", '{:.3f}'.format(gs.best_score["rmse"]), '{:.3f}'.format(gs.best_score["mae"])])
# Print out the best RMSE, MAE and the respective model parameters
# (best_score and best_params are keyed by the measure name)
print("\nResults:")
print("Best RMSE:", gs.best_score["rmse"])
print("Best params for RMSE", gs.best_params["rmse"])
print("Best MAE:", gs.best_score["mae"])
print("Best params for MAE", gs.best_params["mae"])
print("\n")

Running grid search for SVD

Results:
Best RMSE: 0.9627817210760583
Best params for RMSE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}
Best MAE: 0.7645289775914335
Best params for MAE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}




In [6]:
## RUN 2: PDA2018
# Same grid search as for MovieLens, this time on the PDA2018 ratings.
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5) # 5-fold CV
print("Running grid search for SVD")
gs.fit(pda_dataset)
# Record the best scores with three decimals. The format string must not
# carry a trailing period: '{:.3f}.' produced values such as "0.912." which
# show up in the results table and cannot be parsed as numbers from the CSV.
results_table.append(["PDA-SVD", '{:.3f}'.format(gs.best_score["rmse"]), '{:.3f}'.format(gs.best_score["mae"])])
# Print out the best RMSE, MAE and the respective model parameters
# (best_score and best_params are keyed by the measure name)
print("\nResults:")
print("Best RMSE:", gs.best_score["rmse"])
print("Best params for RMSE", gs.best_params["rmse"])
print("Best MAE:", gs.best_score["mae"])
print("Best params for MAE", gs.best_params["mae"])
print("\n")

Running grid search for SVD

Results:
Best RMSE: 0.9120663745394613
Best params for RMSE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}
Best MAE: 0.7204927535006436
Best params for MAE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}




In [7]:
# Render the collected scores as a pipe-delimited (Markdown-style) table
results_table_headers = ["Recommender", "RMSE", "MAE"]
rendered_table = tabulate(results_table, headers=results_table_headers, tablefmt="pipe")
print(rendered_table)

| Recommender   | RMSE   | MAE    |
|:--------------|:-------|:-------|
| ML100-SVD     | 0.963. | 0.765. |
| PDA-SVD       | 0.912. | 0.720. |


In [8]:
# Persist the collected scores as a CSV file next to the input data
results_columns = ["Recommender", "RMSE", "MAE"]
results_df = pd.DataFrame(data=results_table, columns=results_columns)
results_df.to_csv(path_or_buf="../data/svd_algorithm_results.csv")