In [12]:
#import pandas as pd
import os
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

In [4]:
# specify where file is
filepath = os.path.expanduser('/Users/djdit/OneDrive - University of South Florida/Data Mining/Project/Movie-Recommendation-System/ml-latest-small/ratings.csv')

#Define reader (Dataset package requirement for reading  in file)
reader = Reader(line_format = 'user item rating timestamp', sep = ',', rating_scale=(1,5), skip_lines=1)

In [5]:
# Load in dataset object
data = Dataset.load_from_file(filepath, reader=reader )

SVD Algorithm:

https://surprise.readthedocs.io/en/stable/matrix_factorization.html#unbiased-note

In [11]:
# Choose SVD algorithm
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8694  0.8790  0.8712  0.8809  0.8676  0.8736  0.0053  
MAE (testset)     0.6701  0.6740  0.6687  0.6770  0.6667  0.6713  0.0037  
Fit time          4.75    4.81    4.76    5.00    4.96    4.86    0.10    
Test time         0.23    0.16    0.16    0.16    0.15    0.17    0.03    


{'test_rmse': array([0.86939936, 0.87903857, 0.87115916, 0.88085742, 0.86761183]),
 'test_mae': array([0.67005638, 0.67401644, 0.66873739, 0.67697403, 0.66668405]),
 'fit_time': (4.753783226013184,
  4.810222148895264,
  4.757262229919434,
  4.9972007274627686,
  4.962130069732666),
 'test_time': (0.23297357559204102,
  0.15533089637756348,
  0.16458988189697266,
  0.16200017929077148,
  0.15316462516784668)}

In [9]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Best RMSE:  0.8940122340551199
RMSE Params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
Best MAE =  0.692131520583474
MAE Params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [18]:
#Basic Knn using cosine similarity

sim_options = {'name': 'cosine', 'user_based': False}

algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9767  0.9671  0.9713  0.9840  0.9780  0.9754  0.0058  
MAE (testset)     0.7598  0.7543  0.7590  0.7669  0.7612  0.7602  0.0041  
Fit time          17.58   17.47   17.01   16.99   16.77   17.16   0.31    
Test time         7.02    6.96    7.15    7.30    7.06    7.10    0.12    


{'test_rmse': array([0.9767422 , 0.9670877 , 0.97132987, 0.98398131, 0.97798084]),
 'test_mae': array([0.7597995 , 0.75428975, 0.75900361, 0.76690401, 0.76117498]),
 'fit_time': (17.576207160949707,
  17.467678785324097,
  17.006751775741577,
  16.989104986190796,
  16.767837285995483),
 'test_time': (7.024522066116333,
  6.959027290344238,
  7.152004718780518,
  7.303139925003052,
  7.06428337097168)}

In [19]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Data Source:

F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. https://doi.org/10.1145/2827872

Data retrieved from https://grouplens.org/datasets/movielens/ on March 28, 2021