In [1]:
#import pandas as pd
import os
from surprise import SVD, KNNBasic
from surprise import Dataset, Reader
from surprise import BaselineOnly
from surprise.model_selection import cross_validate, GridSearchCV

In [3]:
# specify where file is
filepath = os.path.expanduser('/Users/djdit/OneDrive - University of South Florida/Data Mining/Project/Movie-Recommendation-System/ml-latest-small/ratings.csv')

#Define reader (Dataset package requirement for reading  in file)
#Defining columns, seperator, rating scale, and skipping 1st row since it is headers
reader = Reader(line_format = 'user item rating timestamp', sep = ',', rating_scale=(1,5), skip_lines=1)

In [4]:
# Load in dataset object
data = Dataset.load_from_file(filepath, reader=reader )

SVD Algorithm:

https://surprise.readthedocs.io/en/stable/matrix_factorization.html#unbiased-note

In [11]:
# Choose SVD algorithm
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8694  0.8790  0.8712  0.8809  0.8676  0.8736  0.0053  
MAE (testset)     0.6701  0.6740  0.6687  0.6770  0.6667  0.6713  0.0037  
Fit time          4.75    4.81    4.76    5.00    4.96    4.86    0.10    
Test time         0.23    0.16    0.16    0.16    0.15    0.17    0.03    


{'test_rmse': array([0.86939936, 0.87903857, 0.87115916, 0.88085742, 0.86761183]),
 'test_mae': array([0.67005638, 0.67401644, 0.66873739, 0.67697403, 0.66668405]),
 'fit_time': (4.753783226013184,
  4.810222148895264,
  4.757262229919434,
  4.9972007274627686,
  4.962130069732666),
 'test_time': (0.23297357559204102,
  0.15533089637756348,
  0.16458988189697266,
  0.16200017929077148,
  0.15316462516784668)}

In [9]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Best RMSE:  0.8940122340551199
RMSE Params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
Best MAE =  0.692131520583474
MAE Params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [5]:
#Basic Knn using cosine similarity

sim_options = {'name': 'cosine', 'user_based': False}

algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9815  0.9789  0.9750  0.9708  0.9761  0.9765  0.0036  
MAE (testset)     0.7662  0.7631  0.7578  0.7557  0.7596  0.7605  0.0038  
Fit time          15.18   14.94   14.55   15.11   14.60   14.88   0.26    
Test time         5.94    6.29    5.96    5.89    5.92    6.00    0.15    


{'test_rmse': array([0.98154912, 0.97890004, 0.97500174, 0.97082251, 0.97613569]),
 'test_mae': array([0.76620128, 0.76314713, 0.757763  , 0.75573348, 0.75963838]),
 'fit_time': (15.183752536773682,
  14.941388845443726,
  14.5520179271698,
  15.11218810081482,
  14.598479270935059),
 'test_time': (5.9407055377960205,
  6.286655426025391,
  5.963624954223633,
  5.886139631271362,
  5.921049118041992)}

In [19]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [6]:
#Basic Knn using cosine similarity

sim_options = {'name': 'cosine', 'item_based': False}

algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9668  0.9744  0.9710  0.9726  0.9751  0.9720  0.0029  
MAE (testset)     0.7448  0.7487  0.7487  0.7486  0.7511  0.7484  0.0020  
Fit time          0.46    0.48    0.48    0.47    0.48    0.47    0.01    
Test time         1.30    1.30    1.25    1.28    1.30    1.28    0.02    


{'test_rmse': array([0.96682329, 0.97436226, 0.97101997, 0.97259629, 0.97506909]),
 'test_mae': array([0.74482789, 0.74868452, 0.74865663, 0.74856314, 0.75108339]),
 'fit_time': (0.45949530601501465,
  0.48102903366088867,
  0.47939109802246094,
  0.46973443031311035,
  0.4824814796447754),
 'test_time': (1.3036189079284668,
  1.2971348762512207,
  1.2455708980560303,
  1.2797329425811768,
  1.297943353652954)}

Data Source:

F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. https://doi.org/10.1145/2827872

Data retrieved from https://grouplens.org/datasets/movielens/ on March 28, 2021