In [1]:
#import pandas as pd
import os
from surprise import SVD, KNNBasic
from surprise import Dataset, Reader
from surprise import BaselineOnly
from surprise.model_selection import cross_validate, GridSearchCV

In [2]:
# specify where file is
filepath = os.path.expanduser('/Users/djdit/OneDrive - University of South Florida/Data Mining/Project/Movie-Recommendation-System/ml-latest-small/ratings.csv')

#Define reader (Dataset package requirement for reading  in file)
#Defining columns, seperator, rating scale, and skipping 1st row since it is headers
reader = Reader(line_format = 'user item rating timestamp', sep = ',', rating_scale=(1,5), skip_lines=1)

In [3]:
# Load in dataset object
data = Dataset.load_from_file(filepath, reader=reader )

SVD Algorithm:

https://surprise.readthedocs.io/en/stable/matrix_factorization.html#unbiased-note

In [4]:
# Choose SVD algorithm
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8742  0.8870  0.8724  0.8671  0.8682  0.8738  0.0071  
MAE (testset)     0.6715  0.6802  0.6706  0.6680  0.6689  0.6718  0.0043  
Fit time          3.49    3.47    3.47    3.52    3.60    3.51    0.05    
Test time         0.16    0.16    0.10    0.13    0.15    0.14    0.02    


{'test_rmse': array([0.87424678, 0.88701746, 0.87236883, 0.86713253, 0.8682221 ]),
 'test_mae': array([0.67151369, 0.68016774, 0.67058153, 0.66803299, 0.66892746]),
 'fit_time': (3.487036943435669,
  3.470679998397827,
  3.4676778316497803,
  3.524589776992798,
  3.5974273681640625),
 'test_time': (0.15959906578063965,
  0.15718436241149902,
  0.10386896133422852,
  0.13427948951721191,
  0.14982247352600098)}

In [5]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Best RMSE:  0.8899986449506466
RMSE Params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
Best MAE =  0.6887468857443706
MAE Params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [6]:
#Basic Knn using cosine similarity

sim_options = {'name': 'cosine', 'user_based': False}

algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9726  0.9784  0.9801  0.9747  0.9797  0.9771  0.0030  
MAE (testset)     0.7560  0.7627  0.7637  0.7579  0.7662  0.7613  0.0038  
Fit time          13.30   13.50   13.33   13.46   13.17   13.35   0.12    
Test time         5.28    5.62    5.27    5.71    5.44    5.46    0.18    


{'test_rmse': array([0.97255735, 0.97843684, 0.98006559, 0.97471595, 0.97970387]),
 'test_mae': array([0.75599679, 0.76267593, 0.76365563, 0.75791566, 0.76618722]),
 'fit_time': (13.299304008483887,
  13.503310918807983,
  13.328357458114624,
  13.460290431976318,
  13.1718590259552),
 'test_time': (5.280069828033447,
  5.615118741989136,
  5.267477035522461,
  5.713312387466431,
  5.435458421707153)}

In [7]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [8]:
#Basic Knn using cosine similarity

sim_options = {'name': 'cosine', 'item_based': False}

algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9776  0.9662  0.9679  0.9701  0.9782  0.9720  0.0050  
MAE (testset)     0.7518  0.7441  0.7473  0.7463  0.7539  0.7487  0.0036  
Fit time          0.41    0.45    0.43    0.45    0.44    0.44    0.01    
Test time         1.12    1.12    1.19    1.18    1.11    1.14    0.03    


{'test_rmse': array([0.97758554, 0.96620516, 0.96791647, 0.9701301 , 0.97820016]),
 'test_mae': array([0.75176017, 0.74413448, 0.7472585 , 0.74631822, 0.753903  ]),
 'fit_time': (0.41201281547546387,
  0.4501481056213379,
  0.4341144561767578,
  0.4452364444732666,
  0.43535733222961426),
 'test_time': (1.1167235374450684,
  1.1234185695648193,
  1.1918399333953857,
  1.180553913116455,
  1.1120951175689697)}

Data Source:

F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. https://doi.org/10.1145/2827872

Data retrieved from https://grouplens.org/datasets/movielens/ on March 28, 2021