In [7]:
#import pandas as pd
import os
from surprise import SVD, KNNBasic
from surprise import Dataset, Reader
from surprise import BaselineOnly
from surprise.model_selection import cross_validate, GridSearchCV

In [8]:
# specify where file is
filepath = os.path.expanduser('/Users/djdit/OneDrive - University of South Florida/Data Mining/Project/Movie-Recommendation-System/ml-latest-small/ratings.csv')

#Define reader (Dataset package requirement for reading  in file)
#Defining columns, seperator, rating scale, and skipping 1st row since it is headers
reader = Reader(line_format = 'user item rating timestamp', sep = ',', rating_scale=(1,5), skip_lines=1)

In [9]:
# Load in dataset object
data = Dataset.load_from_file(filepath, reader=reader )

SVD Algorithm:

https://surprise.readthedocs.io/en/stable/matrix_factorization.html#unbiased-note

In [10]:
# Select SVD algorithm
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8758  0.8743  0.8690  0.8739  0.8758  0.8738  0.0025  
MAE (testset)     0.6759  0.6733  0.6652  0.6721  0.6702  0.6713  0.0036  
Fit time          3.93    3.85    3.98    3.87    3.85    3.89    0.05    
Test time         0.21    0.14    0.17    0.15    0.16    0.16    0.02    


{'test_rmse': array([0.87582919, 0.87432493, 0.86897935, 0.87385277, 0.8758324 ]),
 'test_mae': array([0.67590741, 0.67329564, 0.66521533, 0.6720993 , 0.67022498]),
 'fit_time': (3.92521595954895,
  3.8505022525787354,
  3.977987289428711,
  3.866163969039917,
  3.849642515182495),
 'test_time': (0.21100068092346191,
  0.14109230041503906,
  0.1680750846862793,
  0.14713597297668457,
  0.15614032745361328)}

In [11]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Best RMSE:  0.8902793486851757
RMSE Params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
Best MAE =  0.6885319049994525
MAE Params:  {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [12]:
#Basic Knn using cosine similarity

sim_options = {'name': 'cosine', 'user_based': False}

algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9782  0.9669  0.9823  0.9846  0.9768  0.9778  0.0061  
MAE (testset)     0.7615  0.7538  0.7657  0.7672  0.7611  0.7619  0.0047  
Fit time          15.12   15.37   15.80   14.60   14.71   15.12   0.44    
Test time         6.39    6.76    6.28    5.93    6.35    6.34    0.26    


{'test_rmse': array([0.97819814, 0.96690068, 0.98234293, 0.98461566, 0.97680625]),
 'test_mae': array([0.76152959, 0.75379212, 0.76573776, 0.76719713, 0.76109443]),
 'fit_time': (15.124055862426758,
  15.36873483657837,
  15.801417589187622,
  14.59755563735962,
  14.709291458129883),
 'test_time': (6.385185718536377,
  6.755601406097412,
  6.2799293994903564,
  5.931193828582764,
  6.347393274307251)}

In [13]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [14]:
#Basic Knn using pearson similarity

sim_options = {'name': 'pearson', 'user_based': False}

algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9685  0.9693  0.9635  0.9748  0.9697  0.9692  0.0036  
MAE (testset)     0.7493  0.7516  0.7503  0.7596  0.7531  0.7528  0.0036  
Fit time          21.39   21.01   20.87   20.44   20.52   20.85   0.34    
Test time         6.44    6.29    6.85    6.02    6.55    6.43    0.27    


{'test_rmse': array([0.96846938, 0.96934747, 0.96353339, 0.97478407, 0.96965724]),
 'test_mae': array([0.74927395, 0.75155689, 0.75033125, 0.75956545, 0.75306085]),
 'fit_time': (21.387399435043335,
  21.005433320999146,
  20.874337434768677,
  20.444051265716553,
  20.517211437225342),
 'test_time': (6.437053918838501,
  6.28568696975708,
  6.846930980682373,
  6.0199809074401855,
  6.546799421310425)}

In [15]:
# Tune with GridSearchCV to see if results are improved
param_grid = {'n_epochs': [5,10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# Show best score and parameters that gave best rmse
print("Best RMSE: ", gs.best_score['rmse'])
print("RMSE Params: ", gs.best_params['rmse'])
print("Best MAE = ", gs.best_score['mae'])
print("MAE Params: ", gs.best_params['mae'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Data Source:

F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. https://doi.org/10.1145/2827872

Data retrieved from https://grouplens.org/datasets/movielens/ on March 28, 2021

In [8]:
#Basic Knn using cosine similarity

sim_options = {'name': 'cosine', 'item_based': False}

algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9776  0.9662  0.9679  0.9701  0.9782  0.9720  0.0050  
MAE (testset)     0.7518  0.7441  0.7473  0.7463  0.7539  0.7487  0.0036  
Fit time          0.41    0.45    0.43    0.45    0.44    0.44    0.01    
Test time         1.12    1.12    1.19    1.18    1.11    1.14    0.03    


{'test_rmse': array([0.97758554, 0.96620516, 0.96791647, 0.9701301 , 0.97820016]),
 'test_mae': array([0.75176017, 0.74413448, 0.7472585 , 0.74631822, 0.753903  ]),
 'fit_time': (0.41201281547546387,
  0.4501481056213379,
  0.4341144561767578,
  0.4452364444732666,
  0.43535733222961426),
 'test_time': (1.1167235374450684,
  1.1234185695648193,
  1.1918399333953857,
  1.180553913116455,
  1.1120951175689697)}