### ML Project 2 - Recommender Systems - Surprise Library

In [30]:
from surprise import *
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

import pandas as pd 
import numpy as np
import math
from implementations import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [31]:
train_file = "data_train.csv"
submission_file = "sampleSubmission.csv"

trainset, testset, df, toBeSubmitted = load_data(train_file, submission_file)

In [32]:
gridSearchDic = {}

### SVD

In [None]:
algtype = SVD
param_grid = {'n_epochs': [200],
              'lr_all': [0.01, 0.0005, 0.001, 0.01, 0.1],
              'reg_all':[0.01, 0.0005, 0.001, 0.01, 0.1],
             }
gs = tuneHyperParams(algtype, trainset, testset, df, param_grid)

In [None]:
gridSearchDic["SVD"] = gs

### KnnBaseline

In [34]:
algotype = KNNBaseline
param_grid = {'k': [50, 150, 300], 'n_epochs': [200],'name': ['pearson_baseline'], 'user_based': [True, False]}

gs = tuneHyperParams(algotype, trainset, testset, df, param_grid)

In [None]:
gridSearchDic["KnnBaseline"] = gs

In [None]:
with open('SupriseGridSearchDic.pkl', 'wb') as f:
    pickle.dump(gridSearchDic, f)

### SlopeOne

In [29]:
algo = SlopeOne()
# Fit
model = algo.fit(trainset)
# Predict
predictions = algo.test(testset)

In [28]:
algo = SlopeOne()

d = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader)
cross_validate(algo, d, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0010  0.9997  0.9995  1.0005  1.0005  1.0002  0.0006  
Fit time          4.47    4.78    4.79    5.22    4.91    4.83    0.24    
Test time         20.56   19.79   19.88   21.16   19.84   20.25   0.54    


{'test_rmse': array([1.0009662 , 0.99966161, 0.9994735 , 1.00049767, 1.00045951]),
 'fit_time': (4.473947048187256,
  4.776485443115234,
  4.785937309265137,
  5.222400188446045,
  4.909484624862671),
 'test_time': (20.55575442314148,
  19.792408227920532,
  19.882184267044067,
  21.15937352180481,
  19.839527368545532)}

### Baseline

In [35]:
BaselineOnly
algo = BaselineOnly()
bsl_options = {'n_epochs': 20}

d = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader)
cross_validate(algo, d, measures=['RMSE'], cv=5, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9985  0.9989  0.9995  1.0000  0.9999  0.9993  0.0006  
Fit time          6.08    6.97    6.89    6.90    6.82    6.73    0.33    
Test time         2.25    2.19    2.17    3.80    2.15    2.51    0.64    


{'test_rmse': array([0.9985382 , 0.99890149, 0.9994558 , 0.99998731, 0.99986172]),
 'fit_time': (6.076967716217041,
  6.968732833862305,
  6.885484933853149,
  6.9013590812683105,
  6.822945594787598),
 'test_time': (2.252380847930908,
  2.194763660430908,
  2.1709909439086914,
  3.799363136291504,
  2.1486735343933105)}

### Select Algorithm

In [18]:
# Select Algorithm
#algo = SVDpp(n_epochs=30,lr_all=0.001,reg_all=0.001)
algtype = SVD
param_grid = {'n_epochs': [50],
              'lr_all': [0.01, 0.0005, 0.001, 0.01, 0.1],
              'reg_all':[0.01, 0.0005, 0.001, 0.01, 0.1],
             }

### Tune Hyperparameters via Grid Search

In [15]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader)
#trainset, testset = train_test_split(data, test_size=.25, random_state=20)


gs = GridSearchCV(algtype, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

model = gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.0317596993526121
{'n_epochs': 4}


### Fit & Predict

In [19]:
# Fit
#algo = SVD(n_factors=factor ,n_epochs=epoch, lr_all=lr_rate, reg_all=reg_rate)
algo = gs.best_estimator['rmse']

model = algo.fit(trainset)

# Predict
predictions = algo.test(testset)

In [None]:
predictions

### Submit

In [None]:
# Create Submission file
create_submission_file('submission_surprise', algo, predictions, toBeSubmitted)