### ML Project 2 - Recommender Systems - Surprise Library

In [1]:
import math
import pickle

import numpy as np
import pandas as pd
from surprise import *
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

from implementations import *

%load_ext autoreload
%autoreload 2

### Load Data

In [2]:
# Input CSV paths, relative to the notebook's working directory.
train_file = "data_train.csv"
submission_file = "sampleSubmission.csv"

# load_data is a project helper (from `implementations`). It returns four
# objects that the rest of the notebook reads as globals; presumably a Surprise
# trainset/testset pair, the ratings DataFrame and the submission template
# -- TODO confirm against implementations.py.
trainset, testset, df, toBeSubmitted = load_data(train_file, submission_file)

### SVD

#### Tune Hyperparameters

In [None]:
# Search space for SVD: a single epoch count, two candidate learning rates and
# two candidate regularisation strengths.
algtype = SVD
param_grid = dict(
    n_epochs=[30],
    lr_all=[0.01, 0.1],
    reg_all=[0.01, 0.1],
)

# tuneHyperParams is a project helper (from `implementations`); presumably it
# runs a grid search over param_grid -- TODO confirm.
gs = tuneHyperParams(algtype, trainset, testset, df, param_grid)

In [None]:

# Persist the SVD tuning result (`gs`) to disk.
with open('SVD.pkl', 'wb') as svd_out:
    pickle.dump(gs, svd_out)

#### Cross Validate

In [None]:
# 5-fold cross-validation of SVD on the full ratings frame, reporting RMSE.
# Alternative kept for reference: SVDpp(n_epochs=30, lr_all=0.001, reg_all=0.001)
# NOTE(review): n_epochs=1 looks like a quick smoke-test setting (the SVD
# tuning grid uses 30) -- confirm before trusting these scores.
reader = Reader(rating_scale=(1, 5))
d = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader=reader)
algo = SVD(n_epochs=1, lr_all=0.001, reg_all=0.001)

cross_validate(algo=algo, data=d, measures=['RMSE'], cv=5, verbose=True)

### KNNBaseline

#### Tune Hyperparameters

In [None]:
# Search space for user-based KNNBaseline.
#
# Surprise's grid search expects baseline and similarity settings nested under
# 'bsl_options' / 'sim_options'. Passed as flat keys ('n_epochs', 'name',
# 'user_based') they are swallowed by the algorithm's **kwargs and ignored, so
# the search would silently run with the default similarity configuration.
# NOTE(review): assumes tuneHyperParams (project helper) forwards param_grid to
# surprise.model_selection.GridSearchCV unchanged -- confirm.
algotype = KNNBaseline
param_grid = {
    'k': [50, 300],
    'bsl_options': {'n_epochs': [30]},
    'sim_options': {'name': ['pearson_baseline'], 'user_based': [True]},
}

gs = tuneHyperParams(algotype, trainset, testset, df, param_grid)

In [None]:
# Persist the user-based KNN tuning result (`gs`) to disk.
with open('knnUserBased.pkl', 'wb') as knn_user_out:
    pickle.dump(gs, knn_user_out)

In [None]:
# Search space for item-based KNNBaseline.
#
# Surprise's grid search expects baseline and similarity settings nested under
# 'bsl_options' / 'sim_options'. Passed as flat keys they are swallowed by the
# algorithm's **kwargs and ignored, which would leave 'user_based' at its
# default and make this run identical to a user-based one.
# NOTE(review): assumes tuneHyperParams (project helper) forwards param_grid to
# surprise.model_selection.GridSearchCV unchanged -- confirm.
algotype = KNNBaseline
param_grid = {
    'k': [50, 300],
    'bsl_options': {'n_epochs': [30]},
    'sim_options': {'name': ['pearson_baseline'], 'user_based': [False]},
}

gs = tuneHyperParams(algotype, trainset, testset, df, param_grid)

In [None]:
# Persist the item-based KNN tuning result (`gs`) to disk.
with open('knnItemBased.pkl', 'wb') as knn_item_out:
    pickle.dump(gs, knn_item_out)

#### Cross Validate

In [None]:
# 5-fold cross-validation of KNNBaseline, reporting RMSE.
#
# This section tunes and evaluates KNNBaseline; the cell previously
# instantiated KNNBasic, which does not add the baseline estimate to its
# predictions and therefore was not the model being tuned.
bsl_options = {'method': 'als', 'n_epochs': 20}
sim_options = {'name': 'pearson_baseline'}
algo = KNNBaseline(k=300, bsl_options=bsl_options, sim_options=sim_options)

reader = Reader(rating_scale=(1, 5))
d = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader)
cross_validate(algo, d, measures=['RMSE'], cv=5, verbose=True)

### SlopeOne

In [None]:
# 5-fold cross-validation of SlopeOne on the full ratings frame, reporting RMSE.
reader = Reader(rating_scale=(1, 5))
d = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader=reader)
algo = SlopeOne()

cross_validate(algo=algo, data=d, measures=['RMSE'], cv=5, verbose=True)

### Baseline

In [None]:
# 5-fold cross-validation of BaselineOnly on the full ratings frame, reporting RMSE.
reader = Reader(rating_scale=(1, 5))
d = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader=reader)
algo = BaselineOnly()

cross_validate(algo=algo, data=d, measures=['RMSE'], cv=5, verbose=True)

### Select Algorithm

In [None]:
# Algorithm class and search space used by the grid-search cell that follows.
# Alternative kept for reference: SVDpp(n_epochs=30, lr_all=0.001, reg_all=0.001)
algtype = SVD

# Each candidate value is listed once. The original lists repeated 0.01, which
# made the grid evaluate 25 combinations instead of the 16 distinct ones.
# First-occurrence order is preserved.
param_grid = {'n_epochs': [50],
              'lr_all': [0.01, 0.0005, 0.001, 0.1],
              'reg_all': [0.01, 0.0005, 0.001, 0.1],
             }

### Tune Hyperparameters via Grid Search

In [None]:
# Build a Surprise dataset from the ratings frame (ratings are on a 1-5 scale).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader)

# 3-fold grid search over param_grid, scored on RMSE and MAE, using all cores.
gs = GridSearchCV(algtype, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

# GridSearchCV.fit has no return value; the results live on `gs` itself, so the
# call is not assigned (the previous `model = gs.fit(data)` bound None).
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

### Fit & Predict

In [None]:
# Fit
# Take the estimator the grid search built for the best-RMSE parameters.
# Manual alternative, kept for reference:
#algo = SVD(n_factors=factor ,n_epochs=epoch, lr_all=lr_rate, reg_all=reg_rate)
algo = gs.best_estimator['rmse']

# Train on the trainset returned by load_data.
model = algo.fit(trainset)

# Predict
# Score every entry of the testset returned by load_data.
predictions = algo.test(testset)

In [None]:
# Show only a short preview: displaying the whole prediction list would flood
# the cell output and bloat the saved notebook.
predictions[:10]

### Submit

In [None]:
# Create Submission file
# create_submission_file is a project helper (from `implementations`). It is
# given the output name 'submission_surprise', the predictions from the
# fit/predict cell and the submission template returned by load_data;
# presumably it writes a CSV -- TODO confirm the exact filename and format.
create_submission_file('submission_surprise', predictions, toBeSubmitted)

In [3]:
# runSlopeOne is a project helper (from `implementations`); presumably it fits
# SlopeOne on trainset, predicts testset and fills toBeSubmitted -- TODO
# confirm. The cell displays the helper's return value.
runSlopeOne(trainset, testset, toBeSubmitted)

Unnamed: 0,User,Movie,Rating
0,37,1,3
1,73,1,3
2,156,1,4
3,160,1,3
4,248,1,3
5,256,1,3
6,284,1,3
7,400,1,3
8,416,1,3
9,456,1,3


In [4]:
# runBaselineOnly is a project helper (from `implementations`); presumably it
# fits BaselineOnly on trainset, predicts testset and fills toBeSubmitted --
# TODO confirm. The cell displays the helper's return value.
runBaselineOnly(trainset, testset, toBeSubmitted)

Estimating biases using als...


Unnamed: 0,User,Movie,Rating
0,37,1,3
1,73,1,3
2,156,1,4
3,160,1,3
4,248,1,3
5,256,1,3
6,284,1,3
7,400,1,3
8,416,1,3
9,456,1,3
