In [None]:
# Discard the output of this cell.
%%capture

!pip install scikit-surprise

In [42]:
import numpy as np
import pandas as pd
import random
from surprise import Dataset, Reader
from surprise import accuracy
from surprise import KNNBasic, SVD, NormalPredictor
from surprise.model_selection import cross_validate, GridSearchCV

In [30]:
# Seed NumPy's global RNG and Python's `random` module with the same value
# so that repeated runs draw the same random numbers.
seed_constant = 27
for seed_rng in (np.random.seed, random.seed):
    seed_rng(seed_constant)

Note: I assume that there are no restrictions on using deep learning or models from scratch, at least your team said nothing about that.

I decided to use classical ML models, so I will use [scikit-surprise](https://surpriselib.com/), a library designed for building recommender systems. \
It will help me with:
1. Dataset handling
2. Various ready-to-use prediction algorithms such as baseline algorithms, neighborhood methods, and matrix factorization-based methods (SVD, PMF, SVD++, NMF)
3. Tools to evaluate, analyse and compare the algorithms’ performance. Cross-validation procedures can be run very easily using powerful CV iterators.


## Load raw data

In [19]:
# Directory prefix for the input CSV files. File names are appended directly to
# this string when loading, so the trailing slash is required.
DATAPATH = "../data/interim/"

In [25]:
# Read the training ratings and preview the first rows.
train_path = DATAPATH + 'train.csv'
train = pd.read_csv(train_path)
train.head()

Unnamed: 0,user_id,movie_title,rating
0,456,Mystery Science Theater 3000: The Movie (1996),3.0
1,891,Shine (1996),4.0
2,49,"Manchurian Candidate, The (1962)",5.0
3,561,"Princess Bride, The (1987)",4.0
4,83,Austin Powers: International Man of Mystery (1...,2.0


In [26]:
# Read the held-out test ratings and preview the first rows.
test_path = DATAPATH + 'test.csv'
test = pd.read_csv(test_path)
test.head()

Unnamed: 0,user_id,movie_title,rating
0,880,Leaving Las Vegas (1995),4.0
1,543,Courage Under Fire (1996),3.0
2,393,"Thin Blue Line, The (1988)",3.0
3,267,Supercop (1992),5.0
4,297,My Fellow Americans (1996),3.0


## Build dataset

In [24]:
# `Dataset.load_from_df` interprets the frame's columns by position
# (user id, item id, rating), so select them explicitly in that order instead
# of relying on whatever column layout the CSV files happen to have.
RATING_COLUMNS = ['user_id', 'movie_title', 'rating']

# Ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
trainset = Dataset.load_from_df(train[RATING_COLUMNS], reader)
testset = Dataset.load_from_df(test[RATING_COLUMNS], reader)

## Choose the model

I decided to compare three basic prediction algorithms using 3-fold cross-validation:
1. Normal Predictor: It predicts a random rating based on the distribution of the training set, which is assumed to be normal.
2. SVD: A matrix factorization algorithm popularized by Simon Funk during the Netflix Prize.
3. KNN Basic: A basic neighborhood-based collaborative filtering algorithm.

I will use [MAE](https://en.wikipedia.org/wiki/Mean_absolute_error) as the accuracy metric for the predictions, as it is a common choice for recommender systems.


In [None]:
# Compare the candidate algorithms by their scores averaged over 3 CV folds.
# Each entry of `benchmark` is one record: the algorithm's class name plus the
# fold-averaged value of every series returned by `cross_validate`.
benchmark = []

for classifier in [SVD(), NormalPredictor(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(classifier, trainset, measures=['MAE'], cv=3, verbose=False)

    # Average each reported series (test MAE, fit time, test time) over the folds.
    fold_means = pd.DataFrame.from_dict(results).mean(axis=0)

    # Label the record with the plain class name. Storing a bare string (rather
    # than a one-element Series) keeps the label from being rendered as "[SVD]",
    # and `type(...).__name__` avoids parsing the object's repr.
    record = {'Classifier': type(classifier).__name__}
    record.update(fold_means.to_dict())
    benchmark.append(record)

In [61]:
# One row per algorithm, indexed by name and ordered by ascending mean test MAE
# (lower is better).
benchmark_frame = pd.DataFrame(benchmark)
surprise_results = benchmark_frame.set_index('Classifier').sort_values(by='test_mae')
surprise_results

Unnamed: 0_level_0,test_mae,fit_time,test_time
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
[SVD],0.753429,0.683718,0.293792
[KNNBasic],0.788998,0.302672,2.996646
[NormalPredictor],1.212464,0.07867,0.098428


We can see that SVD performs best, so next we search for its best hyperparameters using GridSearchCV.

In [63]:
# Exhaustive search over SVD hyperparameters, scored by 3-fold cross-validated MAE.
param_grid = {
    'n_factors': [25, 30, 35, 40],
    'n_epochs': [15, 20, 25],
    'lr_all': [0.001, 0.003, 0.005, 0.008],
    'reg_all': [0.08, 0.1, 0.15],
}
gs = GridSearchCV(SVD, param_grid, measures=['mae'], cv=3)
gs.fit(trainset)

print('Best score: {}'.format(gs.best_score['mae']))
print('Best params: {}'.format(gs.best_params['mae']))

# NOTE(review): `refit` is not passed to GridSearchCV here; per the Surprise docs
# it defaults to False, in which case this estimator carries the best parameters
# but has not been trained. Confirm before using `algo` for predictions.
algo = gs.best_estimator['mae']

Best score: 0.7435252213453083
Best params: {'n_factors': 40, 'n_epochs': 25, 'lr_all': 0.008, 'reg_all': 0.08}


## Fit and test

In [64]:
# Unpack the hyperparameters that achieved the best MAE in the grid search.
params = gs.best_params
factors, epochs, lr_value, reg_value = (
    params['mae'][key] for key in ('n_factors', 'n_epochs', 'lr_all', 'reg_all')
)

In [65]:
# Build the Surprise train/test structures from the loaded datasets:
# the trainset uses all training ratings, the testset all test ratings.
trainset_full = trainset.build_full_trainset()
testset_full = testset.construct_testset(testset.raw_ratings)

# Train an SVD configured with the tuned hyperparameters.
svd_settings = dict(n_factors=factors, n_epochs=epochs,
                    lr_all=lr_value, reg_all=reg_value)
classifier = SVD(**svd_settings)
classifier.fit(trainset_full)

# Score the held-out ratings; `accuracy.mae` prints and returns the MAE.
predictions = classifier.test(testset_full)
accuracy.mae(predictions)

MAE:  0.7292


0.7292233586943364

Save the model for future use

In [68]:
import pickle

# save
# Persist `classifier`: the SVD that was fitted on the full training set and
# evaluated on the test set above. The previous version pickled `algo`, the
# grid search's `best_estimator`, which (with GridSearchCV's default
# refit=False) is only configured with the best parameters and never trained,
# so the saved file could not be used for prediction.
with open('../models/final_model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)