## Creating the Final Submission

In [1]:
# Load IPython's autoreload extension and select mode 2, so that imported
# modules are reloaded before each cell runs and on-disk edits are picked up.
%load_ext autoreload
%autoreload 2

Necessary imports:

In [2]:
import random

import numpy as np

from baselines import Baselines
from MF_SGD import MF_SGD
from MF_BSGD import MF_BSGD
from MF_ALS import MF_ALS
from surprise_models import SurpriseModels
from blending import Blending
from data import Data
from data_processing import create_submission

Specify the submission filename.

In [3]:
# Relative path of the submission CSV; it is handed to create_submission()
# in the last cell of the notebook.
PREDICTIONS_FILENAME = 'Datasets/mixed_model_with_median.csv'

Optimal blending weights, obtained by executing `run_blending`.

In [4]:
# Blending weight for each model, keyed by the same name the model is stored
# under in `models`.
# The Surprise SVD and SVD++ models are not part of the blend, so they have
# no entry here.
OPTIMAL_WEIGHTS = dict(
    # Baselines
    baseline_global_mean=0.19271591983375924,
    baseline_user_mean=-0.4337299003053698,
    baseline_item_mean=-0.2985150084211989,
    baseline_global_median=0.1970668001493541,
    baseline_user_median=0.015201139195270304,
    baseline_item_median=-0.0018982199858547794,
    # Matrix factorization
    mf_sgd=-0.17923491952957243,
    mf_bsgd=0.34524178921797943,
    mf_als=0.7335166340482717,
    # Surprise library
    surprise_kNN_baseline_user=0.2586850797696792,
    surprise_kNN_baseline_item=0.35244287155037823,
    surprise_slope_one=-0.1812668620103521,
    surprise_co_clustering=-0.0005493596729955596,
)

Set the random seed to be able to reproduce the results.

In [5]:
# Seed every global random number generator the pipeline may draw from.
# NumPy's global generator gets the same seed as before; Python's built-in
# `random` module is seeded as well, as the Surprise documentation recommends
# for reproducible experiments.
SEED = 98
random.seed(SEED)
np.random.seed(SEED)

Load and prepare data.

In [6]:
# Build the project's Data object with its default arguments. Later cells pass
# it to every model and read/write its `test_df` attribute.
data = Data()

Preparing data ...
... data is prepared.


Dictionary for the models to blend:

In [7]:
# Starts empty; each modelling cell below stores its predictions here under
# the model's name.
models = dict()

Run Baseline models.

In [8]:
baselines = Baselines(data=data)

# One row per baseline:
#   (name shown in the progress message, key used in `models`, method to call)
# The item-based baselines are announced as "movie" in the messages.
baseline_runs = [
    ('baseline_global_mean', 'baseline_global_mean', baselines.baseline_global_mean),
    ('baseline_user_mean', 'baseline_user_mean', baselines.baseline_user_mean),
    ('baseline_movie_mean', 'baseline_item_mean', baselines.baseline_item_mean),
    ('baseline_global_median', 'baseline_global_median', baselines.baseline_global_median),
    ('baseline_user_median', 'baseline_user_median', baselines.baseline_user_median),
    ('baseline_movie_median', 'baseline_item_median', baselines.baseline_item_median),
]

for shown_name, model_key, run_baseline in baseline_runs:
    print('\nModelling using ' + shown_name + ' ...')
    # Only the 'Rating' entry of each baseline's result is kept for blending.
    models[model_key] = run_baseline()['Rating']
    print('... done')


Modelling using baseline_global_mean ...
... done

Modelling using baseline_user_mean ...
... done

Modelling using baseline_movie_mean ...
... done

Modelling using baseline_global_median ...
... done

Modelling using baseline_user_median ...
... done

Modelling using baseline_movie_median ...
... done


Run Matrix Factorization model trained using Stochastic Gradient Descent.

In [9]:
# Matrix factorization fitted with SGD, built on the prepared data.
mf_sgd = MF_SGD(data=data)

print('\nModelling using MF_SGD ...')
# Only the 'Rating' entry of train()'s result is kept for blending.
sgd_result = mf_sgd.train()
models['mf_sgd'] = sgd_result['Rating']
print('... done')


Modelling using MF_SGD ...
Learning the matrix factorization using SGD ...
Iteration: 1, RMSE on training set: 1.0166969847476204
Iteration: 2, RMSE on training set: 1.0078950940534885
Iteration: 3, RMSE on training set: 1.0024668456228236
Iteration: 4, RMSE on training set: 0.9979098961745829
Iteration: 5, RMSE on training set: 0.9932903188031309
Iteration: 6, RMSE on training set: 0.9894890461812211
Iteration: 7, RMSE on training set: 0.9858652627655524
Iteration: 8, RMSE on training set: 0.9834909977858932
Iteration: 9, RMSE on training set: 0.9818735595145989
Iteration: 10, RMSE on training set: 0.9801002466983799
Iteration: 11, RMSE on training set: 0.9785933644363982
Iteration: 12, RMSE on training set: 0.977309942743755
Iteration: 13, RMSE on training set: 0.9763630897813994
Iteration: 14, RMSE on training set: 0.9753362850887657
Iteration: 15, RMSE on training set: 0.9747343771798103
Iteration: 16, RMSE on training set: 0.974258696324112
Iteration: 17, RMSE on training set: 0.

Run Matrix Factorization model trained using Biased Stochastic Gradient Descent.

In [10]:
# Matrix factorization fitted with biased SGD, built on the prepared data.
mf_bsgd = MF_BSGD(data=data)

print('\nModelling using MF_BSGD ...')
# Only the 'Rating' entry of train()'s result is kept for blending.
bsgd_result = mf_bsgd.train()
models['mf_bsgd'] = bsgd_result['Rating']
print('... done')


Modelling using MF_BSGD ...
Learning the matrix factorization using BSGD ...
Iteration: 1, RMSE on training set: 1.003379578207212
Iteration: 2, RMSE on training set: 0.9919184290095188
Iteration: 3, RMSE on training set: 0.9842087930510309
Iteration: 4, RMSE on training set: 0.9797450225688004
Iteration: 5, RMSE on training set: 0.9770151336652192
Iteration: 6, RMSE on training set: 0.9744641793239239
Iteration: 7, RMSE on training set: 0.9725183722210042
Iteration: 8, RMSE on training set: 0.97106812342965
Iteration: 9, RMSE on training set: 0.9697783100895455
Iteration: 10, RMSE on training set: 0.9687095644419902
Iteration: 11, RMSE on training set: 0.9678661222401186
Iteration: 12, RMSE on training set: 0.9671183385726674
Iteration: 13, RMSE on training set: 0.9663945889625084
Iteration: 14, RMSE on training set: 0.9660136898404293
Iteration: 15, RMSE on training set: 0.9655585917239524
Iteration: 16, RMSE on training set: 0.9652041146053583
Iteration: 17, RMSE on training set: 0

Run Matrix Factorization model trained using Alternating Least Squares.

In [11]:
# Matrix factorization fitted with ALS, built on the prepared data.
mf_als = MF_ALS(data=data)

print('\nModelling using MF_ALS ...')
# Only the 'Rating' entry of train()'s result is kept for blending.
als_result = mf_als.train()
models['mf_als'] = als_result['Rating']
print('... done')


Modelling using MF_ALS ...
Learning the matrix factorization using ALS ...
Iteration: 1, RMSE on training set: 0.9851661526276353
Iteration: 2, RMSE on training set: 0.9729761747745292
Iteration: 3, RMSE on training set: 0.9593760194193154
Iteration: 4, RMSE on training set: 0.9508124266451606
Iteration: 5, RMSE on training set: 0.9467193552466387
Iteration: 6, RMSE on training set: 0.9445146732631728
Iteration: 7, RMSE on training set: 0.9431822148388905
Iteration: 8, RMSE on training set: 0.942313112757796
Iteration: 9, RMSE on training set: 0.9417241856325538
Iteration: 10, RMSE on training set: 0.9413201368876003
Iteration: 11, RMSE on training set: 0.9410432464459695
Iteration: 12, RMSE on training set: 0.9408547030260581
Iteration: 13, RMSE on training set: 0.9407271238901533
Iteration: 14, RMSE on training set: 0.9406409959858569
Iteration: 15, RMSE on training set: 0.9405825359150393
Iteration: 16, RMSE on training set: 0.9405421472526104
Iteration: 17, RMSE on training set: 0

Run Models from Surprise Library.

In [12]:
# Wrapper object whose methods (kNN_baseline, slope_one, co_clustering) are
# called in the cells below to run the Surprise-based models.
surprise_models = SurpriseModels(data=data)

Run neighborhood models from Surprise Library.

In [13]:
print('\nModelling using user based Surprise kNN Baseline ...')
models['surprise_kNN_baseline_user'] = surprise_models.kNN_baseline(k=100, 
                                                                    sim_options={'name': 'pearson_baseline', 
                                                                                 'user_based': True})['Rating']
print('... done')

print('\nModelling using item based Surprise kNN Baseline ...')
models['surprise_kNN_baseline_item'] = surprise_models.kNN_baseline(k=300, 
                                                                    sim_options={'name': 'pearson_baseline',
                                                                                 'user_based': False})['Rating']
print('... done')


Modelling using user based Surprise kNN Baseline ...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
... done

Modelling using item based Surprise kNN Baseline ...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
... done


Run two simpler models from the Surprise library.

In [14]:
# NOTE: the Surprise SVD and SVD++ models are deliberately not run here;
# OPTIMAL_WEIGHTS holds no weight for them, so they are not blended.

print('\nModelling using Surprise SlopeOne ...')
slope_one_result = surprise_models.slope_one()
models['surprise_slope_one'] = slope_one_result['Rating']
print('... done')

print('\nModelling using Surprise Co-Clustering ...')
co_clustering_result = surprise_models.co_clustering()
models['surprise_co_clustering'] = co_clustering_result['Rating']
print('... done')


Modelling using Surprise SlopeOne ...
... done

Modelling using Surprise Co-Clustering ...
... done


Create the blended (combined) model.

In [15]:
# Blending is given the per-model predictions, the current 'Rating' column of
# the test frame, and the fixed weight for each model.
test_ratings = data.test_df['Rating']
blending = Blending(models, test_ratings, OPTIMAL_WEIGHTS)

print('\nModelling using weighted averaging of the previous models ...')
# The combined prediction; it is written into the submission in the next cell.
mixed_model = blending.get_weighted_average()
print('... done')


Modelling using weighted averaging of the previous models ...
... done


Create the submission csv file.

In [16]:
# Overwrite the test frame's 'Rating' column with the blended predictions.
# NOTE(review): this mutates data.test_df in place, and the blending cell
# reads data.test_df['Rating']. Re-create `data` before re-running that cell
# if Blending depends on the original column values.
data.test_df['Rating'] = mixed_model

# Name the configured output path in the messages so they always match the
# file that is actually written.
print('\nCreating {} ...'.format(PREDICTIONS_FILENAME))
create_submission(data.test_df, PREDICTIONS_FILENAME)
print('... {} created.'.format(PREDICTIONS_FILENAME))


Creating mixed_model.csv ...
... mixed_model.csv created.
