## Creating the Final Submission

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Necessary imports:

In [16]:
import numpy as np
from baselines import Baselines
from MF_SGD import MF_SGD
from MF_BSGD import MF_BSGD
from MF_ALS import MF_ALS
from surprise_models import SurpriseModels
from blending import Blending
from data import Data
from data_processing import create_submission

Specify the submission filename.

In [3]:
# Relative path of the submission CSV; it is handed to create_submission()
# in the last cell of this notebook.
PREDICTIONS_FILENAME = "Datasets/mixed_model.csv"

Optimal weights obtained by executing `run_blending`.

In [4]:
# Blending weight per model name. The keys are the same names used for the
# entries of the `models` dictionary; several weights are negative.
OPTIMAL_WEIGHTS = {
    # mean-based baselines
    'baseline_global_mean': 0.28820931,
    'baseline_user_mean': -0.31216191,
    'baseline_item_mean': -0.16738859,
    # matrix factorization models
    'mf_sgd': -0.37163926,
    'mf_bsgd': 0.53760117,
    'mf_als': 0.69963825,
    # models from the Surprise library
    'surprise_kNN_baseline_user': 0.0310014,
    'surprise_kNN_baseline_item': 0.40249769,
    'surprise_slope_one': -0.14927565,
    'surprise_co_clustering': 0.04481663,
}

Set the random seed to be able to reproduce the results.

In [5]:
# Seed NumPy's global random generator so that code drawing from it produces
# the same numbers on every run of the notebook.
RANDOM_SEED = 98
np.random.seed(RANDOM_SEED)

Load and prepare data.

In [6]:
# Build the project's Data object. According to the recorded output, the
# constructor reports 'Preparing data ...' / '... data is prepared.' while it
# runs. Every model wrapper below receives this object via `data=data`, and
# the last cells read and update `data.test_df`.
data = Data()

Preparing data ...
... data is prepared.


Dictionary for the models to blend:

In [7]:
# One placeholder (None) per model that takes part in the blend; the cells
# below replace each placeholder with that model's 'Rating' predictions.
models = dict.fromkeys((
    'baseline_global_mean',
    'baseline_user_mean',
    'baseline_item_mean',
    'mf_sgd',
    'mf_bsgd',
    'mf_als',
    'surprise_kNN_baseline_user',
    'surprise_kNN_baseline_item',
    # 'surprise_SVD',    (disabled)
    # 'surprise_SVDpp',  (disabled)
    'surprise_slope_one',
    'surprise_co_clustering',
))

Run Baseline models.

In [8]:
# Run the three mean-based baselines and store the 'Rating' entry of each
# result under the matching key of `models`.
baselines = Baselines(data=data)

# (key in `models`, label printed in the progress message, method to call)
# NOTE: the item-mean model is announced as 'baseline_movie_mean'; that label
# is kept verbatim so the printed output does not change.
baseline_runs = (
    ('baseline_global_mean', 'baseline_global_mean', baselines.baseline_global_mean),
    ('baseline_user_mean', 'baseline_user_mean', baselines.baseline_user_mean),
    ('baseline_item_mean', 'baseline_movie_mean', baselines.baseline_item_mean),
)

for model_key, label, run_baseline in baseline_runs:
    print('\nModelling using ' + label + ' ...')
    models[model_key] = run_baseline()['Rating']
    print('... done')


Modelling using baseline_global_mean ...
... done

Modelling using baseline_user_mean ...
... done

Modelling using baseline_movie_mean ...
... done


Run Matrix Factorization model trained using Stochastic Gradient Descent.

In [9]:
# Matrix factorization trained with SGD. The recorded output shows train()
# logging the training-set RMSE per iteration; only the 'Rating' entry of
# its result is kept for blending.
mf_sgd = MF_SGD(data=data)

print('\nModelling using MF_SGD ...')
sgd_result = mf_sgd.train()
models['mf_sgd'] = sgd_result['Rating']
print('... done')


Modelling using MF_SGD ...
Learning the matrix factorization using SGD ...
Iteration: 1, RMSE on training set: 1.0265579188730012
Iteration: 2, RMSE on training set: 1.0183056651348181
Iteration: 3, RMSE on training set: 1.0132537729687723
Iteration: 4, RMSE on training set: 1.0086805828473377
Iteration: 5, RMSE on training set: 1.0043132752072519
Iteration: 6, RMSE on training set: 1.000359783627168
Iteration: 7, RMSE on training set: 0.9968568126457051
Iteration: 8, RMSE on training set: 0.9946566136405358
Iteration: 9, RMSE on training set: 0.9933340780944223
Iteration: 10, RMSE on training set: 0.9917155302498281
Iteration: 11, RMSE on training set: 0.9904644182505771
Iteration: 12, RMSE on training set: 0.9893631717853739
Iteration: 13, RMSE on training set: 0.9885388029827337
Iteration: 14, RMSE on training set: 0.987459642154001
Iteration: 15, RMSE on training set: 0.9870409560211364
Iteration: 16, RMSE on training set: 0.986733771931125
Iteration: 17, RMSE on training set: 0.9

Run Matrix Factorization model trained using Biased Stochastic Gradient Descent.

In [11]:
# Matrix factorization trained with biased SGD. The recorded output shows
# train() logging the training-set RMSE per iteration; only the 'Rating'
# entry of its result is kept for blending.
mf_bsgd = MF_BSGD(data=data)

print('\nModelling using MF_BSGD ...')
bsgd_result = mf_bsgd.train()
models['mf_bsgd'] = bsgd_result['Rating']
print('... done')


Modelling using MF_BSGD ...
Learning the matrix factorization using BSGD ...
Iteration: 1, RMSE on training set: 1.0026322512864938
Iteration: 2, RMSE on training set: 0.9917765062433125
Iteration: 3, RMSE on training set: 0.9867211386835338
Iteration: 4, RMSE on training set: 0.9834624799152446
Iteration: 5, RMSE on training set: 0.9816633291634889
Iteration: 6, RMSE on training set: 0.9794501686133932
Iteration: 7, RMSE on training set: 0.9783328865668891
Iteration: 8, RMSE on training set: 0.9777147633288329
Iteration: 9, RMSE on training set: 0.9770741347246457
Iteration: 10, RMSE on training set: 0.9767442643434102
Iteration: 11, RMSE on training set: 0.9761237873706873
Iteration: 12, RMSE on training set: 0.9758548831059652
Iteration: 13, RMSE on training set: 0.9755479256706967
Iteration: 14, RMSE on training set: 0.9753860658310906
Iteration: 15, RMSE on training set: 0.9751767077630397
Iteration: 16, RMSE on training set: 0.9750044375839366
Iteration: 17, RMSE on training set

Run Matrix Factorization model trained using Alternating Least Squares.

In [12]:
# Matrix factorization trained with ALS. The recorded output shows train()
# logging the training-set RMSE per iteration; only the 'Rating' entry of
# its result is kept for blending.
mf_als = MF_ALS(data=data)

print('\nModelling using MF_ALS ...')
als_result = mf_als.train()
models['mf_als'] = als_result['Rating']
print('... done')


Modelling using MF_ALS ...
Learning the matrix factorization using ALS ...
Iteration: 1, RMSE on training set: 0.9880231575079736
Iteration: 2, RMSE on training set: 0.9729650626533639
Iteration: 3, RMSE on training set: 0.9595814978734437
Iteration: 4, RMSE on training set: 0.951076795621786
Iteration: 5, RMSE on training set: 0.9467530176454012
Iteration: 6, RMSE on training set: 0.9445485280769137
Iteration: 7, RMSE on training set: 0.9433081405808821
Iteration: 8, RMSE on training set: 0.9425163726458278
Iteration: 9, RMSE on training set: 0.9419548022044056
Iteration: 10, RMSE on training set: 0.9415278645671546
Iteration: 11, RMSE on training set: 0.9411937730322201
Iteration: 12, RMSE on training set: 0.9409337174138136
Iteration: 13, RMSE on training set: 0.9407363924115948
Iteration: 14, RMSE on training set: 0.9405915259899267
Iteration: 15, RMSE on training set: 0.9404886954350787
Iteration: 16, RMSE on training set: 0.940418104893749
The training process converged to a thr

Run Models from Surprise Library.

In [13]:
# Run the Surprise-based models and store the 'Rating' entry of each result
# under the matching key of `models`.
surprise_models = SurpriseModels(data=data)

# Both kNN-baseline runs use the same k; they differ only in the similarity
# measure and in whether similarities are user based or item based.
knn_k = 150
user_sim_options = {'name': 'cosine', 'user_based': True}
item_sim_options = {'name': 'pearson_baseline', 'user_based': False}

print('\nModelling using user based Surprise kNN Baseline ...')
user_knn_result = surprise_models.kNN_baseline(k=knn_k, sim_options=user_sim_options)
models['surprise_kNN_baseline_user'] = user_knn_result['Rating']
print('... done')

print('\nModelling using item based Surprise kNN Baseline ...')
item_knn_result = surprise_models.kNN_baseline(k=knn_k, sim_options=item_sim_options)
models['surprise_kNN_baseline_item'] = item_knn_result['Rating']
print('... done')

print('\nModelling using Surprise SlopeOne ...')
slope_one_result = surprise_models.slope_one()
models['surprise_slope_one'] = slope_one_result['Rating']
print('... done')

# Disabled: Surprise SVD and SVD++ (OPTIMAL_WEIGHTS has no entry for them).
# print('\nModelling using Surprise SVD ...')
# models['surprise_SVD'] = surprise_models.SVD()['Rating']
# print('... done')
#
# print('\nModelling using Surprise SVD++ ...')
# models['surprise_SVDpp'] = surprise_models.SVDpp()['Rating']
# print('... done')

print('\nModelling using Surprise Co-Clustering ...')
co_clustering_result = surprise_models.co_clustering()
models['surprise_co_clustering'] = co_clustering_result['Rating']
print('... done')


Modelling using user based Surprise kNN Baseline ...
Computing the cosine similarity matrix...
Done computing similarity matrix.
... done

Modelling using item based Surprise kNN Baseline ...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
... done

Modelling using Surprise SlopeOne ...
... done

Modelling using Surprise Co-Clustering ...
... done


Create the blended (combined) model.

In [14]:
# Combine the individual predictions using the precomputed weights.
# Blending takes its arguments positionally, in this order: the per-model
# predictions, the 'Rating' column of the test frame, and the weights.
test_ratings = data.test_df['Rating']
blending = Blending(models, test_ratings, OPTIMAL_WEIGHTS)

print('\nModelling using weighted averaging of the previous models ...')
mixed_model = blending.get_weighted_average()
print('... done')


Modelling using weighted averaging of the previous models ...
... done


Create the submission CSV file.

In [17]:
# Put the blended predictions into the 'Rating' column of the test frame.
# `submission_df` is an alias, not a copy: data.test_df itself is updated.
submission_df = data.test_df
submission_df['Rating'] = mixed_model

# Write the frame to PREDICTIONS_FILENAME via the project's helper.
print('\nCreating mixed_model.csv ...')
create_submission(submission_df, PREDICTIONS_FILENAME)
print('... mixed_model.csv created.')


Creating mixed_model.csv ...
... mixed_model.csv created.
