# Surprise Model

In [1]:
import surprise

In [2]:
import os
# Import numpy
import numpy as np

# Import pandas for data handling
import pandas as pd
# Import plotting libraries
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
# The seaborn style sheets were renamed with a 'seaborn-v0_8' prefix in
# matplotlib 3.6 and the old names were removed in 3.8. Pick whichever name
# this installation provides so the cell runs on both old and new versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn') # pretty matplotlib plots

import seaborn as sns
sns.set('notebook', font_scale=1.25, style='whitegrid')

In [3]:
# Parser settings shared by every ratings file loaded below:
# comma-separated "user,item,rating" rows, ratings on a 1-5 scale, and the
# first line of the file skipped (presumably the CSV header row).
reader = surprise.Reader(
    line_format='user item rating',
    sep=',',
    rating_scale=(1, 5),
    skip_lines=1,
)

In [4]:
# Development ratings, parsed straight from the CSV with the shared reader.
dev_set = surprise.Dataset.load_from_file(
    os.path.join('data_movie_lens_100k/', 'ratings_all_development_set.csv'),
    reader=reader,
)

In [5]:
# Leaderboard rows whose ratings must be predicted.
# Every column is read as a string -- presumably so the user/item ids match
# the raw string ids Surprise reads from the development CSV (TODO confirm).
test_data = pd.read_csv(
    'data_movie_lens_100k/ratings_masked_leaderboard_set.csv',
    dtype=str,
)
# Wrap the same frame as a Surprise dataset, using the shared reader.
test_set = surprise.Dataset.load_from_df(test_data, reader=reader)

In [6]:
# Leaderboard rows converted to a Surprise testset.
# Only referenced by the commented-out best_model.test(...) calls further
# down; the predictions that are actually saved come from test_data directly.
test_set_for_test = test_set.build_full_trainset().build_testset()

In [7]:
# Trainset built from the whole development set (nothing is held out here).
dev_set_for_fit = dev_set.build_full_trainset()

In [8]:
# The same development ratings, re-expressed as a testset so a fitted model
# can be scored on them. This is the training data, not a held-out split.
dev_set_for_predict = dev_set_for_fit.build_testset()

In [9]:
# Sanity check: global mean rating of the development trainset.
dev_set_for_fit.global_mean

3.529480398257623

In [10]:
# Sanity check: the trainset carries the (1, 5) scale configured on the reader.
dev_set_for_fit.rating_scale

(1, 5)

In [11]:
# Baseline: fit Surprise's SVD model, which is like our M3.
# The only difference from our M3 is that the regularization is applied
# slightly differently (to the per-item and per-user bias parameters).
# random_state is fixed so this baseline fit is repeatable.
model = surprise.SVD(
    n_factors=50,
    n_epochs=10,
    lr_all=0.01,
    random_state=0,
)
model.fit(dev_set_for_fit)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1da58769bb0>

In [12]:
# Score the baseline on dev_set_for_predict, which was built from the very
# trainset the model was fit on -- these are in-sample predictions.
predictions = model.test(dev_set_for_predict)

In [13]:
# Mean absolute error of the baseline predictions.
# NOTE: `predictions` were made on the ratings used for fitting, so this is a
# training error and will look better than a cross-validated MAE.
surprise.accuracy.mae(predictions)

MAE:  0.6134


0.6133859507648799

# GridSearched Surprise

In [14]:
# Hyperparameter grid for SVD: 3 * 2 * 4 * 3 = 72 combinations.
param_grid = dict(
    n_factors=[40, 50, 60],
    lr_all=[0.01, 0.05],
    n_epochs=[10, 20, 40, 80],
    reg_all=[0.05, 0.1, 1],
)

In [15]:
# 5-fold cross-validated grid search over param_grid, scored by MAE and MSE.
# An explicit KFold with a fixed random_state replaces the bare cv=5, whose
# shuffled folds are unseeded, so the fold assignment is the same on every run.
# NOTE: the SVD fits inside the search are still randomly initialised unless
# 'random_state' is also added to the grid, so scores can vary slightly.
gs = surprise.model_selection.GridSearchCV(
    surprise.SVD,
    param_grid,
    measures=['mae', 'mse'],
    cv=surprise.model_selection.KFold(n_splits=5, random_state=0, shuffle=True),
)

In [16]:
# Run the search on the development set. With the 72-combination grid and
# 5 folds this is 360 SVD fits (up to 80 epochs each), so expect a long run.
gs.fit(dev_set)

In [19]:
# Best cross-validated score per measure, one per line: MAE first, then MSE.
print(gs.best_score['mae'], gs.best_score['mse'], sep='\n')

0.7236893779785862
0.8395848376774502


In [20]:
# Parameter combination that achieved each best score: MAE first, then MSE.
print(gs.best_params['mae'], gs.best_params['mse'], sep='\n')

{'n_factors': 60, 'lr_all': 0.01, 'n_epochs': 40, 'reg_all': 0.1}
{'n_factors': 60, 'lr_all': 0.01, 'n_epochs': 40, 'reg_all': 0.1}


In [19]:
# Rebuild SVD with the hyperparameters that gave the best cross-validated MAE.
# random_state is fixed (the same seed the baseline SVD uses) so that the
# model's random initialisation, and hence the predictions written to the
# leaderboard file, are repeatable from a fresh kernel.
best_model = surprise.SVD(
    n_factors=gs.best_params['mae']['n_factors'],
    lr_all=gs.best_params['mae']['lr_all'],
    n_epochs=gs.best_params['mae']['n_epochs'],
    reg_all=gs.best_params['mae']['reg_all'],
    random_state=0)

In [20]:
# Fit the tuned model on the full development trainset (no hold-out) before
# it is used to predict the leaderboard rows.
best_model.fit(dev_set_for_fit)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c2767e87f0>

In [41]:
#outs = best_model.test(test_set_for_test)
#print(best_model.test(test_set_for_test))

In [42]:
#print(test_data)

In [36]:
# Predict one rating per leaderboard row, in the row order of test_data.
# The result is sized from test_data instead of a hard-coded 10000, so a file
# with a different number of rows can neither raise IndexError nor leave
# unfilled trailing zeros in the output.
# .est is the estimated rating on the Prediction returned by predict().
outs2 = np.array(
    [best_model.predict(user_id, item_id).est
     for user_id, item_id in zip(test_data['user_id'], test_data['item_id'])],
    dtype=float)

In [40]:
# Write the leaderboard predictions, one value per line in np.savetxt's
# default number format, to the working directory (overwrites any old file).
np.savetxt('predicted_ratings_leaderboard.txt', outs2)

In [81]:
# DON'T RUN: disabled alternative that depends on `outs`, which is only assigned in a commented-out cell.
#print(len([pred.est for pred in outs]))
#pred_ratings = np.asarray([pred.est for pred in outs])
#pred_ratings.shape

10000


(10000,)

In [82]:
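# DON'T RUN: would overwrite predicted_ratings_leaderboard.txt using `pred_ratings`, which is only defined in commented-out code.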
#np.savetxt('predicted_ratings_leaderboard.txt', pred_ratings)