In [12]:
# Imports
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import SVD, SVDpp, NMF
from surprise import SlopeOne, CoClustering
from surprise.prediction_algorithms import predictions

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from helpersKNNmeans import*

# Locations of the training ratings and of the sample-submission template.
DATA_TRAIN_PATH, DATA_TEST_PATH = 'data/data_train.csv', 'data/sampleSubmission.csv'

# Read both files with the project helper `load_data` (training file first).
# NOTE(review): later cells select the columns user_id / movie_id / rating from
# these objects, so they are presumably pandas DataFrames -- confirm in helpersKNNmeans.
data_np, samples = (load_data(path) for path in (DATA_TRAIN_PATH, DATA_TEST_PATH))

In [14]:
# Build the Surprise dataset from the training frame.
# The reader declares that ratings lie on a 1-to-5 scale; the three columns are
# passed in the order user, item, rating.
rating_columns = ['user_id', 'movie_id', 'rating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data_np[rating_columns], reader=reader)

## Cross-validation between various models

In [None]:
# 5-fold cross-validation of each KNN variant with its default hyper-parameters.
# The folds are evaluated by 5 parallel jobs and per-fold printing is disabled.
cv_options = dict(cv=5, n_jobs=5, verbose=False)

knnbasic_cv = cross_validate(KNNBasic(), data, **cv_options)
knnmeans_cv = cross_validate(KNNWithMeans(), data, **cv_options)
knnz_cv = cross_validate(KNNWithZScore(), data, **cv_options)

In [None]:
# Text table: one row per algorithm with its mean test RMSE and MAE over the
# cross-validation folds, rounded to 4 decimals.
print('Algorithm\t RMSE\t\t MAE')
print()

cv_rows = [
    ('KNN Basic', knnbasic_cv),
    ('KNN Means', knnmeans_cv),
    ('KNN ZScore', knnz_cv),
]
for algo_label, cv_result in cv_rows:
    mean_rmse = round(cv_result['test_rmse'].mean(), 4)
    mean_mae = round(cv_result['test_mae'].mean(), 4)
    print(algo_label, '\t', mean_rmse, '\t', mean_mae)

In [None]:
# Side-by-side comparison of the three KNN variants: mean cross-validated RMSE
# in the left panel, mean cross-validated MAE in the right panel.
x_algo = ['KNN Basic', 'KNN Means', 'KNN ZScore']
all_algos_cv = [knnbasic_cv, knnmeans_cv, knnz_cv]

# Mean score over the folds for every algorithm, rounded to 4 decimals.
rmse_cv, mae_cv = (
    [round(res[score_key].mean(), 4) for res in all_algos_cv]
    for score_key in ('test_rmse', 'test_mae')
)

fig, axes = plt.subplots(1, 2, figsize=(20, 5))

# (axes, title, legend label, y-axis label, values, line colour) for each panel.
panels = [
    (axes[0], 'Comparison of Algorithms on RMSE', 'RMSE', 'RMSE Value', rmse_cv, 'darkgreen'),
    (axes[1], 'Comparison of Algorithms on MAE', 'MAE', 'MAE Value', mae_cv, 'navy'),
]
for ax, title, legend_label, y_label, scores, colour in panels:
    ax.set_title(title, loc='center', fontsize=15)
    ax.plot(x_algo, scores, label=legend_label, color=colour, marker='o')
    ax.set_xlabel('Algorithms', fontsize=15)
    ax.set_ylabel(y_label, fontsize=15)
    ax.legend()
    ax.grid(ls='dashed')

# Save to disk before showing the figure.
fig.savefig("ComparisonAlgo.png")
plt.show()

## Grid search over the model parameters

In [15]:
# Candidate values of k (number of closest neighbours) explored by every grid search below.
param_grid = dict(k=[20, 30, 40, 50, 60])

In [None]:
# Grid search over k for KNNBasic: 5-fold cross-validation, RMSE and MAE
# recorded for every candidate, run with 5 parallel jobs.
knnbasic_gs = GridSearchCV(
    algo_class=KNNBasic,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=5,
    n_jobs=5,
)
knnbasic_gs.fit(data)

In [None]:
# Grid search over k for KNNWithMeans: 5-fold cross-validation, RMSE and MAE
# recorded for every candidate, run with 5 parallel jobs.
knnmeans_gs = GridSearchCV(
    algo_class=KNNWithMeans,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=5,
    n_jobs=5,
)
knnmeans_gs.fit(data)

In [None]:
# Grid search over k for KNNWithZScore: 5-fold cross-validation, RMSE and MAE
# recorded for every candidate, run with 5 parallel jobs.
knnz_gs = GridSearchCV(
    algo_class=KNNWithZScore,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=5,
    n_jobs=5,
)
knnz_gs.fit(data)

In [None]:
# Inputs for the "k vs error" plots.
#
# x must hold exactly the k values that were searched: the grid-search results
# contain one mean score per candidate in param_grid['k'], so a separately
# hard-coded list of a different length makes plt.plot(x, y) fail with a
# shape-mismatch error. Deriving x from param_grid keeps the two in sync.
x = list(param_grid['k'])

# Mean test RMSE / MAE over the folds, one value per searched k.
y1 = knnbasic_gs.cv_results['mean_test_rmse']
y2 = knnbasic_gs.cv_results['mean_test_mae']

y3 = knnmeans_gs.cv_results['mean_test_rmse']
y4 = knnmeans_gs.cv_results['mean_test_mae']

y5 = knnz_gs.cv_results['mean_test_rmse']
y6 = knnz_gs.cv_results['mean_test_mae']

In [None]:
# Two-panel figure showing how the error of each KNN variant changes with the
# number of neighbours k: RMSE on the left, MAE on the right.
fig, (ax_rmse, ax_mae) = plt.subplots(1, 2, figsize=(18, 5))

# One fixed colour per algorithm, shared by both panels, so that a given colour
# always refers to the same algorithm when the two panels are read together.
algo_colors = {
    'KNNBasic': 'lightcoral',
    'KNNWithMeans': 'indianred',
    'KNNWithZScore': 'darkred',
}

# Show the variation of the RMSE with the number of closest neighbors
rmse_curves = {'KNNBasic': y1, 'KNNWithMeans': y3, 'KNNWithZScore': y5}
ax_rmse.set_title('K Neighbors vs RMSE', loc='center', fontsize=15)
for algo_name, scores in rmse_curves.items():
    ax_rmse.plot(x, scores, label=algo_name, color=algo_colors[algo_name], marker='o')
ax_rmse.set_xlabel('K Neighbor', fontsize=15)
ax_rmse.set_ylabel('RMSE Value', fontsize=15)
ax_rmse.legend()
ax_rmse.grid(ls='dotted')

# Show the variation of the MAE with the number of closest neighbors
mae_curves = {'KNNBasic': y2, 'KNNWithMeans': y4, 'KNNWithZScore': y6}
ax_mae.set_title('K Neighbors vs MAE', loc='center', fontsize=15)
for algo_name, scores in mae_curves.items():
    ax_mae.plot(x, scores, label=algo_name, color=algo_colors[algo_name], marker='o')
ax_mae.set_xlabel('K Neighbor', fontsize=15)
ax_mae.set_ylabel('MAE Value', fontsize=15)
ax_mae.legend()
ax_mae.grid(ls='dotted')

# Save to disk before showing the figure.
fig.savefig("KNeighbor.png")
plt.show()

## Predictions

In [None]:
# Train KNNBasic on the full training set and write its predictions for the
# submission samples to a CSV file.
trainset = data.build_full_trainset()

# Use the k that achieved the lowest mean RMSE in the KNNBasic grid search
# (the original call left k empty, which is a syntax error).
knnbasic_k = knnbasic_gs.best_params['rmse']['k']
knnbasic_algo = KNNBasic(k=knnbasic_k).fit(trainset)

# Estimated rating for every (user, movie) pair of the submission template;
# `.est` is the estimate field of the Prediction returned by `predict`.
samples['rating'] = samples[['user_id', 'movie_id']].apply(
    lambda row: knnbasic_algo.predict(row['user_id'], row['movie_id']).est,
    axis=1,
)

# Round the estimates to whole ratings.
samples['rating'] = samples['rating'].apply(round)

PATHOUT = "Predictions_knnbasic.csv"
create_csv(PATHOUT, samples)

In [None]:
# Train KNNWithMeans on the full training set (`trainset`, built in the
# KNNBasic prediction cell) and write its predictions to a CSV file.

# Use the k that achieved the lowest mean RMSE in the KNNWithMeans grid search
# (the original call left k empty, which is a syntax error).
knnmeans_k = knnmeans_gs.best_params['rmse']['k']
knnmeans_algo = KNNWithMeans(k=knnmeans_k).fit(trainset)

# Estimated rating for every (user, movie) pair of the submission template;
# `.est` is the estimate field of the Prediction returned by `predict`.
samples['rating'] = samples[['user_id', 'movie_id']].apply(
    lambda row: knnmeans_algo.predict(row['user_id'], row['movie_id']).est,
    axis=1,
)

# Round the estimates to whole ratings.
samples['rating'] = samples['rating'].apply(round)

PATHOUT = "Predictions_knnmeans.csv"
create_csv(PATHOUT, samples)

In [None]:
# Train KNNWithZScore on the full training set (`trainset`, built in the
# KNNBasic prediction cell) and write its predictions to a CSV file.

# Use the k that achieved the lowest mean RMSE in the KNNWithZScore grid search
# (the original call left k empty, which is a syntax error).
knnz_k = knnz_gs.best_params['rmse']['k']
knnz_algo = KNNWithZScore(k=knnz_k).fit(trainset)

# Estimated rating for every (user, movie) pair of the submission template;
# `.est` is the estimate field of the Prediction returned by `predict`.
samples['rating'] = samples[['user_id', 'movie_id']].apply(
    lambda row: knnz_algo.predict(row['user_id'], row['movie_id']).est,
    axis=1,
)

# Round the estimates to whole ratings.
samples['rating'] = samples['rating'].apply(round)

PATHOUT = "Predictions_knnz.csv"
create_csv(PATHOUT, samples)