In [9]:
# Imports

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import SVD, SVDpp, NMF
from surprise import SlopeOne, CoClustering
from sklearn.model_selection import train_test_split


%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Project-local helpers; load_data is presumably defined there (it is not imported anywhere else).
# NOTE(review): the wildcard import hides which names enter the namespace -- consider an
# explicit import once the contents of helpersKNNmeans are confirmed.
from helpersKNNmeans import *

# File locations: the training ratings and the sample-submission file.
DATA_TRAIN_PATH = 'data/data_train.csv'
DATA_TEST_PATH = 'data/sampleSubmission.csv'

# Read both files into pandas data frames through the shared loader.
data_np = load_data(DATA_TRAIN_PATH)
samples = load_data(DATA_TEST_PATH)

In [11]:
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Wrap the user / movie / rating columns of the full training frame in a Surprise Dataset.
data = Dataset.load_from_df(
    data_np.loc[:, ['user_id', 'movie_id', 'rating']],
    reader,
)

In [12]:
# Hold out 20% of the ratings; the fixed random_state makes the split repeatable.
train_data_np, test_data_np = train_test_split(
    data_np.loc[:, ['user_id', 'movie_id', 'rating']],
    test_size=0.2,
    random_state=1,
)

# Surprise Dataset built from the training part of the split only.
train_data = Dataset.load_from_df(
    train_data_np.loc[:, ['user_id', 'movie_id', 'rating']],
    reader,
)

## Predictions

In [13]:
# User-based KNNBasic (MSD similarity, k = 253) fitted on the full rating set.
trainset = data.build_full_trainset()
sim_options = {'name': 'msd', 'user_based': True}
knnbasic_algo = KNNBasic(k=253, sim_options=sim_options)
knnbasic_algo.fit(trainset)

# One estimate per (user_id, movie_id) row of the submission frame.
# `.est` is the estimate field (index 3) of Surprise's Prediction tuple.
samples['knnbasic_user_rating'] = (
    samples[['user_id', 'movie_id']]
    .apply(lambda pair: knnbasic_algo.predict(pair['user_id'], pair['movie_id']).est, axis=1)
    .round(4)
)

# Write the frame, including the new prediction column, to disk.
PATHOUT = "knnbasic_user_submission.csv"
samples.to_csv(PATHOUT)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
# Item-based KNNBasic (MSD similarity, k = 23) fitted on the full rating set.
trainset = data.build_full_trainset()

# Start from the bare (user_id, movie_id) pairs so prediction columns written by
# earlier cells do not end up in this cell's CSV.
samples = samples[['user_id', 'movie_id']].copy()

sim_options = {'name': 'msd', 'user_based': False}
knnbasic_algo = KNNBasic(k=23, sim_options=sim_options).fit(trainset)

# One estimate per (user_id, movie_id) row; index 3 of the Prediction tuple is the estimate.
# Rounded to 4 decimals like every other prediction cell in this notebook
# (this cell previously used 5, which made its output precision inconsistent).
samples['knnbasic_item_rating'] = (
    samples[['user_id', 'movie_id']]
    .apply(lambda row: knnbasic_algo.predict(row['user_id'], row['movie_id'])[3], axis=1)
    .round(decimals=4)
)

PATHOUT = "knnbasic_item_submission.csv"
samples.to_csv(PATHOUT)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [6]:
# User-based KNNWithMeans (MSD similarity, k = 500) fitted on the full rating set.
# The trainset is built once; the original cell rebuilt it a second time for no effect.
trainset = data.build_full_trainset()

# Start from the bare (user_id, movie_id) pairs so prediction columns written by
# earlier cells do not end up in this cell's CSV.
samples = samples[['user_id', 'movie_id']].copy()

sim_options = {'name': 'msd', 'user_based': True}
knnmeans_algo = KNNWithMeans(k=500, sim_options=sim_options).fit(trainset)

# One estimate per (user_id, movie_id) row; index 3 of the Prediction tuple is the estimate.
samples['knnmeans_user_rating'] = (
    samples[['user_id', 'movie_id']]
    .apply(lambda row: knnmeans_algo.predict(row['user_id'], row['movie_id'])[3], axis=1)
    .round(decimals=4)
)

PATHOUT = "knnmeans_user_submission.csv"
samples.to_csv(PATHOUT)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [14]:
# Item-based KNNWithMeans (pearson_baseline similarity, k = 108) fitted on the full rating set.
trainset = data.build_full_trainset()

# Keep only the identifier columns so earlier prediction columns are not carried along.
samples = samples[['user_id', 'movie_id']].copy()

sim_options = {'name': 'pearson_baseline', 'user_based': False}
knnmeans_algo = KNNWithMeans(k=108, sim_options=sim_options)
knnmeans_algo.fit(trainset)

# One estimate per (user_id, movie_id) row; `.est` is index 3 of the Prediction tuple.
samples['knnmeans_item_rating'] = (
    samples[['user_id', 'movie_id']]
    .apply(lambda pair: knnmeans_algo.predict(pair['user_id'], pair['movie_id']).est, axis=1)
    .round(4)
)

PATHOUT = "knnmeans_item_submission.csv"
samples.to_csv(PATHOUT)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [8]:
# User-based KNNWithZScore (MSD similarity, k = 500) fitted on the full rating set.
trainset = data.build_full_trainset()

# Keep only the identifier columns so earlier prediction columns are not carried along.
samples = samples[['user_id', 'movie_id']].copy()

sim_options = {'name': 'msd', 'user_based': True}
knnz_algo = KNNWithZScore(k=500, sim_options=sim_options)
knnz_algo.fit(trainset)

# One estimate per (user_id, movie_id) row; `.est` is index 3 of the Prediction tuple.
samples['knnzscore_user_rating'] = (
    samples[['user_id', 'movie_id']]
    .apply(lambda pair: knnz_algo.predict(pair['user_id'], pair['movie_id']).est, axis=1)
    .round(4)
)

PATHOUT = "knnzscore_user_submission.csv"
samples.to_csv(PATHOUT)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [14]:
# Item-based KNNWithZScore (pearson_baseline similarity, k = 108) fitted on the full rating set.
trainset = data.build_full_trainset()

# Keep only the identifier columns so earlier prediction columns are not carried along.
samples = samples[['user_id', 'movie_id']].copy()

sim_options = {'name': 'pearson_baseline', 'user_based': False}
knnz_algo = KNNWithZScore(k=108, sim_options=sim_options)
knnz_algo.fit(trainset)

# One estimate per (user_id, movie_id) row; `.est` is index 3 of the Prediction tuple.
samples['knnzscore_item_rating'] = (
    samples[['user_id', 'movie_id']]
    .apply(lambda pair: knnz_algo.predict(pair['user_id'], pair['movie_id']).est, axis=1)
    .round(4)
)

PATHOUT = "knnzscore_item_submission.csv"
samples.to_csv(PATHOUT)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [5]:
# Validation run for KNNBasic: both variants are fitted on the training split only
# and asked to predict the held-out (user_id, movie_id) pairs.
trainset = train_data.build_full_trainset()

# Work on an explicit copy: test_data_np is the frame returned by train_test_split,
# and adding columns to it directly can trigger pandas' SettingWithCopyWarning.
# All existing columns are kept, so the CSV written below is unchanged.
test_data_np = test_data_np.copy()

# User-based variant (MSD similarity, k = 253).
sim_options = {'name': 'msd', 'user_based': True}
knnbasic_algo = KNNBasic(k=253, sim_options=sim_options).fit(trainset)

# Index 3 of the Prediction tuple is the estimated rating.
test_data_np['knnbasic_user_rating'] = (
    test_data_np[['user_id', 'movie_id']]
    .apply(lambda row: knnbasic_algo.predict(row['user_id'], row['movie_id'])[3], axis=1)
    .round(decimals=4)
)

# Item-based variant (MSD similarity, k = 23); the algorithm name is rebound.
sim_options = {'name': 'msd', 'user_based': False}
knnbasic_algo = KNNBasic(k=23, sim_options=sim_options).fit(trainset)

test_data_np['knnbasic_item_rating'] = (
    test_data_np[['user_id', 'movie_id']]
    .apply(lambda row: knnbasic_algo.predict(row['user_id'], row['movie_id'])[3], axis=1)
    .round(decimals=4)
)

PATHOUT = "knnbasic_test.csv"
test_data_np.to_csv(PATHOUT)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [6]:
# Validation run for KNNWithMeans: both variants are fitted on the training split only.
trainset = train_data.build_full_trainset()

# Reset to the identifier columns.
# NOTE(review): this selection discards every other column of test_data_np, including the
# held-out 'rating' when it is still present, so the CSV written below carries no
# ground-truth column -- confirm this is intended.
test_data_np = test_data_np[['user_id', 'movie_id']].copy()

# User-based variant (MSD similarity, k = 500).
sim_options = {'name': 'msd', 'user_based': True}
knnmeans_algo = KNNWithMeans(k=500, sim_options=sim_options)
knnmeans_algo.fit(trainset)

# `.est` is the estimate field (index 3) of Surprise's Prediction tuple.
test_data_np['knnmeans_user_rating'] = (
    test_data_np[['user_id', 'movie_id']]
    .apply(lambda pair: knnmeans_algo.predict(pair['user_id'], pair['movie_id']).est, axis=1)
    .round(4)
)

# Item-based variant (pearson_baseline similarity, k = 108); the algorithm name is rebound.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
knnmeans_algo = KNNWithMeans(k=108, sim_options=sim_options)
knnmeans_algo.fit(trainset)

test_data_np['knnmeans_item_rating'] = (
    test_data_np[['user_id', 'movie_id']]
    .apply(lambda pair: knnmeans_algo.predict(pair['user_id'], pair['movie_id']).est, axis=1)
    .round(4)
)

PATHOUT = "knnmeans_test.csv"
test_data_np.to_csv(PATHOUT)

Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [7]:
# Validation run for KNNWithZScore: both variants are fitted on the training split only.
trainset = train_data.build_full_trainset()

# Reset to the identifier columns.
# NOTE(review): this selection discards every other column of test_data_np, including the
# held-out 'rating' when it is still present, so the CSV written below carries no
# ground-truth column -- confirm this is intended.
test_data_np = test_data_np[['user_id', 'movie_id']].copy()

# User-based variant (MSD similarity, k = 500).
sim_options = {'name': 'msd', 'user_based': True}
knnz_algo = KNNWithZScore(k=500, sim_options=sim_options)
knnz_algo.fit(trainset)

# `.est` is the estimate field (index 3) of Surprise's Prediction tuple.
test_data_np['knnzscore_user_rating'] = (
    test_data_np[['user_id', 'movie_id']]
    .apply(lambda pair: knnz_algo.predict(pair['user_id'], pair['movie_id']).est, axis=1)
    .round(4)
)

# Item-based variant (pearson_baseline similarity, k = 108); the algorithm name is rebound.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
knnz_algo = KNNWithZScore(k=108, sim_options=sim_options)
knnz_algo.fit(trainset)

test_data_np['knnzscore_item_rating'] = (
    test_data_np[['user_id', 'movie_id']]
    .apply(lambda pair: knnz_algo.predict(pair['user_id'], pair['movie_id']).est, axis=1)
    .round(4)
)

PATHOUT = "knnzscore_test.csv"
test_data_np.to_csv(PATHOUT)

Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
