In [5]:
# Imports
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection.search import GridSearchCV
from surprise import CoClustering
from surprise.prediction_algorithms import predictions

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Import only the helpers this notebook actually calls, so it is clear where
# `load_data` and `create_csv` come from (a star import hides that and can
# silently shadow other names in the kernel).
from helpersCoClustering import load_data, create_csv

# Training ratings. The rest of the notebook indexes this object by the
# columns 'user_id', 'movie_id' and 'rating', so `load_data` is expected to
# return a pandas DataFrame with those columns.
DATA_TRAIN_PATH = 'data/data_train.csv'
data_np = load_data(DATA_TRAIN_PATH)

# Sample submission, loaded with the same helper. The submission cells read
# its 'user_id' / 'movie_id' columns and overwrite its 'rating' column.
DATA_TEST_PATH = 'data/sampleSubmission.csv'
samples = load_data(DATA_TEST_PATH)

In [3]:
# Tell Surprise that ratings live on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))

# Build the Surprise dataset from the user, item and rating columns,
# passed in that order.
data = Dataset.load_from_df(
    data_np.loc[:, ['user_id', 'movie_id', 'rating']],
    reader=reader,
)

## Basic Co-Clustering
ID : 25590
- RMSE : 1.050
- Secondary: 0.118

In [4]:
# Use every rating in `data` for training (no hold-out split here).
trainset = data.build_full_trainset()

# Baseline model: co-clustering with the library's default settings.
# NOTE(review): no random_state is passed, so a refit may not reproduce
# earlier scores exactly — confirm whether a fixed seed is wanted.
basiccocltr_algo = CoClustering()
basiccocltr_algo = basiccocltr_algo.fit(trainset)

## Parameters test
ID : 25685
- RMSE : 1.042
- Secondary: 0.131

In [10]:
# Train on the complete dataset again.
trainset = data.build_full_trainset()

# Co-clustering with hand-picked cluster counts:
# n_cltr_u=2 (user clusters) and n_cltr_i=19 (item clusters).
cocltr_algo = CoClustering(n_cltr_u=2, n_cltr_i=19)
cocltr_algo = cocltr_algo.fit(trainset)

## With data preprocessing
ID : 25761
- RMSE : 1.042
- Secondary: 0.131

In [14]:
# Statistics gathered per group: number of ratings and mean rating.
f = ['count', 'mean']

# --- Movies -----------------------------------------------------------
# Per-movie rating statistics, indexed by integer movie id.
df_movie_summary = (
    data_np
    .groupby('movie_id')['rating']
    .agg(f)
)
df_movie_summary.index = df_movie_summary.index.map(int)

# Threshold: 90th percentile of the per-movie rating counts, rounded.
movie_benchmark = round(df_movie_summary['count'].quantile(0.90), 0)

# Movies with fewer ratings than the threshold are marked for removal.
drop_movie_list = df_movie_summary.loc[
    df_movie_summary['count'] < movie_benchmark
].index


# --- Users ------------------------------------------------------------
# Same statistics and threshold, computed per user.
df_cust_summary = (
    data_np
    .groupby('user_id')['rating']
    .agg(f)
)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.90), 0)

# NOTE(review): drop_cust_list is computed but never applied below; only
# the movie filter is used. Confirm whether that is intentional.
drop_cust_list = df_cust_summary.loc[
    df_cust_summary['count'] < cust_benchmark
].index

# Keep only the ratings of movies that are not in the drop list.
# NOTE(review): this overwrites data_np, so re-running the cell filters
# already-filtered data; reload the training data before re-running.
data_np = data_np.loc[~data_np['movie_id'].isin(drop_movie_list)]

In [15]:
# Rebuild the Surprise dataset from `data_np` as it currently stands
# (the filtered frame when the notebook is run top to bottom).
reader = Reader(rating_scale=(1, 5))
rating_columns = ['user_id', 'movie_id', 'rating']
data = Dataset.load_from_df(data_np[rating_columns], reader=reader)

In [22]:
# Full training set built from the current `data`.
trainset = data.build_full_trainset()

# Co-clustering with 2 user clusters and 17 item clusters, the same values
# the grid-search cell evaluates.
cluster_counts = {'n_cltr_u': 2, 'n_cltr_i': 17}
cocltrpp_algo = CoClustering(**cluster_counts).fit(trainset)

## Grid Search

In [19]:
# Candidate values per hyper-parameter. Each list currently holds a single
# value, so the "grid" has exactly one combination.
param_grid = dict(
    n_cltr_u=[2],
    n_cltr_i=[17],
)

# Cross-validated search over `param_grid`: RMSE as the measure, 3 folds,
# 5 parallel jobs.
gs = GridSearchCV(
    CoClustering,
    param_grid,
    measures=['rmse'],
    cv=3,
    n_jobs=5,
)

In [20]:
# Run the grid search on `data`: every parameter combination configured on
# `gs` is cross-validated. The results are read back from `gs` afterwards.
gs.fit(data)

In [21]:
# Report the search outcome for the RMSE measure, one value per line:
# first the best score, then the parameter combination that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

1.0063167085682592
{'n_cltr_u': 2, 'n_cltr_i': 17}


## Create submission

In [6]:
# Predict a rating for every (user_id, movie_id) pair in the sample
# submission with the baseline model. `.est` is the estimated rating
# (field 3 of Surprise's Prediction tuple).
samples['rating'] = samples[['user_id', 'movie_id']].apply(
    lambda pair: basiccocltr_algo.predict(pair['user_id'], pair['movie_id']).est,
    axis=1,
)

# Round each estimate with Python's built-in round().
samples['rating'] = samples['rating'].map(round)

# Write the predictions with the project helper.
PATHOUT = "Predictions_basiccocltr.csv"
create_csv(PATHOUT, samples)

In [11]:
# Score every (user_id, movie_id) pair of the sample submission with the
# tuned co-clustering model, keeping only the estimated rating of each
# prediction (`.est`, field 3 of the Prediction tuple).
samples['rating'] = samples[['user_id', 'movie_id']].apply(
    lambda pair: cocltr_algo.predict(pair['user_id'], pair['movie_id']).est,
    axis=1,
)

# Estimates are floats; round them with the built-in round().
samples['rating'] = samples['rating'].map(round)

# Save this model's predictions to their own file.
PATHOUT = "Predictions_cocltr.csv"
create_csv(PATHOUT, samples)

In [23]:
# Same procedure with the model trained on the filtered data: one
# prediction per (user_id, movie_id) row, taking the estimate (`.est`,
# field 3 of the Prediction tuple).
samples['rating'] = samples[['user_id', 'movie_id']].apply(
    lambda pair: cocltrpp_algo.predict(pair['user_id'], pair['movie_id']).est,
    axis=1,
)

# Round the float estimates with the built-in round().
samples['rating'] = samples['rating'].map(round)

# Export under a name that identifies the preprocessing variant.
PATHOUT = "Predictions_cocltrpp.csv"
create_csv(PATHOUT, samples)