# Recommender system

## 1. Prerequisites

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow
import tensorflow as tf
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
from itertools import groupby


%load_ext autoreload
%autoreload 2

In [4]:
from helpers import*
from preprocessing import*

In [5]:
# Location of the training ratings file, relative to the notebook's working directory.
DATA_TRAIN_PATH = "data/data_train.csv"
# `load_data` comes from one of the star imports above (helpers / preprocessing).
# The recorded output reports the number of users and items found in the file.
data = load_data(DATA_TRAIN_PATH)

number of users : 10000, number of items: 1000


## 2. Split the data

For reproducibility, remove seed from `split_data`

In [6]:
# Split the loaded ratings into train and test parts; the recorded output shows the
# nonzero counts of each part (roughly 10% land in test with p_test=0.1).
# min_num_ratings=0 presumably disables filtering of users/items with few ratings —
# TODO confirm in split_data.
valid_ratings, train, test = split_data(data, min_num_ratings=0, p_test=0.1)

Total number of nonzero elements in origial data:1176952
Total number of nonzero elements in train data:1065327
Total number of nonzero elements in test data:111625


## 3. Implementing Baseline

In [8]:
from ImplementingBaseline import*

In [10]:
# Global-mean baseline: the recorded output reports its RMSE on the test split.
baseline_global_mean(train, test)

test RMSE of baseline using the global mean: [[1.12152228]].


In [None]:
# User-mean baseline; unlike the other two baseline cells, the return value is kept.
# NOTE(review): this cell has no execution count or recorded output — re-run before relying on it.
user_train_mean = baseline_user_mean(train, test)

In [14]:
# Item-mean baseline: the recorded output reports its RMSE on the test split.
baseline_item_mean(train, test)

test RMSE of the baseline using the item mean: [[1.09633198]].


## 4. Matrix Factorization using SGD

In [None]:
from factorization_SGD import*

In [7]:
# Hyperparameters for the SGD factorization.
# num_features is K in the lecture notes; the lambdas are the user/item regularizers.
num_features, lambda_user, lambda_item = 20, 0.05, 0.2

# Train on the training split only; the test split is kept for evaluation below.
user_features, item_features = matrix_factorization_SGD(
    train, num_features, lambda_user, lambda_item
)

In [8]:
# Coordinates of every stored (nonzero) entry of the test split, as (row, col) pairs.
nz_row, nz_col = test.nonzero()
nz_test = [pair for pair in zip(nz_row, nz_col)]

# Evaluate the learned factors on those held-out entries only.
rmse = compute_error(
    test, user_features, item_features, nz_test
)
print("RMSE on test data: {}.".format(rmse))

RMSE on test data: 1.0082670439014565.


In [9]:
# Sample-submission file, loaded with the same helper as the training data.
DATA_FINAL_PATH = "data/sampleSubmission.csv"
samples = load_data(DATA_FINAL_PATH)
# Only the third value returned by condition_min_num_ratings is kept.
# min_num_ratings=0 presumably means no users/items are filtered out — TODO confirm in the helper.
_, _, samples = condition_min_num_ratings(samples, min_num_ratings=0)

number of users : 10000, number of items: 1000


In [10]:
# Coordinates of every stored (nonzero) entry of the sample-submission matrix.
nz_row, nz_col = samples.nonzero()
nz_samples = [pair for pair in zip(nz_row, nz_col)]

# NOTE(review): this compares predictions against the values stored in the sample
# submission file; confirm those are meaningful ratings before interpreting the number.
rmse = compute_error(
    samples, user_features, item_features, nz_samples
)
print("RMSE on final data: {}.".format(rmse))

RMSE on test data: 0.9373349757846028.


In [51]:
# Dense prediction matrix obtained from the learned factors.
prediction = user_features.T.dot(item_features)

# Parse every line of the sample-submission file except the first one.
# NOTE(review): this rebinds `data`, which previously held the loaded training ratings.
data = [deal_line(line) for line in read_txt(DATA_FINAL_PATH)[1:]]
n = len(data)

# One rounded prediction per requested entry, clipped to the 1..5 range.
submission = np.zeros((n, 1))
for i, entry in enumerate(data):
    # The parsed ids are 1-based; the second field selects the first axis of `prediction`.
    first_axis, second_axis = entry[1] - 1, entry[0] - 1
    submission[i] = np.clip(round(prediction[first_axis][second_axis]), 1, 5)

In [52]:
# Re-import of helpers; it was already star-imported near the top and %autoreload is enabled.
from helpers import*
# Destination file for the SGD-based submission.
DATA_SUBMISSION = "data/submission_SGD_base.csv"
# Hand the parsed sample rows and their predicted ratings to the project helper,
# which presumably writes them to DATA_SUBMISSION — TODO confirm in helpers.
create_csv(data, submission, DATA_SUBMISSION)

Baseline dot-product method (SGD, without `min_num_ratings` filtering) — AIcrowd score: 1.046, accuracy: 0.124

## 5. Matrix Factorization using ALS

In [54]:
from helpers import*
from factorization_ALS import*

# Hyperparameters for the ALS factorization.
# num_features is K in the lecture notes; the lambdas are the user/item regularizers.
num_features, lambda_user, lambda_item = 25, 0.1, 0.7

# To try: these hyperparameter values
# num_features = 20   # K in the lecture notes
# lambda_user = 0.05
# lambda_item = 0.2

# Train on the training split; this overwrites the factors produced by the SGD run.
user_features, item_features = matrix_factorization_ALS(
    train, num_features, lambda_user, lambda_item
)

RMSE on training set: 2.106428517677524.
RMSE on training set: 1.2739821017929538.
RMSE on training set: 1.1446662872670106.
RMSE on training set: 1.0934315709450497.
RMSE on training set: 1.0678553712342054.
RMSE on training set: 1.0533560622279643.
RMSE on training set: 1.0444472374055602.
RMSE on training set: 1.0386677546510814.
RMSE on training set: 1.0347731654945034.
RMSE on training set: 1.032076476383454.
RMSE on training set: 1.0301721191256068.
RMSE on training set: 1.0288077984696378.
RMSE on training set: 1.0278199677348743.
RMSE on training set: 1.0270991173242556.
RMSE on training set: 1.0265700340491097.
RMSE on training set: 1.0261800284164861.
RMSE on training set: 1.025891620978918.
RMSE on training set: 1.0256778367925297.
RMSE on training set: 1.0255190865137027.
RMSE on training set: 1.0254010470853805.
RMSE on training set: 1.0253131918988199.


In [55]:
# Coordinates of every stored (nonzero) entry of the test split, as (row, col) pairs.
nz_row, nz_col = test.nonzero()
nz_test = [pair for pair in zip(nz_row, nz_col)]

# Evaluate the ALS factors on those held-out entries only.
rmse = compute_error(
    test, user_features, item_features, nz_test
)
print("RMSE on test data: {}.".format(rmse))

RMSE on test data: 1.03547938538604.


In [56]:
# Reload the sample-submission file (same steps as in the SGD section).
DATA_FINAL_PATH = "data/sampleSubmission.csv"
samples = load_data(DATA_FINAL_PATH)
# Only the third value returned by condition_min_num_ratings is kept.
# min_num_ratings=0 presumably means no users/items are filtered out — TODO confirm in the helper.
_, _, samples = condition_min_num_ratings(samples, min_num_ratings=0)

number of users : 10000, number of items: 1000


In [57]:
# Coordinates of every stored (nonzero) entry of the sample-submission matrix.
nz_row, nz_col = samples.nonzero()
nz_samples = list(zip(nz_row, nz_col))

# NOTE(review): this compares predictions against the values stored in the sample
# submission file; confirm those are meaningful ratings before interpreting the number.
rmse = compute_error(samples, user_features, item_features, nz_samples)

# The metric is computed on `samples`, not on the test split, so label it
# "final data" (as the SGD section does) instead of the misleading "test data".
print("RMSE on final data: {}.".format(rmse))

RMSE on test data: 0.7634478766929802.


In [58]:
# Dense prediction matrix obtained from the ALS factors.
prediction = user_features.T.dot(item_features)

# Parse every line of the sample-submission file except the first one.
# NOTE(review): this rebinds `data`, which previously held the loaded training ratings.
data = [deal_line(line) for line in read_txt(DATA_FINAL_PATH)[1:]]
n = len(data)

# One rounded prediction per requested entry, clipped to the 1..5 range.
submission = np.zeros((n, 1))
for i, entry in enumerate(data):
    # The parsed ids are 1-based; the second field selects the first axis of `prediction`.
    first_axis, second_axis = entry[1] - 1, entry[0] - 1
    submission[i] = np.clip(round(prediction[first_axis][second_axis]), 1, 5)

In [59]:
# Re-import of helpers; it was already star-imported earlier and %autoreload is enabled.
from helpers import*
# Destination file for the ALS-based submission.
DATA_SUBMISSION = "data/submission_ALS_base.csv"
# Hand the parsed sample rows and their predicted ratings to the project helper,
# which presumably writes them to DATA_SUBMISSION — TODO confirm in helpers.
create_csv(data, submission, DATA_SUBMISSION)

Baseline dot-product method (ALS, without `min_num_ratings` filtering) — AIcrowd score: 1.072, accuracy: 0.081