# Recommender system

## 1. Prerequisites

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow
import tensorflow as tf
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
from itertools import groupby


%load_ext autoreload
%autoreload 2

In [2]:
from helpers import*
from preprocessing import*

In [3]:
# Path of the training ratings file, relative to the notebook's working directory.
DATA_TRAIN_PATH = "data/data_train.csv"
# Load the ratings; load_data also prints the user/item counts (see the cell output).
# NOTE(review): the name `data` is rebound to the parsed submission rows in the
# submission cells further down, so re-run this cell before re-running the split.
data = load_data(DATA_TRAIN_PATH)

number of users : 10000, number of items: 1000


## 2. Split the data

For reproducibility, remove seed from split_data

In [4]:
# Split the loaded ratings into a train and a test matrix; the helper prints the
# nonzero counts of each part (see the cell output).
# Presumably min_num_ratings=0 disables the minimum-ratings filter and p_test=0.2 is
# the held-out fraction — TODO confirm against split_data.
valid_ratings, train, test = split_data(data, min_num_ratings=0, p_test=0.2)

Total number of nonzero elements in origial data:1176952
Total number of nonzero elements in train data:963903
Total number of nonzero elements in test data:213049


## 3. Implementing Baseline

In [28]:
from ImplementingBaseline import*

In [29]:
# Baseline 1: global mean. The helper prints the resulting test RMSE itself
# (see the cell output); its return value is not kept.
baseline_global_mean(train, test)

test of baseline using the global mean: RMSE = [[1.11845219]].


In [30]:
# Baseline 2: per-user mean. The helper prints the test RMSE (see the cell output).
# The returned value is kept in `user_train_mean`; it is not used by any other cell
# visible in this notebook.
user_train_mean = baseline_user_mean(train, test)

test RMSE of the baseline using the user mean: [[1.03154921]].


In [32]:
# Baseline 3: per-item mean. The helper prints the test RMSE itself
# (see the cell output); its return value is not kept.
baseline_item_mean(train, test)

test RMSE of the baseline using the item mean: [[1.09518532]].


## 4. Matrix Factorization using SGD

In [33]:
from factorization_SGD import*

In [34]:
# Hyperparameters for the SGD factorization.
num_features = 20   # K in the lecture notes
# Presumably the regularization weights for the user and item factors — TODO confirm
# against matrix_factorization_SGD.
lambda_user = 0.07
lambda_item = 0.07

# Factorize the training matrix. The two returned factor matrices are reused by the
# evaluation and submission cells below.
user_features, item_features = matrix_factorization_SGD(train, num_features, lambda_user, lambda_item) 

In [35]:
# Coordinates of the nonzero entries of the held-out test matrix.
nz_row, nz_col = test.nonzero()
# Materialize the (row, col) pairs as a list so they can be passed to compute_error.
nz_test = list(zip(nz_row, nz_col))
# Error of the SGD factors on the test entries (presumably an RMSE over the listed
# coordinates — confirm against compute_error).
rmse = compute_error(test, user_features, item_features, nz_test)
print("RMSE on test data: {}.".format(rmse))

RMSE on test data: 1.004158146666524.


In [36]:
# Path of the sample-submission file; it lists the entries that must be predicted.
DATA_FINAL_PATH = "data/sampleSubmission.csv"
# Load it with the same loader as the training data (prints the user/item counts).
samples = load_data(DATA_FINAL_PATH)
# Only the third return value is kept; with min_num_ratings=0 this presumably applies
# no filtering — TODO confirm against condition_min_num_ratings.
_, _, samples = condition_min_num_ratings(samples, min_num_ratings=0)

number of users : 10000, number of items: 1000


In [10]:
# Coordinates of the entries listed in the sample-submission matrix.
nz_row, nz_col = samples.nonzero()
nz_samples = list(zip(nz_row, nz_col))
# Error of the SGD factors against the values stored in the sample-submission file.
# NOTE(review): those values are presumably placeholders rather than true ratings, so
# this number is not a real quality measure — confirm.
rmse = compute_error(samples, user_features, item_features, nz_samples)
# NOTE(review): the recorded output below still carries the label "RMSE on test data"
# and this cell's execution count is out of order; re-run the notebook to refresh it.
print("RMSE on final data: {}.".format(rmse))

RMSE on test data: 0.9373349757846028.


In [51]:
# Dense matrix of predicted ratings obtained from the SGD factors.
prediction = user_features.T.dot(item_features)
# Re-read the sample-submission file as raw text, dropping the header line, and parse
# every remaining line with deal_line. `data` is consumed by the create_csv cell below.
data = read_txt(DATA_FINAL_PATH)[1:]
data = [deal_line(line) for line in data]
n = len(data)
# The parsed ids are 1-based: element [1] selects the first axis of `prediction` and
# element [0] the second, exactly as in the original per-row loop.
first_axis = np.asarray([entry[1] for entry in data], dtype=np.intp) - 1
second_axis = np.asarray([entry[0] for entry in data], dtype=np.intp) - 1
# Vectorized lookup instead of a Python loop over every row: gather all predictions at
# once, round to the nearest integer (np.rint rounds halves to even, like Python 3's
# round) and clamp to the valid rating range [1, 5]. Result is an (n, 1) float column.
submission = np.clip(np.rint(prediction[first_axis, second_axis]), 1, 5).astype(np.float64).reshape(n, 1)

In [52]:
from helpers import*
# Output path for the SGD-based submission file.
DATA_SUBMISSION = "data/submission_SGD_base.csv"
# Write the parsed submission rows together with the rounded, clipped predictions.
create_csv(data, submission, DATA_SUBMISSION)

Baseline dot-product method, SGD, without `min_num_ratings` filtering — AICrowd score: 1.046, accuracy = 0.124

## 5. Matrix Factorization using ALS

In [5]:
from helpers import*
from factorization_ALS import*

# Hyperparameters for the ALS factorization.
num_features = 20   # K in the lecture notes
# Presumably the regularization weights for the user and item factors — TODO confirm
# against matrix_factorization_ALS.
lambda_user = 0.081
lambda_item = 0.081
    
# Factorize the training matrix; the helper prints the training RMSE as it iterates
# (see the cell output). This rebinds user_features / item_features, replacing the
# SGD factors computed earlier in the notebook.
user_features, item_features = matrix_factorization_ALS(train, num_features, lambda_user, lambda_item)

RMSE on training set: 1.0461942444412098.
RMSE on training set: 1.0052915622886536.
RMSE on training set: 0.9604959334444244.
RMSE on training set: 0.9322063675112147.
RMSE on training set: 0.9176293680582193.
RMSE on training set: 0.9089827855485251.
RMSE on training set: 0.9031774254316103.
RMSE on training set: 0.8989765751272415.
RMSE on training set: 0.8957947034586203.
RMSE on training set: 0.8933086046771442.
RMSE on training set: 0.8913200200734677.
RMSE on training set: 0.88969905180654.
RMSE on training set: 0.8883569517235577.
RMSE on training set: 0.8872312181228207.
RMSE on training set: 0.8862766964665765.
RMSE on training set: 0.8854599947732238.
RMSE on training set: 0.8847558745725553.
RMSE on training set: 0.8841448704917563.
RMSE on training set: 0.8836116930761472.
RMSE on training set: 0.8831441388408044.
RMSE on training set: 0.8827323326975046.
RMSE on training set: 0.8823681907370562.
RMSE on training set: 0.8820450311913263.
RMSE on training set: 0.881757286761

In [6]:
# Coordinates of the nonzero entries of the held-out test matrix.
nz_row, nz_col = test.nonzero()
# Materialize the (row, col) pairs as a list so they can be passed to compute_error.
nz_test = list(zip(nz_row, nz_col))
# Error of the ALS factors on the test entries (presumably an RMSE over the listed
# coordinates — confirm against compute_error).
rmse = compute_error(test, user_features, item_features, nz_test)
print("RMSE on test data: {}.".format(rmse))

RMSE on test data: 0.9911709029489721.


In [56]:
# Path of the sample-submission file (same value as in the SGD section; redefined here
# so the ALS section can be run on its own).
DATA_FINAL_PATH = "data/sampleSubmission.csv"
# Load it with the same loader as the training data (prints the user/item counts).
samples = load_data(DATA_FINAL_PATH)
# Only the third return value is kept; with min_num_ratings=0 this presumably applies
# no filtering — TODO confirm against condition_min_num_ratings.
_, _, samples = condition_min_num_ratings(samples, min_num_ratings=0)

number of users : 10000, number of items: 1000


In [57]:
# Coordinates of the entries listed in the sample-submission matrix.
nz_row, nz_col = samples.nonzero()
nz_samples = list(zip(nz_row, nz_col))
# Error of the ALS factors against the values stored in the sample-submission file.
# NOTE(review): those values are presumably placeholders rather than true ratings, so
# this number is not a real quality measure — confirm.
rmse = compute_error(samples, user_features, item_features, nz_samples)
# Label corrected: this value is computed on `samples`, not on the held-out test split,
# so it must not be reported as "test data" (matches the equivalent SGD cell).
print("RMSE on final data: {}.".format(rmse))

RMSE on test data: 0.7634478766929802.


In [58]:
# Dense matrix of predicted ratings obtained from the ALS factors.
prediction = user_features.T.dot(item_features)
# Re-read the sample-submission file as raw text, dropping the header line, and parse
# every remaining line with deal_line. `data` is consumed by the create_csv cell below.
data = read_txt(DATA_FINAL_PATH)[1:]
data = [deal_line(line) for line in data]
n = len(data)
# The parsed ids are 1-based: element [1] selects the first axis of `prediction` and
# element [0] the second, exactly as in the original per-row loop.
first_axis = np.asarray([entry[1] for entry in data], dtype=np.intp) - 1
second_axis = np.asarray([entry[0] for entry in data], dtype=np.intp) - 1
# Vectorized lookup instead of a Python loop over every row: gather all predictions at
# once, round to the nearest integer (np.rint rounds halves to even, like Python 3's
# round) and clamp to the valid rating range [1, 5]. Result is an (n, 1) float column.
submission = np.clip(np.rint(prediction[first_axis, second_axis]), 1, 5).astype(np.float64).reshape(n, 1)

In [59]:
from helpers import*
# Output path for the ALS-based submission file.
DATA_SUBMISSION = "data/submission_ALS_base.csv"
# Write the parsed submission rows together with the rounded, clipped predictions.
create_csv(data, submission, DATA_SUBMISSION)

Baseline dot-product method, ALS, without `min_num_ratings` filtering — AICrowd score: 1.072, accuracy = 0.081