# Recommender system

## 1. Prerequisites

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow
import tensorflow as tf
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import pandas as pd
from itertools import groupby


%load_ext autoreload
%autoreload 2

In [2]:
from helpersbaseline import*
from preprocessing import*

In [3]:
# Path of the training ratings CSV; later cells re-read the same file through this constant.
DATA_TRAIN_PATH = "data/data_train.csv"
# load_data is a project helper (presumably from one of the star-imported modules);
# the recorded output shows it reports the number of users and items it found.
data = load_data(DATA_TRAIN_PATH)

number of users : 10000, number of items: 1000


## 2. Split the data

For reproducibility, remove seed from `split_data`

In [19]:
# Split the loaded ratings into a train part and a test part, with p_test=0.2.
# min_num_ratings=0 presumably disables any filtering of sparse users/items — confirm in split_data.
# NOTE(review): no seed is passed here, so whether this split is reproducible
# depends on how split_data seeds its random generator.
valid_ratings, train, test = split_data(data, min_num_ratings=0, p_test=0.2)

Total number of nonzero elements in origial data:1176952
Total number of nonzero elements in train data:963903
Total number of nonzero elements in test data:213049


## 3. Implementing Baseline

In [28]:
from ImplementingBaseline import*

In [29]:
# Baseline 1: the helper prints the test RMSE of the global-mean baseline (see recorded output).
baseline_global_mean(train, test)

test of baseline using the global mean: RMSE = [[1.11845219]].


In [30]:
# Baseline 2: the helper prints the test RMSE of the user-mean baseline (see recorded output).
# Its return value is kept in user_train_mean; no later cell visible here reads it.
user_train_mean = baseline_user_mean(train, test)

test RMSE of the baseline using the user mean: [[1.03154921]].


In [32]:
# Baseline 3: the helper prints the test RMSE of the item-mean baseline (see recorded output).
baseline_item_mean(train, test)

test RMSE of the baseline using the item mean: [[1.09518532]].


## 4. Matrix Factorization using SGD

In [33]:
from factorization_SGD import*

In [34]:
# Hyper-parameters handed to the SGD factorization.
num_features = 20   # K in the lecture notes
lambda_user = lambda_item = 0.07  # same value for both; presumably the regularization weights

# Fit the two factor matrices on the training split only.
user_features, item_features = matrix_factorization_SGD(
    train, num_features, lambda_user, lambda_item
)

In [35]:
# Score the SGD factors on the held-out split.
# nonzero() yields parallel row/column index arrays of the stored test ratings.
nz_row, nz_col = test.nonzero()
nz_test = list(zip(nz_row, nz_col))  # (row, col) pairs handed to compute_error
rmse = compute_error(test, user_features, item_features, nz_test)
print("RMSE on test data: {}.".format(rmse))

RMSE on test data: 1.004158146666524.


In [36]:
# Load the sample-submission file with the same loader used for the training ratings.
DATA_FINAL_PATH = "data/sampleSubmission.csv"
samples = load_data(DATA_FINAL_PATH)
# Only the third return value is kept; with min_num_ratings=0 this presumably
# leaves the matrix unfiltered — TODO confirm in condition_min_num_ratings.
_, _, samples = condition_min_num_ratings(samples, min_num_ratings=0)

number of users : 10000, number of items: 1000


In [10]:
# Same evaluation as on the test split, but against the entries of the sample-submission matrix.
# NOTE(review): the sample-submission rows displayed near the end of this notebook all
# carry rating 3, so this RMSE is likely measured against placeholder values rather than
# true ratings — confirm before interpreting the number.
nz_row, nz_col = samples.nonzero()
nz_samples = list(zip(nz_row, nz_col))  # (row, col) pairs handed to compute_error
rmse = compute_error(samples, user_features, item_features, nz_samples)
print("RMSE on final data: {}.".format(rmse))

RMSE on test data: 0.9373349757846028.


In [51]:
# Dense matrix of predicted ratings from the learned SGD factors.
prediction = user_features.T.dot(item_features)

# Parse the submission template; the first line is skipped (presumably the CSV header).
# NOTE(review): this rebinds `data`, which earlier held the result of load_data.
data = read_txt(DATA_FINAL_PATH)[1:]
data = [deal_line(line) for line in data]
n = len(data)

# Look up every requested entry in one vectorised indexing operation instead of
# a Python loop over all rows. The parsed ids are 1-based, and `prediction` is
# indexed as [second id - 1, first id - 1], exactly as the original loop did.
ids = np.array([(line[0], line[1]) for line in data], dtype=np.int64).reshape(-1, 2)
submission = np.zeros((n, 1))
# Round to the nearest integer rating (ties to even, like round()), then clamp to [1, 5].
submission[:, 0] = np.clip(np.rint(prediction[ids[:, 1] - 1, ids[:, 0] - 1]), 1, 5)

In [52]:
# NOTE(review): star import placed mid-notebook; create_csv presumably comes from helpers.
from helpers import*
DATA_SUBMISSION = "data/submission_SGD_base.csv"
# Write the parsed submission entries in `data` together with the values in `submission`.
create_csv(data, submission, DATA_SUBMISSION)

Baseline dot product method SGD without min_num_ratings - AICrowd : 1.046 and accuracy = 0.124

## 5. Matrix Factorization using ALS

In [4]:
from factorization_ALS import*

# Hyper-parameters handed to the ALS factorization.
num_features = 20   # K in the lecture notes
lambda_user = lambda_item = 0.081  # same value for both; presumably the regularization weights

# Fit the two factor matrices on the training split only; the recorded output
# shows the helper printing a running training RMSE.
user_features, item_features = matrix_factorization_ALS(
    train, num_features, lambda_user, lambda_item
)

RMSE on training set: 1.0496065966044934.
RMSE on training set: 1.0166077240441311.
RMSE on training set: 0.9801298876710286.
RMSE on training set: 0.9533215482403568.
RMSE on training set: 0.9378085290373935.
RMSE on training set: 0.9287510736385048.
RMSE on training set: 0.9229384705878788.
RMSE on training set: 0.918867739291494.
RMSE on training set: 0.9158420827722703.
RMSE on training set: 0.9135031408290509.
RMSE on training set: 0.9116458097690966.
RMSE on training set: 0.9101420536969367.
RMSE on training set: 0.9089063115392461.
RMSE on training set: 0.907878354685018.
RMSE on training set: 0.9070141549588037.
RMSE on training set: 0.9062806725576844.
RMSE on training set: 0.9056526661896035.
RMSE on training set: 0.9051106081528817.
RMSE on training set: 0.9046392458411912.
RMSE on training set: 0.9042265697055412.
RMSE on training set: 0.9038630534429476.
RMSE on training set: 0.9035410852609801.
RMSE on training set: 0.9032545375269232.
RMSE on training set: 0.902998438670

In [7]:
# Score the ALS factors on the held-out split.
# nonzero() yields parallel row/column index arrays of the stored test ratings.
nz_row, nz_col = test.nonzero()
nz_test = list(zip(nz_row, nz_col))  # (row, col) pairs handed to compute_error
rmse = compute_error(test, user_features, item_features, nz_test)
print("RMSE on test data: {}.".format(rmse))

RMSE on test data: 0.9911709029489721.


In [31]:
from sklearn import model_selection

# Re-read the raw training lines (first line skipped, presumably the CSV header)
# and keep the 20% hold-out part of a fixed-seed sklearn split.
# NOTE(review): the following cell repeats exactly these two statements, so this
# cell is redundant; it also rebinds `data`, which earlier held the result of load_data.
data = read_txt(DATA_TRAIN_PATH)[1:]
_, data_test = model_selection.train_test_split(data, test_size=0.2, random_state=1)

In [8]:
from sklearn import model_selection

# Dense matrix of predicted ratings from the learned ALS factors.
prediction = user_features.T.dot(item_features)

# Re-read the raw training lines (first line skipped, presumably the CSV header)
# and keep the 20% hold-out part of a fixed-seed sklearn split.
# NOTE(review): this split is made independently of the train/test matrices
# returned by split_data (the recorded test size there, 213049, is not 20% of
# 1176952), so part of these rows was probably seen during training — confirm
# before using the resulting predictions to fit a blender.
data = read_txt(DATA_TRAIN_PATH)[1:]
_, data_test = model_selection.train_test_split(data, test_size=0.2, random_state=1)
data_test = [deal_line(line) for line in data_test]
n = len(data_test)

# Look up every hold-out entry in one vectorised indexing operation instead of
# a Python loop over all rows. The parsed ids are 1-based, and `prediction` is
# indexed as [second id - 1, first id - 1], exactly as the original loop did.
ids = np.array([(line[0], line[1]) for line in data_test], dtype=np.int64).reshape(-1, 2)
prediction_test = np.zeros((n, 1))
# Keep the real-valued prediction (no rounding) but clamp it to the rating range [1, 5].
prediction_test[:, 0] = np.clip(prediction[ids[:, 1] - 1, ids[:, 0] - 1], 1, 5)

In [None]:
# Disabled CSV export of the hold-out predictions.
# NOTE(review): reading top to bottom, `submission_test` is first assigned in the
# later "Create files for blending" cell (the array built above is `prediction_test`),
# so these lines would fail if re-enabled here unchanged.
# DATA_SUBMISSION = "data/MF_ALS_test.csv"
# create_csv(data, submission_test, DATA_SUBMISSION)

### Create final submission

In [5]:
# Load the sample-submission file with the same loader used for the training ratings.
DATA_FINAL_PATH = "data/sampleSubmission.csv"
samples = load_data(DATA_FINAL_PATH)
# Only the third return value is kept; with min_num_ratings=0 this presumably
# leaves the matrix unfiltered — TODO confirm in condition_min_num_ratings.
_, _, samples = condition_min_num_ratings(samples, min_num_ratings=0)

number of users : 10000, number of items: 1000


In [6]:
# Same evaluation as on the test split, but against the entries of the sample-submission matrix.
# NOTE(review): the sample-submission rows displayed near the end of this notebook all
# carry rating 3, so this RMSE is likely measured against placeholder values rather than
# true ratings — confirm before interpreting the number.
nz_row, nz_col = samples.nonzero()
nz_samples = list(zip(nz_row, nz_col))  # (row, col) pairs handed to compute_error
rmse = compute_error(samples, user_features, item_features, nz_samples)
print("RMSE on final data: {}.".format(rmse))

RMSE on final data: 0.9402699558619569.


In [9]:
# Parse the submission template; the first line is skipped (presumably the CSV header).
# NOTE(review): this rebinds `data`; `prediction` must already exist from the
# cell that multiplies the ALS factor matrices.
data = read_txt(DATA_FINAL_PATH)[1:]
data = [deal_line(line) for line in data]
n = len(data)

# Look up every requested entry in one vectorised indexing operation instead of
# a Python loop over all rows. The parsed ids are 1-based, and `prediction` is
# indexed as [second id - 1, first id - 1], exactly as the original loop did.
ids = np.array([(line[0], line[1]) for line in data], dtype=np.int64).reshape(-1, 2)
prediction_submission = np.zeros((n, 1))
# Keep the real-valued prediction (no rounding) but clamp it to the rating range [1, 5].
prediction_submission[:, 0] = np.clip(prediction[ids[:, 1] - 1, ids[:, 0] - 1], 1, 5)

In [59]:
# Disabled CSV export for the ALS model.
# NOTE(review): `submission` is only assigned in the SGD section; the ALS predictions
# built in the previous cell live in `prediction_submission`, so re-enabling these lines
# unchanged would write the SGD result under the ALS file name.
# from helpers import*
# DATA_SUBMISSION = "data/MF_ALS_submission.csv"
# create_csv(data, submission, DATA_SUBMISSION)

Baseline dot product method ALS without min_num_ratings - AICrowd : 1.072 and accuracy = 0.081

### Create files for blending

Test file

In [41]:
from sklearn import model_selection

# Load the training ratings as a DataFrame and redo the fixed-seed 20% split so the
# rows line up with `prediction_test`, which was built from the same split parameters.
data = load_data_df(DATA_TRAIN_PATH)
_, submission_test = model_selection.train_test_split(data, test_size=0.2, random_state=1)

# train_test_split hands back a slice of `data`; adding a column to that slice
# raises pandas' SettingWithCopyWarning. Take an explicit copy so the new column
# is written to an independent frame.
submission_test = submission_test.copy()
submission_test['MF_ALS_rating'] = prediction_test
submission_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,user_id,movie_id,rating,MF_ALS_rating
1014743,1219,803,5,3.901988
791571,1865,643,1,4.092153
967641,1382,771,3,3.262341
65567,3295,53,3,3.509550
622228,9259,530,4,4.212068
...,...,...,...,...
468443,2110,403,5,3.777028
1064490,9211,855,2,2.542277
1172922,7228,996,3,3.033381
327551,9827,272,5,4.098855


In [42]:
# Persist the hold-out frame, including the MF_ALS_rating column, for the blending step.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
DATA_BLENDING_TEST = "data/MF_ALS_test.csv"
submission_test.to_csv(DATA_BLENDING_TEST)

Submission file

In [10]:
# Load the sample submission as a DataFrame and attach the ALS predictions as a new column.
# NOTE(review): this assumes load_data_df returns rows in the same order in which
# read_txt produced the lines used to build prediction_submission — confirm.
DATA_FINAL_PATH = "data/sampleSubmission.csv"
submission_samples = load_data_df(DATA_FINAL_PATH)
submission_samples['MF_ALS_rating'] = prediction_submission
# Bare last expression: display the resulting frame.
submission_samples

Unnamed: 0,user_id,movie_id,rating,MF_ALS_rating
0,37,1,3,3.295094
1,73,1,3,3.006998
2,156,1,3,3.744199
3,160,1,3,3.298078
4,248,1,3,3.257229
...,...,...,...,...
1176947,9974,1000,3,3.409465
1176948,9977,1000,3,3.631472
1176949,9978,1000,3,2.841713
1176950,9982,1000,3,3.075880


In [11]:
# Persist the submission frame, including the MF_ALS_rating column, for the blending step.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
DATA_BLENDING_SUBMISSION = "data/MF_ALS_submission.csv"
submission_samples.to_csv(DATA_BLENDING_SUBMISSION)