In [None]:
# Useful starting lines
%matplotlib inline

import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt

from helpers import *
from implementations import *
from plots import *

%load_ext autoreload
%autoreload 2

## Load the Data
Note that `ratings` is a sparse matrix of shape (num_items, num_users).

In [None]:
# Load the ratings from the training CSV through the project helper `load_data`.
path_dataset = "data_train.csv"
ratings = load_data(path_dataset)
# Last expression of the cell: displays the dimensions of the loaded matrix
# (presumably (num_items, num_users) — TODO confirm against `load_data`).
ratings.shape

## Exploratory Data Analysis

### Calculate mean and std per user

In [None]:
# Per-user statistics computed by the project helper; going by the names these
# are presumably the mean and standard deviation of each user's ratings — TODO confirm.
user_means, user_stds = calculate_statistics_per_user(ratings)

In [None]:
# Visualise the per-user statistics computed in the previous cell.
plot_mean_and_std_per_user(user_means, user_stds)
# `nnz` reports how many entries are stored in the sparse ratings matrix,
# i.e. how many ratings the dataset contains.
print("Total number of nonzero elements in original data:{v}".format(v=ratings.nnz))

### Plot the number of ratings per movie and user

In [None]:
# Plot the raw data and keep the two count vectors the helper returns; they
# are reused by the train/test split cell further down.
num_items_per_user, num_users_per_item = plot_raw_data(ratings)

# Report the smallest count on each axis.
print("min # of items per user = {}, min # of users per item = {}.".format(
        min(num_items_per_user), min(num_users_per_item)))

## Splitting the data into a train and test set

In [None]:
# Split the ratings with the project helper `split_data`.
# NOTE(review): p_test=0 presumably leaves `test` empty, in which case the
# baseline cells below have nothing to evaluate on — confirm this is intended.
valid_ratings, train, test = split_data(
    ratings, num_items_per_user, num_users_per_item, min_num_ratings=10, p_test=0)
# Optional visualisation of the split, currently disabled.
#plot_train_test_data(train, test)

## Implementing Baselines 

### Use the global mean to do the prediction

In [None]:
# Global-mean baseline on the train/test split (project helper; what it
# prints or returns is not visible from this notebook).
baseline_global_mean(train, test)

### Use the user means as the prediction

In [None]:
# User-mean baseline on the train/test split (project helper; what it
# prints or returns is not visible from this notebook).
baseline_user_mean(train, test)

### Use the item means as the prediction

In [None]:
# Item-mean baseline on the train/test split (project helper; what it
# prints or returns is not visible from this notebook).
baseline_item_mean(train, test)

## Learning the Matrix Factorization using SGD

In [None]:
# Hyperparameters
gamma = 0.01
num_features = 20   # K in the lecture notes
lambda_user = 0.1
lambda_item = 0.7
num_epochs = 2     # number of full passes through the train set

# `submission` is otherwise only loaded in the "Submission" section further
# down, so on a fresh kernel (Restart & Run All) this cell raised NameError.
# Load it here so the cell is self-contained; reloading later is harmless.
path_submission = "sample_submission.csv"
submission = load_data(path_submission)

# Earlier experiment with an explicit train/test evaluation, kept for reference.
#item_features, user_features = matrix_factorization_SGD(train, test, gamma, num_features, lambda_user, lambda_item, num_epochs)
# Run SGD matrix factorization on the training split for the submission entries.
implementation_SGD(train, gamma, num_features, lambda_user, lambda_item, num_epochs, submission)



## Learning the Matrix Factorization using Alternating Least Squares

In [None]:
# Hyperparameters
num_features = 20   # K in the lecture notes
lambda_user = 0.1
lambda_item = 0.7
# Convergence threshold for ALS. The previous value (1e+4) had the exponent
# sign flipped; 1e-4 matches the threshold used by the ALS submission cell.
stop_criterion = 1e-4

# Hyperparameter tuning run, currently disabled.
#tune_matrix_factorization_ALS(train, test, num_features, lambda_user, lambda_item, stop_criterion)

### Submission

In [None]:
# Reload the full ratings file (same path as the first load cell) so the
# submission models below are fitted on `ratings` rather than on `train`.
path_dataset = "data_train.csv"
ratings = load_data(path_dataset)

# The sample submission is read with the same loader as the ratings;
# presumably it marks the entries that must be predicted — TODO confirm.
path_submission = "sample_submission.csv"
submission = load_data(path_submission)

In [None]:
# Hyperparameters (same values as the SGD cell in the training section)
gamma = 0.01
num_features = 20   # K in the lecture notes
lambda_user = 0.1
lambda_item = 0.7
num_epochs = 2     # number of full passes through the train set

# Fit SGD matrix factorization on all ratings and keep whatever the helper
# returns (presumably the predictions for the submission entries — TODO confirm).
pred_sgd = implementation_SGD(ratings, gamma, num_features, lambda_user, lambda_item, num_epochs, submission)

In [None]:
# Hyperparameters
num_features = 20   # K in the lecture notes
lambda_user = 0.1
lambda_item = 0.7
stop_criterion = 1e-4   # presumably the convergence threshold for ALS — TODO confirm

# Fit ALS matrix factorization on all ratings and keep whatever the helper
# returns (presumably the predictions for the submission entries — TODO confirm).
pred_als = implementation_ALS(ratings, num_features, lambda_user, lambda_item, stop_criterion, submission)

In [None]:
# Global-mean baseline for the submission entries (project helper; its
# output format is not visible from this notebook).
implementation_global_mean(ratings, submission)

In [None]:
# User-mean baseline for the submission entries (project helper; its
# output format is not visible from this notebook).
implementation_user_mean(ratings, submission)

In [None]:
# Item-mean baseline for the submission entries (project helper; its
# output format is not visible from this notebook).
implementation_item_mean(ratings, submission)