In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
from implementations import *
from losses import *
from EDA import *
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
# Relative path to the training CSV
DATA_TRAIN_PATH = "../data/train.csv"
# load_csv_data returns three values; going by the variable names they are presumably the labels,
# the feature matrix and the event ids — TODO confirm against proj1_helpers
data_y, data_set, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
# Fixing the seed
seed = 8
# Also seed NumPy's global generator: later cells draw their initial weights with
# np.random.rand(...), which would otherwise differ on every kernel restart.
np.random.seed(seed)

### Split and classify data

In [None]:
# Classification of the output: y_classification takes the labels and the features and
# returns four label sets, y_0..y_3 (presumably one per class built by EDA_class — TODO confirm)
y_0, y_1, y_2, y_3 = y_classification(data_y, data_set)

In [None]:
# EDA for each class: EDA_class returns four feature matrices, class_0..class_3,
# built from the training features loaded above
class_0, class_1, class_2, class_3 = EDA_class(data_set)

In [None]:
# Value of "spliter": the part of each class dedicated to training
spliter = 0.8

# Build a train/test split for every class, always with the same ratio and seed.
# Each call yields (train features, train labels, test features, test labels).
(
    (train_0, y_tr_0, test_0, y_te_0),
    (train_1, y_tr_1, test_1, y_te_1),
    (train_2, y_tr_2, test_2, y_te_2),
    (train_3, y_tr_3, test_3, y_te_3),
) = (
    train_test_separator(labels, features, spliter, seed)
    for labels, features in ((y_0, class_0), (y_1, class_1), (y_2, class_2), (y_3, class_3))
)

In [None]:
# Show the shape of each class's training matrix, one per line
print(*(part.shape for part in (train_0, train_1, train_2, train_3)), sep="\n")

# See some histograms

***

#### Only on one class (class 3 is selected in the cell below)

In [None]:
# Select which class's features (classs) and labels (yy) the histogram cells below operate on
classs = class_3
yy = y_3

In [None]:
# Indices (along the first axis) of the samples labelled -1 and +1, in that order
ind_y_neg, ind_y_pos = (np.array(np.where(yy == label)[0]) for label in (-1, 1))

In [None]:
# One figure per feature: overlay the normalised distributions of the two label groups.
# At most 16 features are shown; the bound is capped by the actual column count so a class
# with fewer columns does not raise an IndexError.
for col in range(min(16, classs.shape[1])):
    plt.figure(figsize=(8, 5))
    plt.hist(classs[ind_y_neg, col], bins=100, histtype='step', color='red', density=True, label='y = -1')
    plt.hist(classs[ind_y_pos, col], bins=100, histtype='step', density=True, label='y = 1')
    # Title carries the actual feature index (it used to read "feature 0" on every figure)
    plt.title('Histogram of feature {}'.format(col))
    plt.legend()
    plt.show()

# Data set 1 - Exploration

***

#### Least squares method

In [None]:
# Loss object handed to the least-squares, gradient-descent and ridge fits below
MSE_loss = MSE()

In [None]:
# Computation of the MSE and the optimal weights on the class-1 training split
mse_1, w_1 = least_squares(y_tr_1, train_1, MSE_loss)

In [None]:
# Predicted labels from the fitted weights: train split first, then test split
train_pred_1, test_pred_1 = (predict_labels(w_1, features) for features in (train_1, test_1))

In [None]:
# Compute the final results: predictions vs. true labels, train split first, then test split.
# The second call is the cell's last expression, so its return value (if any) is what the notebook displays.
counting_errors(train_pred_1, y_tr_1)
counting_errors(test_pred_1, y_te_1)

*** 

#### Gradient Descent method with MSE loss

In [None]:
# Fit by gradient descent from an all-zero weight vector (one entry per feature column);
# the call hands back the loss and the weights, in that order
initial_w = np.zeros(train_1.shape[1])
mse_1, w_1 = least_squares_GD(
    y_tr_1,
    train_1,
    MSE_loss,
    initial_w=initial_w,
    gamma=1e-1,
    max_iters=1000,
)

In [None]:
# Predicted labels from the fitted weights: train split first, then test split
train_pred_1, test_pred_1 = (predict_labels(w_1, features) for features in (train_1, test_1))

In [None]:
# Compute the final results: predictions vs. true labels, train split first, then test split.
# The second call is the cell's last expression, so its return value (if any) is what the notebook displays.
counting_errors(train_pred_1, y_tr_1)
counting_errors(test_pred_1, y_te_1)

***

#### Stochastic Gradient descent with MSE loss

In [None]:
# Fit by stochastic gradient descent from uniformly random initial weights in [0, 1)
# (one entry per feature column); the call hands back the loss and the weights, in that order
initial_w = np.random.rand(train_1.shape[1])
mse_1, w_1 = least_squares_SGD(
    y_tr_1,
    train_1,
    MSE_loss,
    initial_w=initial_w,
    gamma=1e-2,
    max_iters=1000,
)

In [None]:
# Predicted labels from the fitted weights: train split first, then test split
train_pred_1, test_pred_1 = (predict_labels(w_1, features) for features in (train_1, test_1))

In [None]:
# Compute the final results: predictions vs. true labels, train split first, then test split.
# The second call is the cell's last expression, so its return value (if any) is what the notebook displays.
counting_errors(train_pred_1, y_tr_1)
counting_errors(test_pred_1, y_te_1)

***

#### Ridge regression method

In [None]:
# Computation of the MSE and the optimal weights, with the ridge parameter lambda_ set to 1e-9
mse_1, w_1 = ridge_regression(y_tr_1, train_1, MSE_loss, lambda_=1e-9)

In [None]:
# Predicted labels from the fitted weights: train split first, then test split
train_pred_1, test_pred_1 = (predict_labels(w_1, features) for features in (train_1, test_1))

In [None]:
# Compute the final results: predictions vs. true labels, train split first, then test split.
# The second call is the cell's last expression, so its return value (if any) is what the notebook displays.
counting_errors(train_pred_1, y_tr_1)
counting_errors(test_pred_1, y_te_1)

***

#### Logistic regression using Gradient descent method

In [None]:
# Loss object handed to the logistic_regression fits below
neg_log = Neg_log()

In [None]:
# Fit logistic regression with the neg_log loss (not MSE), starting from an all-zero
# weight vector; the call hands back the loss and the weights, in that order
initial_w = np.zeros(train_1.shape[1])
loss, w_1 = logistic_regression(
    y_tr_1,
    train_1,
    neg_log,
    initial_w=initial_w,
    gamma=1,
    max_iters=1000,
)

In [None]:
# Predicted labels from the fitted weights: train split first, then test split
train_pred_1, test_pred_1 = (predict_labels(w_1, features) for features in (train_1, test_1))

In [None]:
# Compute the final results: predictions vs. true labels, train split first, then test split.
# The second call is the cell's last expression, so its return value (if any) is what the notebook displays.
counting_errors(train_pred_1, y_tr_1)
counting_errors(test_pred_1, y_te_1)

***

#### Regularized logistic regression

In [None]:
# Fit with the neg_log loss from an all-zero weight vector; the call hands back the loss
# and the weights, in that order (the loss is stored under the name mse_1 although it is not an MSE).
# NOTE(review): this section is titled "Regularized logistic regression", yet the call below is the
# plain logistic_regression with no penalty argument — it only differs from the previous fit by
# gamma. Confirm whether a regularized variant from implementations was meant here.
initial_w = np.zeros(train_1.shape[1])
mse_1, w_1 = logistic_regression(
    y_tr_1,
    train_1,
    neg_log,
    initial_w=initial_w,
    gamma=0.01,
    max_iters=1000,
)

In [None]:
# Predicted labels from the fitted weights: train split first, then test split
train_pred_1, test_pred_1 = (predict_labels(w_1, features) for features in (train_1, test_1))

In [None]:
# Compute the final results: predictions vs. true labels, train split first, then test split.
# The second call is the cell's last expression, so its return value (if any) is what the notebook displays.
counting_errors(train_pred_1, y_tr_1)
counting_errors(test_pred_1, y_te_1)

In [None]:
# Scratch check of NumPy broadcasting: a length-3 vector multiplied element-wise
# into each row of a 2x3 matrix, followed by the shapes of x and sign(x)
x = np.array([1, -2, 3])
A = np.array([[3, 2, 4], [2, 10, 7]])
print(np.multiply(x, A), x.shape, np.sign(x).shape, sep="\n")

***
***

# Submit

***
#### Data sets 0 to 3

In [None]:
# Training routine shared by the four per-class fits of the submission
# NOTE(review): ridge_regression is called elsewhere in this notebook with lambda_=..., while the
# fits through `meth` pass initial_w / gamma / max_iters and no lambda_. Confirm that
# ridge_regression accepts these keywords, or that a gradient-based method was meant.
meth = ridge_regression

# Class 0: fit on all of its samples from random initial weights; keep the returned loss and weights
initial_w = np.random.rand(class_0.shape[1])
mse_0, w_0 = meth(
    y_0,
    class_0,
    MSE_loss,
    initial_w=initial_w,
    gamma=1e-2,
    max_iters=1000,
)

In [None]:
# Class 1: fit on all of its samples from random initial weights; keep the returned loss and weights
initial_w = np.random.rand(class_1.shape[1])
mse_1, w_1 = meth(
    y_1,
    class_1,
    MSE_loss,
    initial_w=initial_w,
    gamma=1e-2,
    max_iters=1000,
)

In [None]:
# Class 2: fit on all of its samples from random initial weights; keep the returned loss and weights
initial_w = np.random.rand(class_2.shape[1])
mse_2, w_2 = meth(
    y_2,
    class_2,
    MSE_loss,
    initial_w=initial_w,
    gamma=1e-2,
    max_iters=1000,
)

In [None]:
# Class 3: fit on all of its samples from random initial weights; keep the returned loss and weights
initial_w = np.random.rand(class_3.shape[1])
mse_3, w_3 = meth(
    y_3,
    class_3,
    MSE_loss,
    initial_w=initial_w,
    gamma=1e-2,
    max_iters=1000,
)

In [None]:
# Predict on each class's feature matrix with that class's own weights
y_pred_0, y_pred_1, y_pred_2, y_pred_3 = (
    predict_labels(weights, features)
    for weights, features in ((w_0, class_0), (w_1, class_1), (w_2, class_2), (w_3, class_3))
)

In [None]:
# Merge the four per-class predictions into a single vector (rebuild_y also receives the
# full training features — presumably to restore the original sample order, TODO confirm)
y_pred_train = rebuild_y(y_pred_0, y_pred_1, y_pred_2, y_pred_3, data_set)
# Compare against the true training labels; as the last expression, its value (if any) is displayed
counting_errors(y_pred_train, data_y)

In [None]:
# Relative path to the test CSV
DATA_TEST_PATH = "../data/test.csv"
# Same loader as for the training file; the first returned value is discarded here
_, data_test_set, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# EDA for each class of the *test* features.
# Dedicated names are used on purpose: rebinding class_0..class_3 (and y_pred_0..y_pred_3) to
# test data would overwrite the training variables and silently corrupt any re-run of the
# training or train-error cells.
class_0_test, class_1_test, class_2_test, class_3_test = EDA_class(data_test_set)

# Get the predictions on the test set, each class with the weights trained for that class
y_pred_test_0 = predict_labels(w_0, class_0_test)
y_pred_test_1 = predict_labels(w_1, class_1_test)
y_pred_test_2 = predict_labels(w_2, class_2_test)
y_pred_test_3 = predict_labels(w_3, class_3_test)

# Merge the four per-class predictions into the single vector used for the submission
y_pred = rebuild_y(y_pred_test_0, y_pred_test_1, y_pred_test_2, y_pred_test_3, data_test_set)

In [None]:
OUTPUT_PATH = '../data/test_prediction_submission.csv' # name of the output file for the submission
# Write the test-set event ids together with their predicted labels to OUTPUT_PATH
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)