# MACHINE LEARNING - PROJECT 1

In [2]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from implementations import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## LOAD THE TRAINING DATA INTO FEATURE MATRIX, CLASS LABELS and EVENT IDS

In [19]:
%%time
# Path to the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# Unpacked as labels, feature matrix and event ids (naming follows the section
# heading above). load_csv_data comes from one of the star imports in the first
# cell; its implementation is not visible in this notebook.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

CPU times: user 9.19 s, sys: 806 ms, total: 10 s
Wall time: 10.4 s


## FEATURE ENGINEERING & DATA PROCESSING

In [20]:
# Constants shared by the preprocessing and training cells below.

# Column of the feature matrix whose value decides which group an event goes to.
PRI_JET_NUM_IDX = 22
# Values that column is compared against; one group is built per value.
PRI_JET_NUM_VALUES = range(4)
NUMBER_GROUPS = len(PRI_JET_NUM_VALUES)
# Number of feature columns scanned when looking for undefined/constant features.
NBR_FEATURES = 30
# Sentinel marking an undefined measurement; a column made only of this value
# is dropped for the group concerned.
UNDEFINED_VALUE = -999.
# NOTE(review): not read by any cell visible in this notebook - confirm it is still needed.
print_results = True

In [23]:
# Split the training data into one group per PRI_jet_num value, drop the
# features that carry no information inside a group, then standardize each group.
#
# Inputs  : y, tX (raw training labels / features) and the constants cell.
# Outputs : jet_groups_indices, Y_arr, TX_arr, undefined_features, STDS,
#           cst_features, features_to_keep, TRAIN_MEANS, TRAIN_STDS.

# Fail fast if a later cell has overwritten `y` / `tX` (hidden notebook state):
# the value-based grouping below only makes sense on the raw training data.
assert tX.ndim == 2 and tX.shape[1] == NBR_FEATURES, \
    "tX is not the raw feature matrix; re-run the data-loading cell"
assert len(y) == tX.shape[0], \
    "y and tX have different numbers of rows; re-run the data-loading cell"

# Separating the data within the groups: one boolean row mask per jet number.
jet_groups_indices = [tX[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
Y_arr = [y[group_indices] for group_indices in jet_groups_indices]
TX_arr = [tX[group_indices] for group_indices in jet_groups_indices]

# Collecting the indices of the undefined features for each group, i.e. the
# columns that contain nothing but UNDEFINED_VALUE.
undefined_features = [
    [feature_idx for feature_idx in range(NBR_FEATURES)
     if np.all(group_tx[:, feature_idx] == UNDEFINED_VALUE)]
    for group_tx in TX_arr
]

# Computing the std of the (raw, unfiltered) features for each group.
STDS = [np.std(group_tx, axis=0) for group_tx in TX_arr]

# Collecting the indices of the features with no variance (i.e. constant
# features) within each group, excluding those already listed as undefined.
cst_features = [
    [feature_idx for feature_idx, std in enumerate(group_stds)
     if std == 0. and feature_idx not in undefined_features[group_idx]]
    for group_idx, group_stds in enumerate(STDS)
]

# Deleting the features either undefined or with no variance within each group.
features_to_keep = [
    [x for x in range(NBR_FEATURES)
     if x not in undefined_features[group_idx] and x not in cst_features[group_idx]]
    for group_idx in range(NUMBER_GROUPS)
]
# Indexing columns with a list yields a copy, so the raw tX is left untouched.
TX_arr = [TX_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

# Standardizing the data. The per-group statistics are kept so that the very
# same shift/scale can later be applied to other data (e.g. a test set).
TRAIN_MEANS = [np.mean(group_tx, axis=0) for group_tx in TX_arr]
TRAIN_STDS = [np.std(group_tx, axis=0) for group_tx in TX_arr]
TX_arr = [(group_tx - mean) / std for group_tx, mean, std in zip(TX_arr, TRAIN_MEANS, TRAIN_STDS)]

In [6]:
# Report how many feature columns survive the filtering, one line per jet number.
for jet_nb in PRI_JET_NUM_VALUES:
    remaining = len(features_to_keep[jet_nb])
    print(f"Number of features for group {jet_nb} : ", remaining)

Number of features for group 0 :  18
Number of features for group 1 :  22
Number of features for group 2 :  29
Number of features for group 3 :  29


## Train different models on the data

### LEAST SQUARES WITH GRADIENT DESCENT

In [None]:
%%time

# Grid search over the step size of least squares with gradient descent.
# For every gamma, one model is fitted per jet-number group and the final
# losses are averaged over the groups; the gamma with the lowest average wins.
gammas = [0.01, 0.001, 0.0001, 0.00001]
max_iters = 500

avg_loss_per_gamma = []
# Iterate over some gammas to find the best possible values
for gamma in gammas:
    avg_loss = []
    print(f"Gamma = {gamma}")

    # Iterate over all jet number groups. Group-local names are used so that
    # the full label vector `y` loaded at the top of the notebook is not overwritten.
    for i, (y_group, tx_group) in enumerate(zip(Y_arr, TX_arr)):
        initial_w = np.zeros(tx_group.shape[1])
        weights, loss = least_squares_GD(y_group, tx_group, initial_w, max_iters, gamma)
        avg_loss.append(loss)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")
    avg_loss_per_gamma.append(np.mean(avg_loss))

# A diverging run can end with a NaN loss, and np.argmin would then select
# that NaN entry as the "minimum"; nanargmin skips NaN averages instead.
if np.all(np.isnan(avg_loss_per_gamma)):
    print("\n-> EVERY GAMMA PRODUCED A NaN LOSS, NO BEST GAMMA CAN BE CHOSEN \n")
else:
    print(f"\n-> THE BEST GAMMA TO CHOOSE SEEMS TO BE {gammas[np.nanargmin(avg_loss_per_gamma)]} \n")

### LEAST SQUARES  WITH STOCHASTIC GRADIENT DESCENT

In [None]:
%%time

# Grid search over the step size of least squares with stochastic gradient
# descent. For every gamma, one model is fitted per jet-number group and the
# final losses are averaged over the groups; the lowest average wins.
# NOTE(review): no random seed is set in this notebook; if least_squares_SGD
# draws its batches randomly the printed losses will vary between runs - confirm.
gammas = [0.1, 0.01, 0.001, 0.0001, 0.00001]
max_iters = 500
batch_size = 1

avg_loss_per_gamma = []
# Iterate over some gammas to find the best possible values
for gamma in gammas:
    avg_loss = []
    print(f"Gamma = {gamma}")

    # Iterate over all jet number groups. Group-local names are used so that
    # the full label vector `y` loaded at the top of the notebook is not overwritten.
    for i, (y_group, tx_group) in enumerate(zip(Y_arr, TX_arr)):
        initial_w = np.zeros(tx_group.shape[1])
        weights, loss = least_squares_SGD(y_group, tx_group, initial_w, batch_size, max_iters, gamma)
        avg_loss.append(loss)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")
    avg_loss_per_gamma.append(np.mean(avg_loss))

# A diverging run can end with a NaN loss, and np.argmin would then select
# that NaN entry as the "minimum"; nanargmin skips NaN averages instead.
if np.all(np.isnan(avg_loss_per_gamma)):
    print("\n-> EVERY GAMMA PRODUCED A NaN LOSS, NO BEST GAMMA CAN BE CHOSEN \n")
else:
    print(f"\n-> THE BEST GAMMA TO CHOOSE SEEMS TO BE {gammas[np.nanargmin(avg_loss_per_gamma)]} \n")

### LEAST SQUARES

In [None]:
# Fit least squares on every jet-number group and report the loss.
# Group-local names keep the full label vector `y` loaded at the top of the
# notebook intact; the unused initial weight vector has been dropped because
# least_squares is called with the labels and features only.
for i, (y_group, tx_group) in enumerate(zip(Y_arr, TX_arr)):
    weights, loss = least_squares(y_group, tx_group)
    print(f"For JET_NB {i}, the obtained loss is {loss:>15}")

### RIDGE REGRESSION

In [None]:
%%time

# Cross-validate ridge regression on the first group only (index 0 of
# Y_arr / TX_arr). The copies get their own names so that the full label
# vector `y` loaded at the top of the notebook is not overwritten by this cell.
y_ridge = np.array(Y_arr[0])
x_ridge = np.array(TX_arr[0])
seed = 15
degrees = [2, 4, 6, 7]
k_fold = 4
# 30 candidate regularization strengths, log-spaced between 1e-4 and 1.
lambdas = np.logspace(-4, 0, 30)

cross_validation_demo_ridge(y_ridge, x_ridge, seed, degrees, k_fold, lambdas)

### LOGISTIC REGRESSION

In [14]:
%%time

# First cross-validation pass for logistic regression, one run per group.
# The settings are identical for every group, so they are defined once.
seed = 15
degrees = [2, 4, 7]
k_fold = 4
max_iters = 200
gammas = [0.01, 0.001]

# One entry per group: whatever cross_validation_demo_logistic returns
# (presumably the selected (degree, gamma) pair, judging by the printed
# output - TODO confirm).
params = []

for idx in range(NUMBER_GROUPS):
    # Group-local names: the original cell reused `y` and `tX`, which replaced
    # the raw training data with one group's standardized subset and broke any
    # re-run of the preprocessing cell.
    y_group = np.array(Y_arr[idx])
    tx_group = np.array(TX_arr[idx])

    tuple_ = cross_validation_demo_logistic(y_group, tx_group, max_iters, seed, degrees, k_fold, gammas)
    params.append(tuple_)
    print("group ",idx, " tuple : ", tuple_)
    

group  0  tuple :  (2, 0.01)
group  1  tuple :  (2, 0.01)
group  2  tuple :  (2, 0.01)
group  3  tuple :  (4, 0.01)
CPU times: user 9min 32s, sys: 21 s, total: 9min 53s
Wall time: 2min 52s


In [15]:
%%time

# Second cross-validation pass for logistic regression with a different grid
# of degrees and step sizes, one run per group.
# The settings are identical for every group, so they are defined once.
seed = 15
degrees = [1, 2, 3, 4]
k_fold = 4
max_iters = 200
gammas = [0.05, 0.01, 0.015]

# One entry per group: whatever cross_validation_demo_logistic returns
# (presumably the selected (degree, gamma) pair, judging by the printed
# output - TODO confirm).
params_ = []

for idx in range(NUMBER_GROUPS):
    # Group-local names so the raw `y` / `tX` training data are not overwritten.
    y_group = np.array(Y_arr[idx])
    tx_group = np.array(TX_arr[idx])

    tuple_ = cross_validation_demo_logistic(y_group, tx_group, max_iters, seed, degrees, k_fold, gammas)
    # Store in this cell's own list: the original appended to `params`, which
    # left `params_` empty and mixed both passes into the first cell's results.
    params_.append(tuple_)
    print("group ",idx, " tuple : ", tuple_)
    

group  0  tuple :  (1, 0.05)
group  1  tuple :  (1, 0.05)
group  2  tuple :  (1, 0.05)
group  3  tuple :  (1, 0.05)
CPU times: user 15min, sys: 40.2 s, total: 15min 40s
Wall time: 4min 40s


### REGULARIZED LOGISTIC REGRESSION

## Generate predictions and save output in CSV format for submission:

In [25]:
# Path to the test CSV, relative to the notebook's working directory.
# NOTE: the test data must have been downloaded to this location beforehand.
DATA_TEST_PATH = '../data/test.csv'
# The first returned value is discarded; presumably the label column, which is
# not needed for the test set - TODO confirm against load_csv_data.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

### PREPARE TEST DATA

In [27]:
# Boolean row masks over the test set, one per jet number value. This reuses
# (and therefore replaces) the name that held the training masks.
jet_groups_indices = [
    tX_test[:, PRI_JET_NUM_IDX] == jet_nb
    for jet_nb in PRI_JET_NUM_VALUES
]
# Test rows of each group, with all original columns.
# NOTE(review): unlike the training groups, these are neither reduced to
# features_to_keep nor standardized - confirm before feeding them to a model.
TX_test_arr = [tX_test[mask] for mask in jet_groups_indices]

In [None]:
# Destination of the submission CSV.
# TODO: fill in desired name of output file for submission
OUTPUT_PATH = '../data/sample-submission.csv'
# NOTE(review): `weights` is whatever the most recently executed training cell
# left behind. Those cells fit one jet-number group at a time on that group's
# kept columns (18 to 29 of them, per the printed feature counts), whereas
# tX_test is the full, unsplit and unstandardized test matrix. TX_test_arr is
# built in the previous cell but not used here. Confirm the dimensions agree;
# a per-group prediction is most likely what is intended.
y_pred = predict_labels(weights, tX_test)

# Writes the event ids together with their predicted labels to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)