# MACHINE LEARNING - PROJECT 1

In [19]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from implementations import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## LOAD THE TRAINING DATA INTO FEATURE MATRIX, CLASS LABELS and EVENT IDS

In [2]:
%%time
# Load the training set from the CSV file at DATA_TRAIN_PATH
# (the path is relative to the notebook's working directory).
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data is not defined in this notebook; it comes from one of the star
# imports in the first cell (presumably proj1_helpers — TODO confirm).
# Per the section heading: y = class labels, tX = feature matrix, ids = event ids.
# These three names are read by the preprocessing cell below.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

CPU times: user 8.06 s, sys: 529 ms, total: 8.59 s
Wall time: 8.73 s


## FEATURE ENGINEERING & DATA PROCESSING

In [3]:
# Constants shared by the preprocessing, training and prediction cells.

# Sentinel value: a column made up only of this value is dropped for the group.
UNDEFINED_VALUE = -999.
# Number of feature columns scanned in the feature matrix.
NBR_FEATURES = 30

# Column whose value decides which group a sample belongs to.
PRI_JET_NUM_IDX = 22
# One group is built for each of these values of that column.
PRI_JET_NUM_VALUES = range(0, 4)
NUMBER_GROUPS = len(PRI_JET_NUM_VALUES)

# Not referenced by any cell visible in this notebook.
print_results = True

In [4]:
# Split the training set into one group per PRI_jet_num value, drop the columns
# that carry no information inside a group, then standardize each group.
#
# Reads : y, tX (full training labels / features) and the constants cell.
# Writes: jet_groups_indices, Y_arr, TX_arr, undefined_features, STDS,
#         cst_features, features_to_keep, TRAIN_MEANS, TRAIN_STDS.

# One boolean row mask per group.
jet_groups_indices = [tX[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
# Boolean-mask indexing returns copies, so the in-place standardization below
# never modifies the original tX.
Y_arr = [y[group_indices] for group_indices in jet_groups_indices]
TX_arr = [tX[group_indices] for group_indices in jet_groups_indices]

# Per group: indices of the columns that contain nothing but UNDEFINED_VALUE.
undefined_features = [
    [feature_idx for feature_idx in range(NBR_FEATURES)
     if np.all(tx[:, feature_idx] == UNDEFINED_VALUE)]
    for tx in TX_arr
]

# Per group: standard deviation of every raw column (before any column is dropped).
STDS = [np.std(tx, axis=0) for tx in TX_arr]

# Per group: indices of the constant columns (zero std) that are not already
# listed as undefined.
cst_features = [
    [feature_idx for feature_idx, std in enumerate(group_stds)
     if std == 0. and feature_idx not in undefined_features[group_idx]]
    for group_idx, group_stds in enumerate(STDS)
]

# Per group: indices of the columns that are neither undefined nor constant.
features_to_keep = [
    [feature_idx for feature_idx in range(NBR_FEATURES)
     if feature_idx not in undefined_features[group_idx]
     and feature_idx not in cst_features[group_idx]]
    for group_idx in range(NUMBER_GROUPS)
]
TX_arr = [TX_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

# Standardize every group in place (zero mean, unit std per column) and keep the
# statistics that were used, so the exact same scaling can be applied to other
# data later on.
TRAIN_MEANS, TRAIN_STDS = [], []
for tx in TX_arr:
    group_mean = np.mean(tx, axis=0)
    tx -= group_mean
    group_std = np.std(tx, axis=0)  # constant columns were removed above, so no zero std is expected
    tx /= group_std
    TRAIN_MEANS.append(group_mean)
    TRAIN_STDS.append(group_std)

In [5]:
# Print the remaining number of features for each JET NUMBER
for i in PRI_JET_NUM_VALUES:
    print(f"Number of features for group {i} : ", len(features_to_keep[i]))

Number of features for group 0 :  18
Number of features for group 1 :  22
Number of features for group 2 :  29
Number of features for group 3 :  29


## Train Data with different models

### LEAST SQUARES WITH GRADIENT DESCENT

In [None]:
%%time

# Sweep the step size of gradient descent: for every gamma, fit one model per
# jet group and record the mean of the per-group losses.
gammas = [0.01, 0.001, 0.0001, 0.00001]
max_iters = 500

# avg_loss_per_gamma[k] is the unweighted mean of the group losses for gammas[k].
avg_loss_per_gamma = []
for gamma in gammas:
    print(f"Gamma = {gamma}")

    # Losses obtained with this gamma, one per jet group.
    group_losses = []
    for i in range(len(features_to_keep)):
        # Group-local names: rebinding the global `y` here would replace the full
        # label vector and break a re-run of the preprocessing cell.
        y_group = Y_arr[i]
        tx_group = TX_arr[i]
        initial_w = np.zeros(len(features_to_keep[i]))
        weights, loss = least_squares_GD(y_group, tx_group, initial_w, max_iters, gamma)
        group_losses.append(loss)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")
    avg_loss_per_gamma.append(np.mean(group_losses))

### LEAST SQUARES WITH STOCHASTIC GRADIENT DESCENT

In [None]:
%%time

# Sweep the step size of stochastic gradient descent: for every gamma, fit one
# model per jet group and record the mean of the per-group losses.
# NOTE(review): no random seed is set in this notebook before this cell; results
# are only reproducible if least_squares_SGD seeds its own sampling — confirm.
gammas = [0.1, 0.01, 0.001, 0.0001, 0.00001]
max_iters = 500
batch_size = 1

# avg_loss_per_gamma[k] is the unweighted mean of the group losses for gammas[k].
avg_loss_per_gamma = []
for gamma in gammas:
    print(f"Gamma = {gamma}")

    # Losses obtained with this gamma, one per jet group.
    group_losses = []
    for i in range(len(features_to_keep)):
        # Group-local names: rebinding the global `y` here would replace the full
        # label vector and break a re-run of the preprocessing cell.
        y_group = Y_arr[i]
        tx_group = TX_arr[i]
        initial_w = np.zeros(len(features_to_keep[i]))
        weights, loss = least_squares_SGD(y_group, tx_group, initial_w, batch_size, max_iters, gamma)
        group_losses.append(loss)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")
    avg_loss_per_gamma.append(np.mean(group_losses))

### LEAST SQUARES

In [8]:
# Fit one least-squares model per jet group and print its loss.
# The group data is passed directly: rebinding the global `y` here would replace
# the full label vector and break a re-run of the preprocessing cell.
# After the loop, `weights` and `loss` hold the values of the LAST group only.
for i in range(len(features_to_keep)):
    weights, loss = least_squares(Y_arr[i], TX_arr[i])
    print(f"For JET_NB {i}, the obtained loss is {loss:>15}")

For JET_NB 0, the obtained loss is 0.3919078212020878
For JET_NB 1, the obtained loss is 0.41308946507999145
For JET_NB 2, the obtained loss is 0.3549970852299624
For JET_NB 3, the obtained loss is 0.43547554636124286


### RIDGE REGRESSION

In [14]:
%%time

# Cross-validate ridge regression on jet group 0 only.
# Group-local names are used so the global `y` (full label vector) is not
# replaced by the labels of a single group; the preprocessing cell needs the
# full vector to be re-runnable.
y_group0 = np.array(Y_arr[0])
x_group0 = np.array(TX_arr[0])
seed = 15
degrees = [2, 4, 6, 7]
k_fold = 4
lambdas = np.logspace(-4, 0, 30)

# Kept as the last expression so the returned value is displayed by the cell
# (the recorded output is a 2-tuple, presumably (best degree, best lambda) — TODO confirm).
cross_validation_demo_ridge(y_group0, x_group0, seed, degrees, k_fold, lambdas)

min loss for 2 polynomial = 0.7578018775824853
min loss for 4 polynomial = 95.74535465786285
min loss for 6 polynomial = 390218.8905628959
min loss for 7 polynomial = 65755243.057918
##### [0.7578018775824853, 95.74535465786285, 390218.8905628959, 65755243.057918] ####
CPU times: user 7min 13s, sys: 32.6 s, total: 7min 46s
Wall time: 3min 6s


(2, 0.14873521072935117)

### LOGISTIC REGRESSION

In [None]:
%%time

# Cross-validate logistic regression separately on every jet group.

# Settings are the same for every group, so they are defined once.
seed = 15
degrees = [2, 3, 4]
k_fold = 4
max_iters = 200
gammas = [0.01, 0.001]

# params[g] is whatever cross_validation_demo_logistic returns for group g.
params = []
for idx in range(NUMBER_GROUPS):
    # Group-local names: rebinding the globals `y` / `tX` to one group's data
    # would replace the full training set and break a re-run of the
    # preprocessing cell.
    y_group = np.array(Y_arr[idx])
    tx_group = np.array(TX_arr[idx])

    best_params = cross_validation_demo_logistic(y_group, tx_group, max_iters, seed, degrees, k_fold, gammas)
    params.append(best_params)
    print("group ",idx, " tuple : ", best_params)
    

In [25]:
%%time

# Cross-validate logistic regression separately on every jet group
# (second run: degree 1 included, larger step sizes).

# Settings are the same for every group, so they are defined once.
seed = 15
degrees = [1, 2, 3, 4]
k_fold = 4
max_iters = 200
gammas = [0.01, 0.015]

# params[g] is whatever cross_validation_demo_logistic returns for group g
# (the recorded output shows a 3-tuple, presumably (degree, gamma, loss) — TODO confirm).
params = []
for idx in range(NUMBER_GROUPS):
    # Group-local names: rebinding the globals `y` / `tX` to one group's data
    # would replace the full training set and break a re-run of the
    # preprocessing cell.
    y_group = np.array(Y_arr[idx])
    tx_group = np.array(TX_arr[idx])

    best_params = cross_validation_demo_logistic(y_group, tx_group, max_iters, seed, degrees, k_fold, gammas)
    params.append(best_params)
    print("group ",idx, " tuple : ", best_params)
    

[126850.73014332053, 125731.85969884906, 125438.71946844856, 126613.72846460227]
[190276.35249969005, 188598.08450043623, 188158.33075151406, 189920.84357057163]
[23116901.55446681, 1914294.5624326975, 2259861.5372005464, 1868223.7580463174]
[34675352.34658103, 2871442.045282644, 3389792.474848621, 2802335.8437517267]
[2735377863.016903, 58140841.380610526, 141802071.09487492, 57451676.39932727]
[4103066881.0090804, 87211262.25061019, 212703106.65014157, 86177515.53485338]
[948209430988.16, 22208038397.88533, 100515305890.00699, 26305268937.433353]
[1422314146079.2725, 33312057595.276367, 150772958819.39117, 39457903405.9801]
##### [126158.7594438051, 7289820.353036594, 748193112.9729289, 274309511053.37143] ####
group  0  tuple :  (1, 0.01, 126158.7594438051)
[70092.8564155647, 70002.94863867326, 70186.04892550093, 70793.07619647379]
[105139.50126660198, 105004.63995335648, 105279.35809053139, 106189.96893680224]
[1151011.4052866239, 1103017.1157170166, 1132255.2595157586, 1226181.254

### REGULARIZED LOGISTIC REGRESSION

In [None]:
%%time

# Cross-validate regularized logistic regression separately on every jet group.

# Settings are the same for every group, so they are defined once.
seed = 15
degrees = [2, 4, 7]
k_fold = 4
max_iters = 200
lambdas = np.logspace(-4, 0, 10)
gammas = [0.01, 0.001]

# params[g] is whatever cross_validation_demo_reg_logistic returns for group g.
params = []
for idx in range(NUMBER_GROUPS):
    # Group-local names: rebinding the globals `y` / `tX` to one group's data
    # would replace the full training set and break a re-run of the
    # preprocessing cell.
    y_group = np.array(Y_arr[idx])
    tx_group = np.array(TX_arr[idx])

    best_params = cross_validation_demo_reg_logistic(y_group, tx_group, max_iters, seed, degrees, k_fold, lambdas, gammas)
    params.append(best_params)
    print("group ",idx, " tuple : ", best_params)

## Generate predictions and save output in CSV format for submission

In [None]:
# Load the test set. The first value returned by load_csv_data (the labels slot
# when loading the training set) is discarded here.
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here
# tX_test: test feature matrix, ids_test: event ids written to the submission file.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

### PREPARE TEST DATA

In [None]:
# Build one boolean row mask per PRI_jet_num value for the test set, then slice
# the raw test matrix into the matching groups.
# Note: this rebinds jet_groups_indices, which held the training masks until now.
jet_groups_indices = [tX_test[:, PRI_JET_NUM_IDX] == jet_value for jet_value in PRI_JET_NUM_VALUES]
TX_test_arr = [tX_test[row_mask] for row_mask in jet_groups_indices]

In [None]:
# TODO: fill in desired name of output file for submission
OUTPUT_PATH = '../data/sample-submission.csv'

# Predict group by group. The models in this notebook are fitted per jet group
# on a reduced, standardized set of columns, so a single weight vector cannot be
# applied to the raw test matrix: the column counts do not even match.
y_pred = np.zeros(tX_test.shape[0])
for group_idx, pri_jet_num_value in enumerate(PRI_JET_NUM_VALUES):
    test_mask = tX_test[:, PRI_JET_NUM_IDX] == pri_jet_num_value
    if not test_mask.any():
        continue  # no test sample falls in this group

    # Keep exactly the columns the training group kept.
    tx_test_group = tX_test[test_mask][:, features_to_keep[group_idx]]

    # Standardize the group with its own column statistics, the same way the
    # training groups are standardized; a zero std is replaced by 1 so a
    # constant test column does not produce NaNs.
    group_std = np.std(tx_test_group, axis=0)
    group_std[group_std == 0] = 1.
    tx_test_group = (tx_test_group - np.mean(tx_test_group, axis=0)) / group_std

    # Least-squares fit on the matching training group.
    group_weights, _ = least_squares(Y_arr[group_idx], TX_arr[group_idx])
    # assumes predict_labels returns one label per row of its data argument — TODO confirm
    y_pred[test_mask] = predict_labels(group_weights, tx_test_group)

create_csv_submission(ids_test, y_pred, OUTPUT_PATH)