# MACHINE LEARNING - PROJECT 1

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from implementations import *

## LOAD THE TRAINING DATA INTO FEATURE MATRIX, CLASS LABELS and EVENT IDS

In [4]:
%%time
# Load the training set. The path is relative to the notebook's working directory.
# load_csv_data is one of the star-imported project helpers; its three return
# values are unpacked as the class labels `y`, the feature matrix `tX` and the
# event ids `ids` (naming follows the section heading above).
# NOTE(review): the exact dtype/shape of each returned array is defined by the
# helper, not here — confirm against its implementation.
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

CPU times: user 8.25 s, sys: 625 ms, total: 8.88 s
Wall time: 8.97 s


## FEATURE ENGINEERING & DATA PROCESSING

In [5]:
# Constants shared by the data-processing and training cells below.
PRI_JET_NUM_IDX = 22   # column of tX that is compared against the jet values to split the samples
PRI_JET_NUM_VALUES = range(4)  # the values 0, 1, 2, 3; one group of samples is built per value
NUMBER_GROUPS = len(PRI_JET_NUM_VALUES)  # number of per-jet groups (4)
NBR_FEATURES = 30  # number of feature columns of tX that are scanned/kept by the processing cell
UNDEFINED_VALUE = -999.  # sentinel value; a column made only of this value in a group is dropped
print_results = True  # NOTE(review): not referenced by any cell visible in this notebook

In [6]:
# Split the samples into one group per jet value, drop the columns that carry no
# information inside a group, then standardise what is left.
# Inputs : y, tX (from the loading cell) and the constants cell.
# Outputs: jet_groups_indices, Y_arr, TX_arr, undefined_features, STDS,
#          cst_features, features_to_keep.

# The column scans below cover every column of tX, so it must match NBR_FEATURES.
assert tX.shape[1] == NBR_FEATURES, "tX does not have NBR_FEATURES columns"

# separating the data within the four groups
# (boolean-mask indexing returns copies, so tX and y themselves are never modified)
jet_groups_indices = [tX[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
Y_arr = [y[group_indices] for group_indices in jet_groups_indices]
TX_arr = [tX[group_indices] for group_indices in jet_groups_indices]

# collecting the indices of the undefined features for each group
# (a feature is undefined in a group when every sample holds UNDEFINED_VALUE)
undefined_features = [
    np.flatnonzero(np.all(tx == UNDEFINED_VALUE, axis=0)).tolist()
    for tx in TX_arr
]

# computing the std of the features for each group
STDS = [np.std(tx, axis=0) for tx in TX_arr]

# collecting the indices of the features with no variance (i.e. constant features) within each group;
# fully undefined columns are reported separately above, hence the exclusion
cst_features = [
    [feature_idx for feature_idx, std in enumerate(group_stds)
     if std == 0. and feature_idx not in undefined_features[group_idx]]
    for group_idx, group_stds in enumerate(STDS)
]

# deleting the features either undefined or with no variance (i.e. constant features) within each group
features_to_keep = [
    [x for x in range(NBR_FEATURES)
     if x not in undefined_features[group_idx] and x not in cst_features[group_idx]]
    for group_idx in range(NUMBER_GROUPS)
]
TX_arr = [TX_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

# standardizing the data (zero mean, unit std per column, computed within each group);
# every remaining column has a non-zero std because constant columns were removed above
for tx in TX_arr:
    tx -= np.mean(tx, axis=0)
    tx /= np.std(tx, axis=0)

In [7]:
# Report how many feature columns survive the filtering step for every jet group
for i, kept_features in zip(PRI_JET_NUM_VALUES, features_to_keep):
    nbr_kept = len(kept_features)
    print(f"Number of features for group {i} : ", nbr_kept)

Number of features for group 0 :  18
Number of features for group 1 :  22
Number of features for group 2 :  29
Number of features for group 3 :  29


## Train Data with different models

### LEAST SQUARES WITH GRADIENT DESCENT

In [10]:
%%time

gammas = [0.01, 0.001, 0.0001, 0.00001]
max_iters = 500

avg_loss_per_gamma = []
# Iterate over some gammas to find the best possible values
for gamma in gammas:
    avg_loss = []
    print(f"Gamma = {gamma}")
    
    # Iterate over all jet numbers dataframes
    for i in range(len(features_to_keep)):
        initial_w = np.zeros(len(features_to_keep[i]))
        y = Y_arr[i]
        tx = TX_arr[i]
        weights, loss = least_squares_GD(y, tx, initial_w, max_iters, gamma)
        avg_loss.append(loss)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")
    avg_loss_per_gamma.append(np.mean(avg_loss))

print(f"\n-> THE BEST GAMMA TO CHOOSE SEEMS TO BE {gammas[np.argmin(avg_loss_per_gamma)]} \n")

Gamma = 0.01
    For JET_NB 0, the obtained loss is 0.3989867162319593
    For JET_NB 1, the obtained loss is 0.4172543921141015
    For JET_NB 2, the obtained loss is 0.35971244450865575
    For JET_NB 3, the obtained loss is 0.4395418755314808
Gamma = 0.001
    For JET_NB 0, the obtained loss is 0.4209486170953338
    For JET_NB 1, the obtained loss is 0.43812907655375566
    For JET_NB 2, the obtained loss is 0.3830215434214012
    For JET_NB 3, the obtained loss is 0.45600683886462684
Gamma = 0.0001
    For JET_NB 0, the obtained loss is 0.47987265246475413
    For JET_NB 1, the obtained loss is 0.48424585242269846
    For JET_NB 2, the obtained loss is 0.46531453678492685
    For JET_NB 3, the obtained loss is 0.49008046650885867
Gamma = 1e-05
    For JET_NB 0, the obtained loss is 0.4977224341467849
    For JET_NB 1, the obtained loss is 0.49818349111105975
    For JET_NB 2, the obtained loss is 0.4959137154992878
    For JET_NB 3, the obtained loss is 0.4988905281880708

-> THE 

### LEAST SQUARES  WITH STOCHASTIC GRADIENT DESCENT

In [11]:
%%time

gammas = [0.01, 0.001, 0.0001, 0.00001]
max_iters = 500
batch_size = 1

avg_loss_per_gamma = []
# Iterate over some gammas to find the best possible values
for gamma in gammas:
    avg_loss = []
    print(f"Gamma = {gamma}")
    
    # Iterate over all jet numbers dataframes
    for i in range(len(features_to_keep)):
        initial_w = np.zeros(len(features_to_keep[i]))
        y = Y_arr[i]
        tx = TX_arr[i]
        weights, loss = least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma)
        avg_loss.append(loss)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")
    avg_loss_per_gamma.append(np.mean(avg_loss))

print(f"\n-> THE BEST GAMMA TO CHOOSE SEEMS TO BE {gammas[np.argmin(avg_loss_per_gamma)]} \n")

Gamma = 0.01
    For JET_NB 0, the obtained loss is 0.45694826946743644
    For JET_NB 1, the obtained loss is 0.4901737306820593
    For JET_NB 2, the obtained loss is 0.47608662667843166
    For JET_NB 3, the obtained loss is 0.5781503319577728
Gamma = 0.001
    For JET_NB 0, the obtained loss is 0.42260153904639103
    For JET_NB 1, the obtained loss is 0.44300561545234957
    For JET_NB 2, the obtained loss is 0.383789183337023
    For JET_NB 3, the obtained loss is 0.45988551755192514
Gamma = 0.0001
    For JET_NB 0, the obtained loss is 0.4802967581122349
    For JET_NB 1, the obtained loss is 0.48439093309201486
    For JET_NB 2, the obtained loss is 0.46424161843131223
    For JET_NB 3, the obtained loss is 0.4881727534034623
Gamma = 1e-05
    For JET_NB 0, the obtained loss is 0.4978633333937563
    For JET_NB 1, the obtained loss is 0.49854792876276294
    For JET_NB 2, the obtained loss is 0.49647903417700195
    For JET_NB 3, the obtained loss is 0.498722928012306

-> THE B

### LEAST SQUARES

In [12]:
# Closed-form least squares fitted independently on each jet group.
for i in range(NUMBER_GROUPS):
    # Dedicated names for the group data: rebinding the notebook-level `y`
    # here would leave later cells with the labels of the last group only.
    y_group, tx_group = Y_arr[i], TX_arr[i]
    # No initial weights are needed: least_squares only takes the labels and features.
    weights, loss = least_squares(y_group, tx_group)
    print(f"For JET_NB {i}, the obtained loss is {loss:>15}")

For JET_NB 0, the obtained loss is 0.3919078212020878
For JET_NB 1, the obtained loss is 0.41308946507999145
For JET_NB 2, the obtained loss is 0.3549970852299624
For JET_NB 3, the obtained loss is 0.43547554636124286


### RIDGE REGRESSION

In [None]:
# Ridge regression fitted independently on each jet group.
# NOTE(review): assumes the project module exposes ridge_regression(y, tx, lambda_)
# returning (weights, loss) like least_squares does — confirm in implementations.py.
# `lambda` is a reserved Python keyword, hence the trailing underscore.
lambda_ = 1e-5  # regularisation strength; placeholder value, to be tuned
for i in range(NUMBER_GROUPS):
    # Results use their own names so that `weights`/`loss` from other cells are kept.
    ridge_weights, ridge_loss = ridge_regression(Y_arr[i], TX_arr[i], lambda_)
    print(f"For JET_NB {i}, the obtained loss is {ridge_loss:>15}")

### LOGISTIC REGRESSION

### REGULARIZED LOGISTIC REGRESSION

In [None]:
# Cross-validation of logistic regression on the full (ungrouped) feature matrix.
# The helper comes from the star import in the first cell, so it is not re-imported here.
seed = 12
degrees = list(range(1, 9))
k_fold = 4
gammas = [0.1, 0.01, 0.001, 0.0001]
initial_w = np.zeros(NBR_FEATURES)  # one weight per column of tX
max_iters = 150

# Rebuild the label vector aligned with tX from the per-group labels: the training
# cells above may have rebound `y` to a single group, which no longer matches tX.
assert np.logical_or.reduce(jet_groups_indices).all(), "some samples belong to no jet group"
y_all = np.empty(tX.shape[0], dtype=Y_arr[0].dtype)
for group_indices, y_group in zip(jet_groups_indices, Y_arr):
    y_all[group_indices] = y_group

# NOTE(review): the argument order is kept exactly as in the original call; the
# signature of cross_validation_demo_logistic is not visible from this notebook.
cross_validation_demo_logistic(y_all, tX, initial_w, max_iters, seed, degrees, k_fold, gammas)

## Generate predictions and save output in csv format for submission:

In [None]:
# Load the test set with the same helper as the training set. The first returned
# value (the label slot) is discarded; only the features and the event ids are kept.
DATA_TEST_PATH = '../data/test.csv' # the test file must have been downloaded to this path beforehand
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Predict the test labels group by group and write the submission file.
# Each jet group has its own feature subset and its own model, so the test
# samples go through the same split / column selection / scaling as the
# training samples before being scored.
OUTPUT_PATH = '../data/sample-submission.csv'  # name of the output file for submission

# NaN marks the samples that have not been predicted yet (checked below).
y_pred = np.full(tX_test.shape[0], np.nan)
for i, pri_jet_num_value in enumerate(PRI_JET_NUM_VALUES):
    kept_features = features_to_keep[i]

    # Scale the test data with the statistics of the raw training group; TX_arr
    # is already standardised, so they are recomputed from tX here.
    train_group_raw = tX[jet_groups_indices[i]][:, kept_features]
    train_mean = np.mean(train_group_raw, axis=0)
    train_std = np.std(train_group_raw, axis=0)

    # Closed-form least squares gave the lowest training loss among the models
    # run above; swap in another trainer here if a better model is selected.
    group_weights, _ = least_squares(Y_arr[i], TX_arr[i])

    test_group_indices = tX_test[:, PRI_JET_NUM_IDX] == pri_jet_num_value
    tx_test_group = (tX_test[test_group_indices][:, kept_features] - train_mean) / train_std

    # NOTE(review): assumes predict_labels(weights, data) returns one label per
    # row of `data` — confirm in proj1_helpers.py.
    y_pred[test_group_indices] = predict_labels(group_weights, tx_test_group)

assert not np.isnan(y_pred).any(), "some test samples belong to no jet group"

create_csv_submission(ids_test, y_pred, OUTPUT_PATH)