# MACHINE LEARNING - PROJECT 1

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from implementations import *

## LOAD THE TRAINING DATA INTO FEATURE MATRIX, CLASS LABELS and EVENT IDS

In [2]:
%%time
# Load train data and supply path
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

CPU times: user 8.44 s, sys: 556 ms, total: 8.99 s
Wall time: 9.15 s


## FEATURES ENGINEERING & DATA PROCESSING

In [3]:
#constants definitions
PRI_JET_NUM_IDX = 22   
PRI_JET_NUM_VALUES = range(4)
NUMBER_GROUPS = len(PRI_JET_NUM_VALUES)
NBR_FEATURES = 30
UNDEFINED_VALUE = -999.
print_results = True

In [4]:
#seperating the data within the four groups
jet_groups_indices = [tX[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
TX_arr = [tX[group_indices] for group_indices in jet_groups_indices]
Y_arr, TX_arr = zip(*[(y[group_indices], tX[group_indices]) for group_indices in jet_groups_indices])
Y_arr, TX_arr = list(Y_arr), list(TX_arr)

#collecting the indices of the undefined features for each group
undefined_features = [[], [], [], []]
for group_idx in range(NUMBER_GROUPS):
    tx = TX_arr[group_idx]
    for feature_idx in range(NBR_FEATURES):
        feature_column = tx[:, feature_idx]
        if np.all(feature_column == UNDEFINED_VALUE):
            undefined_features[group_idx].append(feature_idx)

#computing the std of the features for each group
STDS = [np.std(TX_arr[i], axis = 0) for i in range(NUMBER_GROUPS)]

#collecting the indices of the features with no variance (i.e. constant features) within each groups
cst_features = [[], [], [], []]
for group_idx, elem in enumerate(STDS):
    for feature_idx, std in enumerate(elem):
        if std == 0. and feature_idx not in undefined_features[group_idx]:
            cst_features[group_idx].append(feature_idx)

#deleting the features either undefined or with no variance (i.e. constant features) within each groups
features_to_keep = ([[x for x in range(NBR_FEATURES) 
                      if x not in undefined_features[group_idx] and x not in cst_features[group_idx]] 
                      for group_idx in range(NUMBER_GROUPS)])
TX_arr = [TX_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

#standardizing the data
for tx in TX_arr:
    tx -= np.mean(tx, axis = 0)
    tx /= np.std(tx, axis = 0)

In [5]:
# Print the remaining number of features for each JET NUMBER
for i in PRI_JET_NUM_VALUES:
    print(f"Number of features for group {i} : ", len(features_to_keep[i]))

Number of features for group 0 :  18
Number of features for group 1 :  22
Number of features for group 2 :  29
Number of features for group 3 :  29


In [6]:
print(features_to_keep[0])
print(features_to_keep[1])
print(features_to_keep[2])
print(features_to_keep[3])

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21]
[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 29]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29]


In [7]:
print(jet_groups_indices[0])
y_p = np.empty(250_000)
y_p[jet_groups_indices[0]] = -1
print(y_p)

[False False False ... False  True  True]
[ 0.  0.  0. ...  0. -1. -1.]


## Train Data with different models

### LEAST SQUARES WITH GRADIENT DESCENT

In [8]:
%%time

gammas = [0.01, 0.001, 0.0001, 0.00001]
max_iters = 2000

# Iterate over some gammas to find the best possible values
for gamma in gammas:
    print(f"Gamma = {gamma}")
    
    # Iterate over all jet numbers dataframes
    for i in range(len(features_to_keep)):
        initial_w = np.zeros(len(features_to_keep[i]))
        y = Y_arr[i]
        tx = TX_arr[i]
        weights, loss = least_squares_GD(y, tx, initial_w, max_iters, gamma)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")

Gamma = 0.01
    For JET_NB 0, the obtained loss is 0.39229353135155837
    For JET_NB 1, the obtained loss is 0.4132027466118129
    For JET_NB 2, the obtained loss is 0.3551679496212518
    For JET_NB 3, the obtained loss is 0.4356752189969854
Gamma = 0.001
    For JET_NB 0, the obtained loss is 0.4071890187854611
    For JET_NB 1, the obtained loss is 0.42394948381609865
    For JET_NB 2, the obtained loss is 0.3670042247722906
    For JET_NB 3, the obtained loss is 0.44420031191889164
Gamma = 0.0001
    For JET_NB 0, the obtained loss is 0.4443662318323023
    For JET_NB 1, the obtained loss is 0.4574097786010708
    For JET_NB 2, the obtained loss is 0.41216640807377397
    For JET_NB 3, the obtained loss is 0.4713233200193401
Gamma = 1e-05
    For JET_NB 0, the obtained loss is 0.49126372563414994
    For JET_NB 1, the obtained loss is 0.4930816662458995
    For JET_NB 2, the obtained loss is 0.48454918310149103
    For JET_NB 3, the obtained loss is 0.49572853378650933
CPU times

### LEAST SQUARES  WITH STOCHASTIC GRADIENT DESCENT

In [9]:
%%time

gammas = [0.1, 0.01, 0.001, 0.0001, 0.00001]
max_iters = 1500
batch_size = 1

# Iterate over some gammas to find the best possible values
for gamma in gammas:
    print(f"Gamma = {gamma}")
    
    # Iterate over all jet numbers dataframes
    for i in range(len(features_to_keep)):
        initial_w = np.zeros(len(features_to_keep[i]))
        y = Y_arr[i]
        tx = TX_arr[i]
        weights, loss = least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")

Gamma = 0.1
    For JET_NB 0, the obtained loss is 1406874.6402069617
    For JET_NB 1, the obtained loss is 8.905399313440892e+44
    For JET_NB 2, the obtained loss is 2.62722134033137e+92
    For JET_NB 3, the obtained loss is 3.6558229573636576e+99
Gamma = 0.01
    For JET_NB 0, the obtained loss is 0.46456667812924324
    For JET_NB 1, the obtained loss is 0.48520943306089004
    For JET_NB 2, the obtained loss is 0.40874983381282415
    For JET_NB 3, the obtained loss is 0.5521115844435248
Gamma = 0.001
    For JET_NB 0, the obtained loss is 0.41290183723982854
    For JET_NB 1, the obtained loss is 0.4297899798480046
    For JET_NB 2, the obtained loss is 0.3730644250016471
    For JET_NB 3, the obtained loss is 0.45520388622179947
Gamma = 0.0001
    For JET_NB 0, the obtained loss is 0.45264898655691643
    For JET_NB 1, the obtained loss is 0.4645855495261453
    For JET_NB 2, the obtained loss is 0.42308955262164477
    For JET_NB 3, the obtained loss is 0.4773245748345276
Ga

### LEAST SQUARES

In [10]:
for i in range(len(features_to_keep)):
    initial_w = np.zeros(len(features_to_keep[i]))
    y = Y_arr[i]
    tx = TX_arr[i]
    weights, loss = least_squares(y, tx)
    print(f"For JET_NB {i}, the obtained loss is {loss}")

For JET_NB 0, the obtained loss is 0.3919078212020878
For JET_NB 1, the obtained loss is 0.41308946507999145
For JET_NB 2, the obtained loss is 0.3549970852299624
For JET_NB 3, the obtained loss is 0.43547554636124286


### RIDGE REGRESSION

In [26]:
%%time

seed=15
degrees=[2,3,4]
k_fold=4
lambdas = np.logspace(-4, 0, 30)

for i in range(len(features_to_keep)):
    y = Y_arr[i]
    tx = TX_arr[i]
    degree, lambda_ = cross_validation_demo_ridge(y, tx, seed, degrees, k_fold, lambdas)
    print(f"For JET_NB {i}, the obtained best degree is {degree} and lambda is {lambda_}")

    min loss for a 2 polynomial expansion feature = 0.7578018775824853
    min loss for a 3 polynomial expansion feature = 1.3711918852901765
    min loss for a 4 polynomial expansion feature = 95.74535465786285
For JET_NB 0, the obtained best degree is 2 and lambda is 0.14873521072935117
    min loss for a 2 polynomial expansion feature = 0.8269923159385287
    min loss for a 3 polynomial expansion feature = 0.8213430080153667
    min loss for a 4 polynomial expansion feature = 0.826324413690626
For JET_NB 1, the obtained best degree is 3 and lambda is 0.0001
    min loss for a 2 polynomial expansion feature = 0.8077329060928659
    min loss for a 3 polynomial expansion feature = 0.7977741061974317
    min loss for a 4 polynomial expansion feature = 0.792004171469733
For JET_NB 2, the obtained best degree is 4 and lambda is 0.01610262027560939
    min loss for a 2 polynomial expansion feature = 0.8190071169100009
    min loss for a 3 polynomial expansion feature = 0.8073636374533862
 

### LOGISTIC REGRESSION

In [None]:
%%time

max_iters=10000
gamma = 0.05

for i in range(len(features_to_keep)):
    y = Y_arr[i]
    tx = TX_arr[i]
    weights, loss = logistic_regression(y, tx, max_iters, gamma)
    print(f"For JET_NB {i}, the obtained loss is {loss}")

In [38]:
%%time

max_iters=10000
seed=15
gamma = 0.05
degrees=[2, 3, 4]
k_fold=4
gammas = [0.1, 0.01, 0.001, 0.0001]

for i in range(len(features_to_keep)):
    y = Y_arr[i]
    tx = TX_arr[i]
    degree, lambda_, rmse_min = cross_validation_demo_logistic(y, tx, max_iters, seed, degrees, k_fold, gammas)
    print(f"For JET_NB {i}, the obtained best degree is {degree} and lambda is {lambda_} and rsme is {rmse_min}")

    Current iteration=0, loss=51940.290828078825


  return 1.0 / (1 + np.exp(-t))


ValueError: shapes (74934,37) and (38,1) not aligned: 37 (dim 1) != 38 (dim 0)

### REGULARIZED LOGISTIC REGRESSION

In [None]:
%%time

params = []

for idx in range(4):
    y=np.array(Y_arr[idx])
    tX=np.array(TX_arr[idx])
    initial_w = np.zeros(len(features_to_keep[idx]))
    seed=15
    degrees=[2, 4, 7]
    k_fold=4
    max_iters=200
    lambdas = np.logspace(-4, 0, 10)
    gammas = [0.01,0.001]
    
    tuple_ = cross_validation_demo_reg_logistic(y, tX, max_iters, seed, degrees, k_fold, lambdas, gammas)
    params.append(tuple_)
    print("group ",idx, " tuple : ", tuple_)

## Submission

In [12]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [13]:
jet_groups_indices_test = [tX_test[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
TX_test_arr = list([tX_test[group_indices] for group_indices in jet_groups_indices_test])

#remove not used features
TX_test_arr = [TX_test_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

#standardizing the data
for tx in TX_test_arr:
    tx -= np.mean(tx, axis = 0)
    tx /= np.std(tx, axis = 0)

In [16]:
%%time

W_arr = []
y_pred = np.empty(len(tX_test))

for idx in range(4):
    #weight, loss = least_squares_GD(Y_arr[idx], TX_arr[idx], np.zeros(TX_arr[idx].shape[1]), 10000, 0.001)
    #weight, loss = least_squares(Y_arr[idx], TX_arr[idx])
    #weight, loss = ridge_regression(Y_arr[idx], TX_arr[idx], 0.01)
    y_pred[jet_groups_indices_test[idx]] = predict_labels(weight, TX_test_arr[idx])
print(y_pred)

[-1. -1.  1. ...  1.  1. -1.]
CPU times: user 8min 3s, sys: 28.7 s, total: 8min 32s
Wall time: 2min 8s


In [17]:
# TODO: fill in desired name of output file for submission
OUTPUT_PATH = '../data/sample-submission.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
Q = np.array([1, 2, 3])
l = np.array([True, False, True, False, False, True])
A = np.zeros(6)
A[l] = Q
print(A)

## Generate predictions and save ouput in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

### PREPARE TEST DATA

In [None]:
jet_groups_indices = [tX_test[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
TX_test_arr = list([tX_test[group_indices] for group_indices in jet_groups_indices])


In [None]:
# TODO: fill in desired name of output file for submission
OUTPUT_PATH = '../data/sample-submission.csv'
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)