# MACHINE LEARNING - PROJECT 1

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from implementations import *

## LOAD THE TRAINING DATA INTO FEATURE MATRIX, CLASS LABELS and EVENT IDS

In [2]:
%%time
# Location of the training CSV, relative to the notebook's directory.
DATA_TRAIN_PATH = "../data/train.csv"

# load_csv_data returns three values, bound here (in order) to:
# the class labels, the feature matrix and the event ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

CPU times: user 9.33 s, sys: 689 ms, total: 10 s
Wall time: 10.2 s


## FEATURE ENGINEERING & DATA PROCESSING

In [3]:
# --- Constants shared by the preprocessing and training cells ---

# Column of the feature matrix used to split the samples into groups.
PRI_JET_NUM_IDX = 22
# Values of that column for which a group is built (0, 1, 2, 3).
PRI_JET_NUM_VALUES = range(0, 4)
# One group per value.
NUMBER_GROUPS = len(PRI_JET_NUM_VALUES)
# Number of feature columns scanned during preprocessing.
NBR_FEATURES = 30
# Sentinel marking an undefined entry in the feature matrix.
UNDEFINED_VALUE = -999.0
# Flag defined here; it is not read in the cells shown in this notebook section.
print_results = True

In [4]:
# Separating the data into the four groups, one per value of the PRI_JET_NUM column.
# Each entry of jet_groups_indices is a boolean mask over the rows of tX.
jet_groups_indices = [tX[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]

# Boolean-mask indexing yields new arrays, so the per-group arrays can be
# edited below without touching tX / y. Each array is built exactly once.
Y_arr = [y[group_indices] for group_indices in jet_groups_indices]
TX_arr = [tX[group_indices] for group_indices in jet_groups_indices]

# Collecting, for each group, the indices of the features that are undefined
# for every sample of that group.
undefined_features = [
    [feature_idx for feature_idx in range(NBR_FEATURES)
     if np.all(TX_arr[group_idx][:, feature_idx] == UNDEFINED_VALUE)]
    for group_idx in range(NUMBER_GROUPS)
]

# Computing the std of the features for each group.
STDS = [np.std(TX_arr[group_idx], axis=0) for group_idx in range(NUMBER_GROUPS)]

# Collecting the indices of the features with no variance (i.e. constant features)
# within each group; fully undefined features are already listed above and are skipped.
cst_features = [
    [feature_idx for feature_idx, std in enumerate(STDS[group_idx])
     if std == 0. and feature_idx not in undefined_features[group_idx]]
    for group_idx in range(NUMBER_GROUPS)
]

# Dropping the features that are either undefined or constant within each group.
features_to_keep = [
    [feature_idx for feature_idx in range(NBR_FEATURES)
     if feature_idx not in undefined_features[group_idx] and feature_idx not in cst_features[group_idx]]
    for group_idx in range(NUMBER_GROUPS)
]
TX_arr = [TX_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

# Store the computed medians of the training set
# to be used in the data set when substituting UNDEFINED_VALUES.
train_medians = []
for group_idx in range(NUMBER_GROUPS):
    group_tx = TX_arr[group_idx]
    # Median of each remaining column, computed over the defined entries only.
    medians = np.apply_along_axis(lambda v: np.median(v[v != UNDEFINED_VALUE]), 0, group_tx)
    train_medians.append(medians)
    # Substituting the column median for the undefined values; each column
    # slice is a view, so the assignment edits TX_arr[group_idx] in place.
    for col_num, median in enumerate(medians):
        column = group_tx[:, col_num]
        column[column == UNDEFINED_VALUE] = median
    print(f"undefined values in dataset {group_idx} ? ", UNDEFINED_VALUE in TX_arr[group_idx])

# Standardizing the data (currently disabled)
#TX_arr = [standardize(TX_arr[idx]) for idx in range(NUMBER_GROUPS)]

undefined values in dataset 0 ?  False
undefined values in dataset 1 ?  False
undefined values in dataset 2 ?  False
undefined values in dataset 3 ?  False


In [5]:
# Print the remaining number of features for each JET NUMBER
for i in PRI_JET_NUM_VALUES:
    print(f"Number of features for group {i} : ", len(features_to_keep[i]))

Number of features for group 0 :  18
Number of features for group 1 :  22
Number of features for group 2 :  29
Number of features for group 3 :  29


In [6]:
# Show the original column indices kept for groups 0 to 3, one list per line.
for group_idx in range(4):
    print(features_to_keep[group_idx])

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21]
[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 29]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29]


## Train Data with different models

### LEAST SQUARES WITH GRADIENT DESCENT

In [None]:
%%time

# Step sizes to try and number of gradient-descent iterations per run.
gammas = [0.01, 0.001, 0.0001, 0.00001]
max_iters = 2000

# Iterate over some gammas to find the best possible values
for gamma in gammas:
    print(f"Gamma = {gamma}")

    # Iterate over all jet-number groups.
    # Group-local names are used so that the notebook-level `y` (full label
    # vector read by the grouping cell) is not overwritten by this loop.
    for group_idx in range(NUMBER_GROUPS):
        y_group = Y_arr[group_idx]
        tx_group = TX_arr[group_idx]
        # One zero-initialised weight per feature kept for this group.
        initial_w = np.zeros(len(features_to_keep[group_idx]))
        weights, loss = least_squares_GD(y_group, tx_group, initial_w, max_iters, gamma)
        print(f"    For JET_NB {group_idx}, the obtained loss is {loss:>15}")

### LEAST SQUARES WITH STOCHASTIC GRADIENT DESCENT

In [None]:
%%time

# Step sizes to try, number of SGD iterations per run and mini-batch size.
gammas = [0.1, 0.01, 0.001, 0.0001, 0.00001]
max_iters = 1500
batch_size = 1

# Iterate over some gammas to find the best possible values
for gamma in gammas:
    print(f"Gamma = {gamma}")

    # Iterate over all jet-number groups.
    # Group-local names are used so that the notebook-level `y` (full label
    # vector read by the grouping cell) is not overwritten by this loop.
    for group_idx in range(NUMBER_GROUPS):
        y_group = Y_arr[group_idx]
        tx_group = TX_arr[group_idx]
        # One zero-initialised weight per feature kept for this group.
        initial_w = np.zeros(len(features_to_keep[group_idx]))
        weights, loss = least_squares_SGD(y_group, tx_group, initial_w, batch_size, max_iters, gamma)
        print(f"    For JET_NB {group_idx}, the obtained loss is {loss:>15}")
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")

### LEAST SQUARES

In [None]:
# Fit the least-squares model on each jet-number group and report its loss.
# Group-local names are used so that the notebook-level `y` (full label vector
# read by the grouping cell) is not overwritten; least_squares is called with
# the labels and features only, so no initial weight vector is built.
for group_idx in range(NUMBER_GROUPS):
    y_group = Y_arr[group_idx]
    tx_group = TX_arr[group_idx]
    weights, loss = least_squares(y_group, tx_group)
    print(f"For JET_NB {group_idx}, the obtained loss is {loss}")

### RIDGE REGRESSION

In [None]:
%%time

# Cross-validation settings: random seed, candidate polynomial degrees,
# number of folds and candidate regularisation strengths.
seed = 15
degrees = [2, 3, 4]
k_fold = 4
lambdas = np.logspace(-4, 0, 30)

# Run the ridge cross-validation on each jet-number group.
# Group-local names are used so that the notebook-level `y` (full label vector
# read by the grouping cell) is not overwritten by this loop.
for group_idx in range(NUMBER_GROUPS):
    y_group = Y_arr[group_idx]
    tx_group = TX_arr[group_idx]
    degree, lambda_ = cross_validation_demo_ridge(y_group, tx_group, seed, degrees, k_fold, lambdas)
    print(f"For JET_NB {group_idx}, the obtained best degree is {degree} and lambda is {lambda_}")

### LOGISTIC REGRESSION

In [None]:
%%time

# Cross-validation settings: iteration budget, random seed, candidate
# polynomial degrees, number of folds and candidate step sizes.
max_iters = 5_000
seed = 15
degrees = [2, 3, 4, 6, 7]
k_fold = 4
gammas = [0.1, 0.01, 0.001]

for group_idx in range(NUMBER_GROUPS):
    # Work on a copy: recoding the labels directly on Y_arr[group_idx] would
    # permanently turn every -1 into 0 for all cells executed afterwards.
    labels = Y_arr[group_idx].copy()
    # Recode the -1 class as 0 (presumably the label convention expected by
    # the logistic helpers -- confirm against implementations).
    labels[labels == -1.0] = 0.0
    tx_group = TX_arr[group_idx]
    degree, gamma, rmse_min = cross_validation_demo_logistic(labels, tx_group, max_iters, seed, degrees, k_fold, gammas)
    print(f"For JET_NB {group_idx}, the obtained best degree is {degree} and gamma is {gamma} and loss is {rmse_min}")

### REGULARIZED LOGISTIC REGRESSION

In [None]:
%%time

# Cross-validation settings: random seed, candidate polynomial degrees,
# number of folds, iteration budget, candidate regularisation strengths
# and candidate step sizes.
seed = 15
degrees = [1, 2, 6]
k_fold = 4
max_iters = 2_000
lambdas = [0.01, 0.001]
gammas = np.logspace(-6, -2, 3)

# Result returned by the cross-validation helper for each group, in group order.
params = []

for idx in range(NUMBER_GROUPS):
    # np.array(...) copies, so the recoding below leaves Y_arr untouched.
    labels = np.array(Y_arr[idx])
    # Recode the -1 class as 0 (presumably the label convention expected by
    # the logistic helpers -- confirm against implementations).
    labels[labels == -1.0] = 0.0
    # Group-local name: the notebook-level `tX` (raw feature matrix read by
    # the preprocessing cell) must not be replaced by one group's matrix.
    features = np.array(TX_arr[idx])
    tuple_ = cross_validation_demo_reg_logistic(labels, features, max_iters, seed, degrees, k_fold, lambdas, gammas)
    params.append(tuple_)
    print("group ",idx, " tuple : ", tuple_)

    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[1511929.40313865]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[3654959.14855465]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[4322219.19095973]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[942249.14302433]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[735641.79405394]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[903182.79192065]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[897420.65745319]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[617667.6218758]]


In [1]:
# Statistics on undefined values in the raw training data.
# Prerequisite: the import cell and the constants cell must have been run in
# this kernel; load_csv_data and UNDEFINED_VALUE are not defined in this cell.

# The data is reloaded so that y, tX and ids hold the complete raw data set.
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# True wherever an entry of the feature matrix equals the undefined sentinel.
undefined_mask = tX == UNDEFINED_VALUE

nbr_samples = tX.shape[0]
# Rows containing at least one undefined entry.
nbr_samples_with_undefined_features = int(np.count_nonzero(undefined_mask.any(axis=1)))
# Undefined entries over the whole matrix.
nbr_undefined_features = int(np.count_nonzero(undefined_mask))
# Total number of entries in the matrix (samples x columns).
nbr_features = tX.shape[0] * tX.shape[1]

print(f"total number of samples : {nbr_samples}")
print(f"number of samples with undefined features : {nbr_samples_with_undefined_features}")
print(f"proportion of samples with undefined features : {nbr_samples_with_undefined_features / nbr_samples}")

print(f"\ntotal number of features : {nbr_features}")
print(f"total number undefined features : {nbr_undefined_features}")
print(f"proportion of undefined features : {nbr_undefined_features / nbr_features }")

NameError: name 'load_csv_data' is not defined