# MACHINE LEARNING - PROJECT 1

In [18]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from implementations import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## LOAD THE TRAINING DATA INTO FEATURE MATRIX, CLASS LABELS and EVENT IDS

In [19]:
%%time
# Load train data and supply path
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

CPU times: user 8.28 s, sys: 487 ms, total: 8.76 s
Wall time: 8.84 s


## FEATURES ENGINEERING & DATA PROCESSING

In [20]:
#constants definitions
PRI_JET_NUM_IDX = 22   
PRI_JET_NUM_VALUES = range(4)
NUMBER_GROUPS = len(PRI_JET_NUM_VALUES)
NBR_FEATURES = 30
UNDEFINED_VALUE = -999.
print_results = True

In [21]:
#seperating the data within the four groups
jet_groups_indices = [tX[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
TX_arr = [tX[group_indices] for group_indices in jet_groups_indices]
Y_arr, TX_arr = zip(*[(y[group_indices], tX[group_indices]) for group_indices in jet_groups_indices])
Y_arr, TX_arr = list(Y_arr), list(TX_arr)

#collecting the indices of the undefined features for each group
undefined_features = [[], [], [], []]
for group_idx in range(NUMBER_GROUPS):
    tx = TX_arr[group_idx]
    for feature_idx in range(NBR_FEATURES):
        feature_column = tx[:, feature_idx]
        if np.all(feature_column == UNDEFINED_VALUE):
            undefined_features[group_idx].append(feature_idx)

#computing the std of the features for each group
STDS = [np.std(TX_arr[i], axis = 0) for i in range(NUMBER_GROUPS)]

#collecting the indices of the features with no variance (i.e. constant features) within each groups
cst_features = [[], [], [], []]
for group_idx, elem in enumerate(STDS):
    for feature_idx, std in enumerate(elem):
        if std == 0. and feature_idx not in undefined_features[group_idx]:
            cst_features[group_idx].append(feature_idx)

#deleting the features either undefined or with no variance (i.e. constant features) within each groups
features_to_keep = ([[x for x in range(NBR_FEATURES) 
                      if x not in undefined_features[group_idx] and x not in cst_features[group_idx]] 
                      for group_idx in range(NUMBER_GROUPS)])
TX_arr = [TX_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

#standardizing the data
for tx in TX_arr:
    tx -= np.mean(tx, axis = 0)
    tx /= np.std(tx, axis = 0)

In [22]:
# Print the remaining number of features for each JET NUMBER
for i in PRI_JET_NUM_VALUES:
    print(f"Number of features for group {i} : ", len(features_to_keep[i]))

Number of features for group 0 :  18
Number of features for group 1 :  22
Number of features for group 2 :  29
Number of features for group 3 :  29


In [23]:
print(features_to_keep[0])
print(features_to_keep[1])
print(features_to_keep[2])
print(features_to_keep[3])

[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21]
[0, 1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 29]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29]


In [24]:
print(jet_groups_indices[0])
y_p = np.empty(250_000)
y_p[jet_groups_indices[0]] = -1
print(y_p)

[False False False ... False  True  True]
[ 0.  0.  0. ...  0. -1. -1.]


## Train Data with different models

### LEAST SQUARES WITH GRADIENT DESCENT

In [None]:
%%time

gammas = [0.01, 0.001, 0.0001, 0.00001]
max_iters = 2000

# Iterate over some gammas to find the best possible values
for gamma in gammas:
    print(f"Gamma = {gamma}")
    
    # Iterate over all jet numbers dataframes
    for i in range(len(features_to_keep)):
        initial_w = np.zeros(len(features_to_keep[i]))
        y = Y_arr[i]
        tx = TX_arr[i]
        weights, loss = least_squares_GD(y, tx, initial_w, max_iters, gamma)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")

### LEAST SQUARES  WITH STOCHASTIC GRADIENT DESCENT

In [None]:
%%time

gammas = [0.1, 0.01, 0.001, 0.0001, 0.00001]
max_iters = 1500
batch_size = 1

# Iterate over some gammas to find the best possible values
for gamma in gammas:
    print(f"Gamma = {gamma}")
    
    # Iterate over all jet numbers dataframes
    for i in range(len(features_to_keep)):
        initial_w = np.zeros(len(features_to_keep[i]))
        y = Y_arr[i]
        tx = TX_arr[i]
        weights, loss = least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma)
        print(f"    For JET_NB {i}, the obtained loss is {loss:>15}")

### LEAST SQUARES

In [None]:
for i in range(len(features_to_keep)):
    initial_w = np.zeros(len(features_to_keep[i]))
    y = Y_arr[i]
    tx = TX_arr[i]
    weights, loss = least_squares(y, tx)
    print(f"For JET_NB {i}, the obtained loss is {loss}")

### RIDGE REGRESSION

In [None]:
%%time

seed=15
degrees=[2,3,4]
k_fold=4
lambdas = np.logspace(-4, 0, 30)

for i in range(len(features_to_keep)):
    y = Y_arr[i]
    tx = TX_arr[i]
    degree, lambda_ = cross_validation_demo_ridge(y, tx, seed, degrees, k_fold, lambdas)
    print(f"For JET_NB {i}, the obtained best degree is {degree} and lambda is {lambda_}")

### LOGISTIC REGRESSION

In [None]:
%%time

max_iters=10000
gamma = 0.05

for i in range(len(features_to_keep)):
    y = Y_arr[i]
    tx = TX_arr[i]
    weights, loss = logistic_regression(y, tx, max_iters, gamma)
    print(f"For JET_NB {i}, the obtained loss is {loss}")

In [None]:
%%time

max_iters=1000
seed=15
degrees=[2, 3, 4, 6, 7]
k_fold= 4
#gammas = [0.1, 0.01, 0.001, 0.0001]
gammas = [0.1, 0.01]



for i in range(NUMBER_GROUPS):
    y = Y_arr[i]
    y[y == - 1.0] = 0.0
    tx = TX_arr[i]
    degree, gamma, rmse_min = cross_validation_demo_logistic(y, tx, max_iters, seed, degrees, k_fold, gammas)
    print(f"For JET_NB {i}, the obtained best degree is {degree} and gamma is {gamma} and loss is {rmse_min}")

In [None]:
%%time

max_iters = 1_000
seed = 15
degrees = [2, 3, 4, 6, 7]
k_fold = 4
#gammas = [0.1, 0.01, 0.001, 0.0001]
gammas = [0.1, 0.01, 0.001]



for i in range(NUMBER_GROUPS):
    y = Y_arr[i]
    y[y == - 1.0] = 0.0
    tx = TX_arr[i]
    degree, gamma, rmse_min = cross_validation_demo_logistic(y, tx, max_iters, seed, degrees, k_fold, gammas)
    print(f"For JET_NB {i}, the obtained best degree is {degree} and gamma is {gamma} and loss is {rmse_min}")

In [None]:
%%time

max_iters = 2_000
seed = 15
degrees = [1, 2, 3, 4, 5]
k_fold = 4
#gammas = [0.1, 0.01, 0.001, 0.0001]
gammas = [0.1, 0.01, 0.001]



for i in range(NUMBER_GROUPS):
    y = Y_arr[i]
    y[y == - 1.0] = 0.0
    tx = TX_arr[i]
    degree, gamma, rmse_min = cross_validation_demo_logistic(y, tx, max_iters, seed, degrees, k_fold, gammas)
    print(f"For JET_NB {i}, the obtained best degree is {degree} and gamma is {gamma} and loss is {rmse_min}")

### REGULARIZED LOGISTIC REGRESSION

In [None]:
%%time

params = []

for idx in range(4):
    y=np.array(Y_arr[idx])
    tX=np.array(TX_arr[idx])
    initial_w = np.zeros(len(features_to_keep[idx]))
    seed=15
    degrees=[2, 4, 7]
    k_fold=4
    max_iters=200
    lambdas = np.logspace(-4, 0, 10)
    gammas = [0.01,0.001]
    
    tuple_ = cross_validation_demo_reg_logistic(y, tX, max_iters, seed, degrees, k_fold, lambdas, gammas)
    params.append(tuple_)
    print("group ",idx, " tuple : ", tuple_)

## Submission

In [3]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [4]:
jet_groups_indices_test = [tX_test[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
TX_test_arr = list([tX_test[group_indices] for group_indices in jet_groups_indices_test])

#remove not used features
TX_test_arr = [TX_test_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

#standardizing the data
for tx in TX_test_arr:
    tx -= np.mean(tx, axis = 0)
    tx /= np.std(tx, axis = 0)

NameError: name 'PRI_JET_NUM_VALUES' is not defined

In [None]:
%%time

W_arr = []
y_pred = np.empty(len(tX_test))

for idx in range(4):
    #weight, loss = least_squares_GD(Y_arr[idx], TX_arr[idx], np.zeros(TX_arr[idx].shape[1]), 10000, 0.001)
    #weight, loss = least_squares(Y_arr[idx], TX_arr[idx])
    #weight, loss = ridge_regression(Y_arr[idx], TX_arr[idx], 0.01)
    y_pred[jet_groups_indices_test[idx]] = predict_labels(weight, TX_test_arr[idx])
print(y_pred)

In [None]:
# TODO: fill in desired name of output file for submission
OUTPUT_PATH = '../data/sample-submission.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
Q = np.array([1, 2, 3])
l = np.array([True, False, True, False, False, True])
A = np.zeros(6)
A[l] = Q
print(A)

## Generate predictions and save ouput in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

### PREPARE TEST DATA

In [None]:
jet_groups_indices = [tX_test[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
TX_test_arr = list([tX_test[group_indices] for group_indices in jet_groups_indices])


In [None]:
# TODO: fill in desired name of output file for submission
OUTPUT_PATH = '../data/sample-submission.csv'
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

## TEST LOG

In [25]:
%%time
# Load train data and supply path
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

CPU times: user 7.67 s, sys: 437 ms, total: 8.11 s
Wall time: 8.14 s


In [26]:
#constants definitions
PRI_JET_NUM_IDX = 22   
PRI_JET_NUM_VALUES = range(4)
NUMBER_GROUPS = len(PRI_JET_NUM_VALUES)
NBR_FEATURES = 30
UNDEFINED_VALUE = -999.

#seperating the data within the four groups
jet_groups_indices = [tX[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
TX_arr = [tX[group_indices] for group_indices in jet_groups_indices]
Y_arr, TX_arr = zip(*[(y[group_indices], tX[group_indices]) for group_indices in jet_groups_indices])
Y_arr, TX_arr = list(Y_arr), list(TX_arr)

#collecting the indices of the undefined features for each group
undefined_features = [[], [], [], []]
for group_idx in range(NUMBER_GROUPS):
    tx = TX_arr[group_idx]
    for feature_idx in range(NBR_FEATURES):
        feature_column = tx[:, feature_idx]
        if np.all(feature_column == UNDEFINED_VALUE):
            undefined_features[group_idx].append(feature_idx)

#computing the std of the features for each group
STDS = [np.std(TX_arr[i], axis = 0) for i in range(NUMBER_GROUPS)]

#collecting the indices of the features with no variance (i.e. constant features) within each groups
cst_features = [[], [], [], []]
for group_idx, elem in enumerate(STDS):
    for feature_idx, std in enumerate(elem):
        if std == 0. and feature_idx not in undefined_features[group_idx]:
            cst_features[group_idx].append(feature_idx)

#deleting the features either undefined or with no variance (i.e. constant features) within each groups
features_to_keep = ([[x for x in range(NBR_FEATURES) 
                      if x not in undefined_features[group_idx] and x not in cst_features[group_idx]] 
                      for group_idx in range(NUMBER_GROUPS)])
TX_arr = [TX_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

#standardizing the data
for tx in TX_arr:
    tx -= np.mean(tx, axis = 0)
    tx /= np.std(tx, axis = 0)

#loading the test data and creating the groups
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
jet_groups_indices_test = [tX_test[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
TX_test_arr = list([tX_test[group_indices] for group_indices in jet_groups_indices_test])

#remove unused features
TX_test_arr = [TX_test_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

#standardizing the data
for tx in TX_test_arr:
    tx -= np.mean(tx, axis = 0)
    tx /= np.std(tx, axis = 0)
    
#learning the weights and predicting for each group
LAMBDA_IDX = 1
DEGREE_IDX = 0
W_arr = []
y_pred = np.empty(len(tX_test))
MAX_ITER = 10_000
PARAM_arr = [[1, 0.01], [1, 0.001], [1, 0.01], [1, 0.01]]


In [30]:
for group_idx in range(NUMBER_GROUPS):
    polynom_degree = PARAM_arr[group_idx][DEGREE_IDX]
    lambda_ = PARAM_arr[group_idx][LAMBDA_IDX]
    tx = build_poly(TX_arr[group_idx], polynom_degree)# if polynom_degree > 1 else TX_arr[group_idx]
    tx_test = build_poly(TX_test_arr[group_idx], polynom_degree)
    initial_w = np.zeros((tx.shape[1], 1))
    #weight, loss = reg_logistic_regression(Y_arr[group_idx], tx, lambda_, initial_w, MAX_ITER, 0.01)
    weight, loss = logistic_regression(Y_arr[group_idx], tx, initial_w, MAX_ITER, lambda_)
    y_pred[jet_groups_indices_test[group_idx]] = predict_labels(weight, tx_test).flatten()


    Current iteration=0, loss=69254.4142512852


  return 1.0 / (1 + np.exp(-t))


    Current iteration=1000, loss=-2312200.7538944185
    Current iteration=2000, loss=-2313355.711338518
    Current iteration=3000, loss=-2314675.651435228
    Current iteration=4000, loss=-2314997.0795819215
    Current iteration=5000, loss=-2315385.1256276676
    Current iteration=6000, loss=-2315184.8183174855
    Current iteration=7000, loss=-2315710.9286825177


KeyboardInterrupt: 

In [39]:
group_idx = 0
polynom_degree = PARAM_arr[group_idx][DEGREE_IDX]
lambda_ = PARAM_arr[group_idx][LAMBDA_IDX]
tx = build_poly(TX_arr[group_idx], polynom_degree)# if polynom_degree > 1 else TX_arr[group_idx]
tx_test = build_poly(TX_test_arr[group_idx], polynom_degree)
initial_w = np.zeros((tx.shape[1], 1))
weight, loss = reg_logistic_regression(Y_arr[group_idx], tx, lambda_, initial_w, MAX_ITER, 0.0000000001)
y_pred[jet_groups_indices_test[group_idx]] = predict_labels(weight, tx_test).flatten()

    Current iteration=0, loss=[69254.41425129]
    Current iteration=1000, loss=[-36638.62359221]
    Current iteration=2000, loss=[-108698.36971654]
    Current iteration=3000, loss=[-170150.25606987]
    Current iteration=4000, loss=[-227251.09474213]
    Current iteration=5000, loss=[-282233.64874597]
    Current iteration=6000, loss=[-336038.97613555]
    Current iteration=7000, loss=[-389138.73879306]
    Current iteration=8000, loss=[-441741.18141261]
    Current iteration=9000, loss=[-494001.09131587]


KeyboardInterrupt: 

In [None]:
#creating csv file
OUTPUT_PATH = '../data/sample-submission_log_reg.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [11]:
a = np.array([1,2,3,4,5])
print(a)

b=a.reshape((5,1))
print(b)

c=np.array([2,2,2,2,2])
print(c)

[1 2 3 4 5]
[[1]
 [2]
 [3]
 [4]
 [5]]
[2 2 2 2 2]


In [16]:
c.dot(b)

array([30])