## Project 1 
#### Imports:

In [340]:
import numpy as np
from implementations import *
from proj1_helpers import *
from cross_validation import *
from pre_processing import *
from split_jet_num import generate_4_sets_looking_on_jetnum

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load data set

In [341]:
""" y: class labels
    tx: features
    ids: event ids """
y, tx, ids = load_csv_data("datas/train.csv", sub_sample=False)
print(y)
print(tx)
print(ids)


[ 1. -1. -1. ...  1. -1. -1.]
[[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]
[100000 100001 100002 ... 349997 349998 349999]


## Preprocessing 
### Replace missing values with mean and median  

In [334]:
mean_array = find_mean(tx)
median_array = find_median(tx)
tx_mean = replace_missing_values(tx, mean_array)
tx_median = replace_missing_values(tx, median_array)

std_data_tx_with_mask = standardize(clean_array(tx))

std_data = replace_missing_values(std_data_tx_with_mask, np.full((30, 1), 0))


no_y, tx_test, ids_test = load_csv_data("datas/test.csv", sub_sample=False)
mean_array_test = find_mean(tx_test)
median_array_test = find_median(tx_test)
tx_mean_test = replace_missing_values(tx_test, mean_array_test)
tx_median_test = replace_missing_values(tx_test, median_array_test)

## Cross validation 
K-fold cross-validation: original sample randomly partitioned into k equal sized subsamples.
Repeated k times.

In [335]:
seed = 19
degree = 7
k_fold = 5

lambdas = np.logspace(-4, 0, 30) #just for ridge regression

# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)

## Test ML Methods
### Least Squares

In [336]:
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEAN VALUE\n")

# define lists to store the accuracy of training data and test data
accuracy_train = []
accuracy_test = []
losses = []
y_test_predicted = []


for k in range(k_fold):
    loss, w, single_accuracy_train, single_accuracy_test, y_t_p = cross_validation(y, tx_mean, k_indices, k, degree, least_squares)
    accuracy_train.append(single_accuracy_train)
    accuracy_test.append(single_accuracy_test)
    losses.append(loss)
    y_test_predicted.append(y_t_p)

    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))

# %%%%%%%%%%%%%%%%%%%%%%%%%%
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEDIAN VALUE\n")

# define lists to store the accuracy of training data and test data
accuracy_train = []
accuracy_test = []
losses = []
weights = []

for k in range(k_fold):
    loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_median, k_indices, k, degree, least_squares)
    accuracy_train.append(single_accuracy_train)
    accuracy_test.append(single_accuracy_test)
    losses.append(loss)
    weights.append(w)

    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEAN VALUE



ValueError: not enough values to unpack (expected 5, got 4)

In [None]:
## SUBMIT PREDICTION
y_test_predicted = []

test_poly = build_poly(tx_median_test, degree)

y_test_predicted = predict_labels(weights[0], test_poly)
print(y_test_predicted.shape)
print(y_test_predicted)
print(ids_test.shape)

create_csv_submission(ids_test, y_test_predicted, "submission-7")

### Ridge regression using normal equations

In [None]:
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEAN VALUE\n")

# define lists to store the accuracy of training data and test data

accuracy_train = []
accuracy_test = []
losses = []
for lambda_ in lambdas:
    accuracy_train_temp = []
    accuracy_test_temp = []
    losses_temp = []
    for k in range(k_fold):
        loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_mean, k_indices, k, degree, ridge_regression, lambda_=lambda_)
        accuracy_train_temp.append(single_accuracy_train)
        accuracy_test_temp.append(single_accuracy_test)
        losses_temp.append(loss)
    accuracy_train.append(np.mean(accuracy_train_temp))
    accuracy_test.append(np.mean(accuracy_test_temp))
    losses.append(np.mean(losses_temp))

    
  
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))

# %%%%%%%%%%%%%%%%%%%%%%%%%%
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEDIAN VALUE\n")

# define lists to store the accuracy of training data and test data
accuracy_train = []
accuracy_test = []
losses = []


accuracy_train = []
accuracy_test = []
losses = []
for lambda_ in lambdas:
    accuracy_train_temp = []
    accuracy_test_temp = []
    losses_temp = []
    for k in range(k_fold):
        loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_median, k_indices, k, degree, ridge_regression, lambda_=lambda_)
        accuracy_train_temp.append(single_accuracy_train)
        accuracy_test_temp.append(single_accuracy_test)
        losses_temp.append(loss)
    accuracy_train.append(np.mean(accuracy_train_temp))
    accuracy_test.append(np.mean(accuracy_test_temp))
    losses.append(np.mean(losses_temp))
    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


### Least Squares with normalization

In [None]:
# %%%%%%%%%%%%%%%%%%%%%%%%%%
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEDIAN VALUE\n")

# define lists to store the accuracy of training data and test data
accuracy_train = []
accuracy_test = []
losses = []
weights = []


for k in range(k_fold):
    loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, std_data, k_indices, k, degree, least_squares)
    accuracy_train.append(single_accuracy_train)
    accuracy_test.append(single_accuracy_test)
    losses.append(loss)
    weights.append(w)

    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


## Division of the dataset looking on jet num
If PRI_jet_num is zero or one then some features are -999.
Divide dataset in 4 looking on jet_num 0, 1, 2 and 3.

In [343]:
features_jet_0, features_jet_1, features_jet_2, features_jet_3 = generate_4_sets_looking_on_jetnum(tx)

For each set look how many missing values there are.. in order to detect how many features we want to drop!

In [345]:
# iterate to find which columns to drop
columns_to_remove_0 = []
columns_to_remove_1 = []
columns_to_remove_2 = []
columns_to_remove_3 = []

features_dropped = features_jet_0[0]
for i in range (0, 30):
    if(np.all(features_jet_0[0][:, i] == -999.)):
        columns_to_remove_0.append(i)    
    if(np.all(features_jet_1[0][:, i] == -999.)):
        columns_to_remove_1.append(i) 
    if(np.all(features_jet_2[0][:, i] == -999.)):
         columns_to_remove_2.append(i)
    if(np.all(features_jet_3[0][:, i] == -999.)):
         columns_to_remove_3.append(i) 
        
features_dropped_0 = np.delete(features_jet_0[0], columns_to_remove_0, axis=1)
features_dropped_1 = np.delete(features_jet_1[0], columns_to_remove_1, axis=1)
features_dropped_2 = np.delete(features_jet_2[0], columns_to_remove_2, axis=1)
features_dropped_3 = np.delete(features_jet_3[0], columns_to_remove_3, axis=1)

We have noticed that we don't need to remove any features when jet num is 2 and 3. 


Infact: 


JET_NUM = 0 -> [4, 5, 6, 12, 23, 24, 25, 26, 27, 28]


JET_NUM = 1 -> [4, 5, 6, 12, 26, 27, 28]


JET_NUM = 2 -> []


JET_NUM = 3 -> []


We will drop features: 4, 5, 6, 12, 26, 27 and 28.

In [346]:
accuracy_train = []
accuracy_test = []
losses = []
weights = []

columns_to_remove = [4, 5, 6, 12, 26, 27, 28]

tx_dropped = np.delete(tx, columns_to_remove, axis=1)

for k in range(k_fold):
    loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_dropped, k_indices, k, degree, least_squares)
    accuracy_train.append(single_accuracy_train)
    accuracy_test.append(single_accuracy_test)
    losses.append(loss)
    weights.append(w)

    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


Iteration: 0) Accuracy train: 0.654965 - Accuracy test: 0.653340 - Loss: 2.902052

Iteration: 1) Accuracy train: 0.668780 - Accuracy test: 0.670180 - Loss: 125.230677

Iteration: 2) Accuracy train: 0.785245 - Accuracy test: 0.786400 - Loss: 0.622517

Iteration: 3) Accuracy train: 0.764410 - Accuracy test: 0.762780 - Loss: 0.677012

Iteration: 4) Accuracy train: 0.777095 - Accuracy test: 0.777480 - Loss: 0.644077

Accuracy test, mean: 0.730036, min value: 0.653340, max value: 0.786400 

Accuracy train, mean: 0.730099, min value: 0.654965, max value: 0.785245 



In [313]:
features_dropped.shape

(250000, 30)

In [305]:
y.shape

(250000,)

In [323]:
tx_dropped.shape

(250000, 23)

In [322]:
y_dropped.shape

(249993,)

In [326]:
ids_dropped.shape

(249993,)

In [320]:
print(features_dropped_0.shape)
print(features_dropped_1.shape)
print(features_dropped_2.shape)
print(features_dropped_3.shape)

print(columns_to_remove_0)
print(columns_to_remove_1)
print(columns_to_remove_2)
print(columns_to_remove_3)

(99913, 20)
(77544, 23)
(50379, 30)
(22164, 30)
[4, 5, 6, 12, 23, 24, 25, 26, 27, 28]
[4, 5, 6, 12, 26, 27, 28]
[]
[]
