## Project 1 
#### Imports:

In [23]:
import numpy as np
from implementations import *
from proj1_helpers import *
from cross_validation import *
from pre_processing import *
from split_jet_num import generate_4_sets_looking_on_jetnum, columns_contains_just_missing_values, columns_contains_same_value

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load data set

In [2]:
# Names bound by the loader:
#   y   -> class labels
#   tx  -> features
#   ids -> event ids
train_path = "datas/train.csv"
y, tx, ids = load_csv_data(train_path, sub_sample=True)
print("TRAIN DATAS LOADED!")

# Same loader for the test file; the label slot is bound to `no_y` and is not
# used by any cell visible in this notebook.
test_path = "datas/test.csv"
no_y, tx_test, ids_test = load_csv_data(test_path, sub_sample=True)
print("TEST DATAS LOADED!")

TRAIN DATAS LOADED!
TEST DATAS LOADED!


## Preprocessing 
### Replace missing values with the mean or the median, or standardize the dataset

In [379]:
# Replace missing values with the mean of each feature.
# train set: the statistics are computed on the training features only
means = find_mean(tx)
tx_replaced_by_mean = replace_missing_values(tx, means)

# test set: reuse the TRAINING means, so that train and test go through exactly
# the same transformation (a model fitted on train-imputed data must see test
# data imputed with the same values).
means_test = means
tx_replaced_by_mean_test = replace_missing_values(tx_test, means_test)

In [380]:
# Replace missing values with the median of each feature.
# train set: the statistics are computed on the training features only
medians = find_median(tx)
tx_replaced_by_median = replace_missing_values(tx, medians)

# test set: reuse the TRAINING medians, so that train and test go through
# exactly the same transformation (a model fitted on train-imputed data must
# see test data imputed with the same values).
medians_test = medians
tx_replaced_by_median_test = replace_missing_values(tx_test, medians_test)

In [381]:
# Standardize all values without considering the missing ones, then replace the
# missing values with 0.
std_data_tx_with_mask = standardize(clean_array(tx))

# One zero per feature column; the count is derived from `tx` instead of being
# hard-coded, so the cell keeps working if the number of features changes.
zero_fill = np.full((tx.shape[1], 1), 0)
tx_std_data_replaced_by_0 = replace_missing_values(std_data_tx_with_mask, zero_fill)

## Splitting the dataset by jet num
If PRI_jet_num is zero or one, then some features are always -999 (missing).
We therefore divide the dataset into 4 subsets, one for each value of jet_num: 0, 1, 2 and 3.

In [3]:
# Split the training features into four subsets, one per jet_num value.
# The cells below index each returned item with [0] to obtain its feature
# matrix; what else each item holds is not visible from this notebook.
features_jet_0, features_jet_1, features_jet_2, features_jet_3 = generate_4_sets_looking_on_jetnum(tx)

For each subset, check which features contain only missing values, in order to decide which features to drop.

In [19]:
# For every jet subset, collect the indices of the columns reported by
# columns_contains_just_missing_values (one list per subset, in jet order).
(columns_to_remove_0,
 columns_to_remove_1,
 columns_to_remove_2,
 columns_to_remove_3) = [
    columns_contains_just_missing_values(jet_subset[0])
    for jet_subset in (features_jet_0, features_jet_1, features_jet_2, features_jet_3)
]

[[ 148.436   43.251  118.888 ... -999.    -999.       0.   ]
 [-999.      86.317   73.988 ... -999.    -999.       0.   ]
 [-999.      64.299   64.676 ... -999.    -999.       0.   ]
 ...
 [-999.      88.445   54.259 ... -999.    -999.       0.   ]
 [  81.153   28.748   63.335 ... -999.    -999.       0.   ]
 [-999.      78.589   76.993 ... -999.    -999.       0.   ]]
[4, 5, 6, 12, 23, 24, 25, 26, 27, 28, 22]
[4, 5, 6, 12, 26, 27, 28, 22]
[22]
[22]


We have noticed that we don't need to remove any features when jet num is 2 or 3.


In fact:


JET_NUM = 0 -> [4, 5, 6, 12, 23, 24, 25, 26, 27, 28]


JET_NUM = 1 -> [4, 5, 6, 12, 26, 27, 28]


JET_NUM = 2 -> []


JET_NUM = 3 -> []


We will drop features 4, 5, 6, 12, 26, 27 and 28. We will also drop feature 22, since it is the jet_num column itself.

In [None]:
# Drop feature 22 as well: it is the jet_num column, so inside each subset it
# contains the same value everywhere.
JET_NUM_COLUMN = 22

# Guarded append: the lists are mutated in place, so without the membership
# check every re-run of this cell would add another 22 to each list.
for columns in (columns_to_remove_0, columns_to_remove_1, columns_to_remove_2, columns_to_remove_3):
    if JET_NUM_COLUMN not in columns:
        columns.append(JET_NUM_COLUMN)

Check for constant features: if a feature contains the same value in every row, it carries no information and can be dropped.

In [25]:
# For every jet subset, collect the column indices reported by
# columns_contains_same_value (one list per subset, in jet order).
(columns_to_remove_0_b,
 columns_to_remove_1_b,
 columns_to_remove_2_b,
 columns_to_remove_3_b) = [
    columns_contains_same_value(jet_subset[0])
    for jet_subset in (features_jet_0, features_jet_1, features_jet_2, features_jet_3)
]

# Show the four lists, one per line, jet_num 0 first.
for constant_columns in (columns_to_remove_0_b, columns_to_remove_1_b,
                         columns_to_remove_2_b, columns_to_remove_3_b):
    print(constant_columns)

[4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28, 29]
[4, 5, 6, 12, 22, 26, 27, 28]
[22]
[22]


As predicted, feature 22 should be removed. We have also found that feature 29 should be removed from the jet_num = 0 subset.
Since a column full of -999 is also a column with a single repeated value, these lists already include the columns found before, so we can just remove these features from each set.

JET_NUM = 0 -> [4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28, 29]


JET_NUM = 1 -> [4, 5, 6, 12, 22, 26, 27, 28]


JET_NUM = 2 -> [22]


JET_NUM = 3 -> [22]

In [None]:
# Remove, from the feature matrix of each jet subset, the columns listed in the
# matching columns_to_remove_*_b list (axis=1 -> columns).
(features_dropped_0,
 features_dropped_1,
 features_dropped_2,
 features_dropped_3) = [
    np.delete(jet_subset[0], constant_columns, axis=1)
    for jet_subset, constant_columns in zip(
        (features_jet_0, features_jet_1, features_jet_2, features_jet_3),
        (columns_to_remove_0_b, columns_to_remove_1_b, columns_to_remove_2_b, columns_to_remove_3_b),
    )
]

In [None]:
# Columns dropped from the full (unsplit) feature matrix:
#   4, 5, 6, 12, 26, 27, 28 -> reported as removable for both jet_num 0 and 1
#   22                      -> the jet_num column itself
columns_to_remove = [4, 5, 6, 12] + [22] + [26, 27, 28]

# np.delete returns a new array; `tx` itself is left untouched.
tx_dropped_columns = np.delete(arr=tx, obj=columns_to_remove, axis=1)

## Cross validation 
K-fold cross-validation: the original sample is randomly partitioned into k equal-sized subsamples.
The training/validation procedure is repeated k times, once for each fold index.

In [384]:
# Settings shared by all the cross-validation experiments below.
seed, degree, k_fold = 19, 7, 5

# 30 log-spaced values between 1e-4 and 1 (just for ridge regression)
lambdas = np.logspace(start=-4, stop=0, num=30)

# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)

## Test ML Methods
### Least Squares

#### Results obtained using k = 5 and replacing missing value by mean value.

In [385]:
# Least squares on the mean-imputed features: run every fold and keep, per
# fold, the (train accuracy, test accuracy, loss) triple.
fold_metrics = []
for fold in range(k_fold):
    fold_loss, _fold_w, fold_acc_train, fold_acc_test = cross_validation(
        y, tx_replaced_by_mean, k_indices, fold, degree, least_squares
    )
    fold_metrics.append((fold_acc_train, fold_acc_test, fold_loss))

# Per-fold series, in fold order.
accuracy_train = [metrics[0] for metrics in fold_metrics]
accuracy_test = [metrics[1] for metrics in fold_metrics]
losses = [metrics[2] for metrics in fold_metrics]

# Per-fold report, just to study the behaviour.
for fold, metrics in enumerate(fold_metrics):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % ((fold,) + metrics))

# Summary statistics across the folds.
mean_accuracy_test, min_accuracy_test, max_accuracy_test = (
    np.mean(accuracy_test), np.min(accuracy_test), np.max(accuracy_test)
)
mean_accuracy_train, min_accuracy_train, max_accuracy_train = (
    np.mean(accuracy_train), np.min(accuracy_train), np.max(accuracy_train)
)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))

Iteration: 0) Accuracy train: 0.804520 - Accuracy test: 0.799740 - Loss: 0.575760

Iteration: 1) Accuracy train: 0.800310 - Accuracy test: 0.802020 - Loss: 0.585639

Iteration: 2) Accuracy train: 0.806710 - Accuracy test: 0.808080 - Loss: 0.572250

Iteration: 3) Accuracy train: 0.804575 - Accuracy test: 0.800720 - Loss: 0.576161

Iteration: 4) Accuracy train: 0.803565 - Accuracy test: 0.804200 - Loss: 0.577485

Accuracy test, mean: 0.802952, min value: 0.799740, max value: 0.808080 

Accuracy train, mean: 0.803936, min value: 0.800310, max value: 0.806710 



#### Results obtained using k = 5 and replacing missing value by median value.

In [386]:
# Least squares on the median-imputed features. Each fold yields
# (loss, weights, train accuracy, test accuracy); the four series are kept in
# fold order.
losses, weights, accuracy_train, accuracy_test = [], [], [], []
for fold in range(k_fold):
    run_loss, run_w, run_acc_train, run_acc_test = cross_validation(
        y, tx_replaced_by_median, k_indices, fold, degree, least_squares
    )
    losses.append(run_loss)
    weights.append(run_w)
    accuracy_train.append(run_acc_train)
    accuracy_test.append(run_acc_test)

# Per-fold report, just to study the behaviour.
for fold, (acc_train, acc_test, fold_loss) in enumerate(zip(accuracy_train, accuracy_test, losses)):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (fold, acc_train, acc_test, fold_loss))

# Summary statistics across the folds.
mean_accuracy_test, min_accuracy_test, max_accuracy_test = (
    np.mean(accuracy_test), np.min(accuracy_test), np.max(accuracy_test)
)
mean_accuracy_train, min_accuracy_train, max_accuracy_train = (
    np.mean(accuracy_train), np.min(accuracy_train), np.max(accuracy_train)
)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


Iteration: 0) Accuracy train: 0.805525 - Accuracy test: 0.801640 - Loss: 0.573839

Iteration: 1) Accuracy train: 0.804210 - Accuracy test: 0.807480 - Loss: 0.576331

Iteration: 2) Accuracy train: 0.808065 - Accuracy test: 0.809400 - Loss: 0.569957

Iteration: 3) Accuracy train: 0.805485 - Accuracy test: 0.801400 - Loss: 0.574242

Iteration: 4) Accuracy train: 0.804755 - Accuracy test: 0.805380 - Loss: 0.575564

Accuracy test, mean: 0.805060, min value: 0.801400, max value: 0.809400 

Accuracy train, mean: 0.805608, min value: 0.804210, max value: 0.808065 



#### Results obtained using k = 5 and replacing missing values by 0 after standardizing the data

In [None]:
# Least squares on the standardized, zero-filled features. One record per fold:
# (train accuracy, test accuracy, loss, weights).
fold_records = []
for fold in range(k_fold):
    fold_loss, fold_w, fold_acc_train, fold_acc_test = cross_validation(
        y, tx_std_data_replaced_by_0, k_indices, fold, degree, least_squares
    )
    fold_records.append((fold_acc_train, fold_acc_test, fold_loss, fold_w))

# Per-fold series, in fold order.
accuracy_train = [record[0] for record in fold_records]
accuracy_test = [record[1] for record in fold_records]
losses = [record[2] for record in fold_records]
weights = [record[3] for record in fold_records]

# Per-fold report, just for study.
for fold, record in enumerate(fold_records):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % ((fold,) + record[:3]))

# Summary statistics across the folds.
mean_accuracy_test, min_accuracy_test, max_accuracy_test = (
    np.mean(accuracy_test), np.min(accuracy_test), np.max(accuracy_test)
)
mean_accuracy_train, min_accuracy_train, max_accuracy_train = (
    np.mean(accuracy_train), np.min(accuracy_train), np.max(accuracy_train)
)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


### Ridge regression using normal equations

#### Results obtained using k = 5 and replacing missing value by mean value.

In [None]:
# Ridge regression on the mean-imputed features: sweep over `lambdas`.
# For each lambda the k-fold metrics are averaged, so entry i of the three
# lists below corresponds to lambdas[i].
accuracy_train = []
accuracy_test = []
losses = []

for lambda_ in lambdas:
    accuracy_train_temp = []
    accuracy_test_temp = []
    losses_temp = []
    for k in range(k_fold):
        loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_replaced_by_mean, k_indices, k, degree, ridge_regression, lambda_=lambda_)
        accuracy_train_temp.append(single_accuracy_train)
        accuracy_test_temp.append(single_accuracy_test)
        losses_temp.append(loss)
    accuracy_train.append(np.mean(accuracy_train_temp))
    accuracy_test.append(np.mean(accuracy_test_temp))
    losses.append(np.mean(losses_temp))

# Just for study: here "Iteration" i is the index of the lambda in `lambdas`.
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

# NOTE: these statistics are taken across the lambda values (each entry is
# already a mean over the folds), not across the folds.
mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))

# The point of the sweep is to choose lambda: report the value with the highest
# mean validation accuracy (first one wins on ties).
best_index_mean = int(np.argmax(accuracy_test))
best_lambda_mean = lambdas[best_index_mean]
print("Best lambda: %e (iteration %d) - Accuracy test: %f\n" % (best_lambda_mean, best_index_mean, accuracy_test[best_index_mean]))

#### Results obtained using k = 5 and replacing missing value by median value.

In [None]:
# Ridge regression on the median-imputed features: sweep over `lambdas`.
# For each lambda the k-fold metrics are averaged, so entry i of the three
# lists below corresponds to lambdas[i].
# (The accumulators are initialised once; the original cell did it twice.)
accuracy_train = []
accuracy_test = []
losses = []

for lambda_ in lambdas:
    accuracy_train_temp = []
    accuracy_test_temp = []
    losses_temp = []
    for k in range(k_fold):
        loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_replaced_by_median, k_indices, k, degree, ridge_regression, lambda_=lambda_)
        accuracy_train_temp.append(single_accuracy_train)
        accuracy_test_temp.append(single_accuracy_test)
        losses_temp.append(loss)
    accuracy_train.append(np.mean(accuracy_train_temp))
    accuracy_test.append(np.mean(accuracy_test_temp))
    losses.append(np.mean(losses_temp))

# Here "Iteration" i is the index of the lambda in `lambdas`.
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

# NOTE: these statistics are taken across the lambda values (each entry is
# already a mean over the folds), not across the folds.
mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))

# The point of the sweep is to choose lambda: report the value with the highest
# mean validation accuracy (first one wins on ties).
best_index_median = int(np.argmax(accuracy_test))
best_lambda_median = lambdas[best_index_median]
print("Best lambda: %e (iteration %d) - Accuracy test: %f\n" % (best_lambda_median, best_index_median, accuracy_test[best_index_median]))

In [346]:
# Least squares on the feature matrix with the selected columns removed.
# The original cell read `tx_dropped`, which no cell in this notebook defines;
# the column-dropped matrix built in the preprocessing section is
# `tx_dropped_columns`, so that is what is used here.
# NOTE(review): only columns are removed from `tx`; any missing-value markers
# in the remaining columns are presumably still present — confirm whether
# imputation should be applied before this experiment.
accuracy_train = []
accuracy_test = []
losses = []
weights = []

for k in range(k_fold):
    loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_dropped_columns, k_indices, k, degree, least_squares)
    accuracy_train.append(single_accuracy_train)
    accuracy_test.append(single_accuracy_test)
    losses.append(loss)
    weights.append(w)

# Per-fold report.
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

# Summary statistics across the folds.
mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


Iteration: 0) Accuracy train: 0.654965 - Accuracy test: 0.653340 - Loss: 2.902052

Iteration: 1) Accuracy train: 0.668780 - Accuracy test: 0.670180 - Loss: 125.230677

Iteration: 2) Accuracy train: 0.785245 - Accuracy test: 0.786400 - Loss: 0.622517

Iteration: 3) Accuracy train: 0.764410 - Accuracy test: 0.762780 - Loss: 0.677012

Iteration: 4) Accuracy train: 0.777095 - Accuracy test: 0.777480 - Loss: 0.644077

Accuracy test, mean: 0.730036, min value: 0.653340, max value: 0.786400 

Accuracy train, mean: 0.730099, min value: 0.654965, max value: 0.785245 



### Generate Prediction (example)

In [None]:
# Example submission: least squares on the median-imputed features.
#
# The weights are obtained here, from fold 0 of the median-imputed training
# data, instead of being read from the global `weights` list: that list is
# overwritten by every experiment cell, so `weights[0]` would belong to
# whichever cell ran last (possibly a model with a different number of
# features than the test matrix below).
_fold_loss, w_median, _fold_acc_train, _fold_acc_test = cross_validation(
    y, tx_replaced_by_median, k_indices, 0, degree, least_squares
)

# The original cell read `tx_median_test`, which no cell in this notebook
# defines; the median-imputed test matrix is `tx_replaced_by_median_test`.
# Expanded with the same `degree` that is passed to cross_validation above.
test_poly = build_poly(tx_replaced_by_median_test, degree)
y_test_predicted = predict_labels(w_median, test_poly)
create_csv_submission(ids_test, y_test_predicted, "submission-7")

In [None]:
# NOTE(review): `features_dropped` is not assigned in any cell visible in this
# notebook (only features_dropped_0..3 are) — presumably left over from a
# deleted cell; confirm, then fix or remove this cell.
features_dropped.shape

In [None]:
# Shape of the training labels returned by load_csv_data.
y.shape

In [None]:
# Shape of the feature matrix after dropping the selected columns.
# The original cell read `tx_dropped`, which no cell in this notebook defines;
# `tx_dropped_columns` is the name assigned in the preprocessing section.
tx_dropped_columns.shape

In [None]:
# NOTE(review): `y_dropped` is not assigned in any cell visible in this
# notebook — presumably left over from a deleted cell; confirm, then fix or
# remove this cell.
y_dropped.shape

In [None]:
# NOTE(review): `ids_dropped` is not assigned in any cell visible in this
# notebook — presumably left over from a deleted cell; confirm, then fix or
# remove this cell.
ids_dropped.shape

In [None]:
# Shape of each jet subset after its columns were removed (jet_num 0 first).
for dropped_subset in (features_dropped_0, features_dropped_1, features_dropped_2, features_dropped_3):
    print(dropped_subset.shape)

# Column-index lists built for each jet subset (jet_num 0 first).
for removed_columns in (columns_to_remove_0, columns_to_remove_1, columns_to_remove_2, columns_to_remove_3):
    print(removed_columns)