## Project 1 
#### Imports:

In [28]:
import numpy as np
from implementations import *
from proj1_helpers import *
from cross_validation import *
from pre_processing import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load data set

In [46]:
""" y: class labels
    tx: features
    ids: event ids """
y, tx, ids = load_csv_data("datas/train.csv", sub_sample=False)
print(y)
print(tx)
print(ids)
# tx_test, ids_test = load_csv_data("all_data/test.csv", sub_sample=False)

[ 1. -1. -1. ...  1. -1. -1.]
[[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]
[100000 100001 100002 ... 349997 349998 349999]


## Preprocessing 
### Replace missing values with mean and median  

In [30]:
mean_array = find_mean(tx)
median_array = find_median(tx)
tx_mean = replace_missing_values(tx, mean_array)
tx_median = replace_missing_values(tx, median_array)

std_data_tx_with_mask = standardize(clean_array(tx))

std_data = replace_missing_values(std_data_tx_with_mask, np.full((30, 1), 0))


no_y, tx_test, ids_test = load_csv_data("datas/test.csv", sub_sample=False)
mean_array_test = find_mean(tx_test)
median_array_test = find_median(tx_test)
tx_mean_test = replace_missing_values(tx_test, mean_array_test)
tx_median_test = replace_missing_values(tx_test, median_array_test)

## Cross validation 
K-fold cross-validation: original sample randomly partitioned into k equal sized subsamples.
Repeated k times.

In [31]:
seed = 19
degree = 7
k_fold = 5

lambdas = np.logspace(-4, 0, 30) #just for ridge regression

# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)

## Test ML Methods
### Least Squares

In [35]:
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEAN VALUE\n")

# define lists to store the accuracy of training data and test data
accuracy_train = []
accuracy_test = []
losses = []
y_test_predicted = []


for k in range(k_fold):
    loss, w, single_accuracy_train, single_accuracy_test, y_t_p = cross_validation(y, tx_mean, k_indices, k, degree, least_squares)
    accuracy_train.append(single_accuracy_train)
    accuracy_test.append(single_accuracy_test)
    losses.append(loss)
    y_test_predicted.append(y_t_p)

    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))

# %%%%%%%%%%%%%%%%%%%%%%%%%%
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEDIAN VALUE\n")

# define lists to store the accuracy of training data and test data
accuracy_train = []
accuracy_test = []
losses = []
weights = []

for k in range(k_fold):
    loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_median, k_indices, k, degree, least_squares)
    accuracy_train.append(single_accuracy_train)
    accuracy_test.append(single_accuracy_test)
    losses.append(loss)
    weights.append(w)

    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEAN VALUE



ValueError: not enough values to unpack (expected 5, got 4)

In [None]:
## SUBMIT PREDICTION
y_test_predicted = []

test_poly = build_poly(tx_median_test, degree)

y_test_predicted = predict_labels(weights[0], test_poly)
print(y_test_predicted.shape)
print(y_test_predicted)
print(ids_test.shape)

create_csv_submission(ids_test, y_test_predicted, "submission-7")

### Ridge regression using normal equations

In [None]:
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEAN VALUE\n")

# define lists to store the accuracy of training data and test data

accuracy_train = []
accuracy_test = []
losses = []
for lambda_ in lambdas:
    accuracy_train_temp = []
    accuracy_test_temp = []
    losses_temp = []
    for k in range(k_fold):
        loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_mean, k_indices, k, degree, ridge_regression, lambda_=lambda_)
        accuracy_train_temp.append(single_accuracy_train)
        accuracy_test_temp.append(single_accuracy_test)
        losses_temp.append(loss)
    accuracy_train.append(np.mean(accuracy_train_temp))
    accuracy_test.append(np.mean(accuracy_test_temp))
    losses.append(np.mean(losses_temp))

    
  
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))

# %%%%%%%%%%%%%%%%%%%%%%%%%%
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEDIAN VALUE\n")

# define lists to store the accuracy of training data and test data
accuracy_train = []
accuracy_test = []
losses = []


accuracy_train = []
accuracy_test = []
losses = []
for lambda_ in lambdas:
    accuracy_train_temp = []
    accuracy_test_temp = []
    losses_temp = []
    for k in range(k_fold):
        loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, tx_median, k_indices, k, degree, ridge_regression, lambda_=lambda_)
        accuracy_train_temp.append(single_accuracy_train)
        accuracy_test_temp.append(single_accuracy_test)
        losses_temp.append(loss)
    accuracy_train.append(np.mean(accuracy_train_temp))
    accuracy_test.append(np.mean(accuracy_test_temp))
    losses.append(np.mean(losses_temp))
    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


### Least Squares with normalization

In [36]:
# %%%%%%%%%%%%%%%%%%%%%%%%%%
print("RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEDIAN VALUE\n")

# define lists to store the accuracy of training data and test data
accuracy_train = []
accuracy_test = []
losses = []
weights = []


for k in range(k_fold):
    loss, w, single_accuracy_train, single_accuracy_test = cross_validation(y, std_data, k_indices, k, degree, least_squares)
    accuracy_train.append(single_accuracy_train)
    accuracy_test.append(single_accuracy_test)
    losses.append(loss)
    weights.append(w)

    
n = len(accuracy_train)
for i in range(n):
    print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, accuracy_train[i], accuracy_test[i], losses[i]))

mean_accuracy_test = np.mean(accuracy_test)
min_accuracy_test = np.min(accuracy_test)
max_accuracy_test = np.max(accuracy_test)

mean_accuracy_train = np.mean(accuracy_train)
min_accuracy_train = np.min(accuracy_train)
max_accuracy_train = np.max(accuracy_train)

print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEDIAN VALUE

Iteration: 0) Accuracy train: 0.804530 - Accuracy test: 0.799920 - Loss: 0.575759

Iteration: 1) Accuracy train: 0.803120 - Accuracy test: 0.806480 - Loss: 0.578220

Iteration: 2) Accuracy train: 0.806655 - Accuracy test: 0.807980 - Loss: 0.572248

Iteration: 3) Accuracy train: 0.804545 - Accuracy test: 0.800800 - Loss: 0.576161

Iteration: 4) Accuracy train: 0.803565 - Accuracy test: 0.804200 - Loss: 0.577485

Accuracy test, mean: 0.803876, min value: 0.799920, max value: 0.807980 

Accuracy train, mean: 0.804483, min value: 0.803120, max value: 0.806655 



## Division of the dataset looking on jet num
If PRI_jet_num is zero or one then some features are -999.
Divide dataset in 4 looking on jet_num 0, 1, 2 and 3.

In [99]:
column_jet_num = tx[:,22]

# create 4 sets of indexes looking on jet_num
rows_0_indexes = np.where(column_jet_num == 0)
rows_1_indexes = np.where(column_jet_num == 1)
rows_2_indexes = np.where(column_jet_num == 2)
rows_3_indexes = np.where(column_jet_num == 3)

# just for check
print(len(column_jet_num) == (len(rows_0_indexes[0]) + len(rows_1_indexes[0]) + len(rows_2_indexes[0]) + len(rows_3_indexes[0])))

# subsets looking on jet num
features_jet_0 = tx[rows_0_indexes, :]
features_jet_1 = tx[rows_1_indexes, :]
features_jet_2 = tx[rows_2_indexes, :]
features_jet_3 = tx[rows_3_indexes, :]

# just to check again
print((len(rows_0_indexes[0]) == features_jet_0.shape[1]) & (len(rows_1_indexes[0]) == features_jet_1.shape[1]) & (len(rows_2_indexes[0]) == features_jet_2.shape[1]) & (len(rows_3_indexes[0]) == features_jet_3.shape[1]))

True
True
99913


For each set look how many missing values there are!

In [140]:
indexes_missing_values_0 = np.where(features_jet_0[0] == -999)
indexes_missing_values_1 = np.where(features_jet_1[0] == -999)
indexes_missing_values_2 = np.where(features_jet_2[0] == -999)
indexes_missing_values_3 = np.where(features_jet_3[0] == -999)

print("There are %d missing values in the set for JET NUM = 0", len(indexes_missing_values_0))
# print("There are %d missing values in the set for JET NUM = 1", indexes_missing_values_1.shape[1])
# print("There are %d missing values in the set for JET NUM = 2", indexes_missing_values_2.shape[1])
# print("There are %d missing values in the set for JET NUM = 3", indexes_missing_values_3.shape[1])

(array([    0,     0,     0, ..., 99912, 99912, 99912]), array([ 4,  5,  6, ..., 26, 27, 28]))
(1, 99913, 30)
There are %d missing values in the set for JET NUM = 0 2


In [144]:
# iterate to find which columns to drop
for i in range (30):
    np.all(features_jet_0[0][:, i] == -999.)


False
False
False
False
True
True
True
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
False


False

In [141]:
# print(tx[tx==-999.])
# tx[tx==-999.].shape 

[-999. -999. -999. ... -999. -999. -999.]


(1580052,)