## Project 1 
#### Imports:

In [59]:
import numpy as np
from implementations import *
from proj1_helpers import *
from cross_validation import *
from pre_processing import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load data set

In [60]:
""" y: class labels
    tx: features
    ids: event ids """
# Read the full training file (sub_sample=False) and unpack the three arrays
# returned by the project helper.
y, tx, ids = load_csv_data("datas/train.csv", sub_sample=False)

# Echo each loaded array, in order, so the cell output documents the data.
for loaded in (y, tx, ids):
    print(loaded)
# NOTE: the test set is not loaded here; the preprocessing cell reads it
# from "datas/test.csv".

[ 1. -1. -1. ...  1. -1. -1.]
[[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]
[100000 100001 100002 ... 349997 349998 349999]


## Preprocessing 
### Replace missing values with mean and median  

In [61]:
# --- Training set ------------------------------------------------------------
# Statistics returned by the project helpers for the training features; they
# are the replacement values for missing entries.
# NOTE(review): presumably one value per feature column, computed while
# ignoring the -999. placeholder -- confirm in pre_processing.
mean_array = find_mean(tx)
median_array = find_median(tx)
tx_mean = replace_missing_values(tx, mean_array)
tx_median = replace_missing_values(tx, median_array)

# --- Test set ----------------------------------------------------------------
# The first returned array is not used anywhere in this cell, hence `no_y`.
no_y, tx_test, ids_test = load_csv_data("datas/test.csv", sub_sample=False)

# Test-set statistics are kept bound under their original names (useful to
# compare against the training statistics), but they are deliberately NOT
# used for imputation below.
mean_array_test = find_mean(tx_test)
median_array_test = find_median(tx_test)

# Impute the test features with the TRAINING statistics: the weights are
# learned on features filled with `mean_array` / `median_array`, so the test
# features must go through exactly the same transformation. Filling them with
# statistics from the test set would feed the model differently-preprocessed
# inputs at prediction time.
tx_mean_test = replace_missing_values(tx_test, mean_array)
tx_median_test = replace_missing_values(tx_test, median_array)

## Cross validation 
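K-fold cross-validation: the original sample is randomly partitioned into k equal-sized subsamples.
Training and evaluation are repeated k times, each time holding out a different subsample for validation and training on the remaining k-1.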

In [62]:
# Shared experiment settings:
#   seed   - handed to build_k_indices below
#   degree - handed to cross_validation and build_poly in the cells below
#   k_fold - number of folds
seed, degree, k_fold = 19, 7, 5

# 30 log-spaced candidate values from 1e-4 to 1; only the ridge regression
# cell iterates over them.
lambdas = np.logspace(start=-4, stop=0, num=30)

# Split the data into k folds; every experiment below reuses these indices.
k_indices = build_k_indices(y, k_fold, seed)

## Test ML Methods
### Least Squares

In [63]:
# Least squares evaluated with k-fold cross-validation, once per imputation
# strategy. Both strategies run through the same code path instead of two
# copy-pasted halves, and the header reports the configured `k_fold` rather
# than a hard-coded 5.
#
# The result lists are reset at the start of each strategy, so after the cell
# finishes `accuracy_train`, `accuracy_test`, `losses`, `weights` and
# `y_test_predicted` hold the results of the last strategy (MEDIAN); the
# submission cell reads `weights` from here.
for imputation, features in (("MEAN", tx_mean), ("MEDIAN", tx_median)):
    print("RESULTS OBTAINED USING k=%d AND REPLACING MISSING VALUES WITH %s VALUE\n" % (k_fold, imputation))

    # One entry per fold.
    accuracy_train = []
    accuracy_test = []
    losses = []
    weights = []
    y_test_predicted = []

    for k in range(k_fold):
        loss, w, single_accuracy_train, single_accuracy_test, y_t_p = cross_validation(
            y, features, k_indices, k, degree, least_squares)
        accuracy_train.append(single_accuracy_train)
        accuracy_test.append(single_accuracy_test)
        losses.append(loss)
        weights.append(w)
        y_test_predicted.append(y_t_p)

    # Per-fold report.
    for i, (fold_accuracy_train, fold_accuracy_test, fold_loss) in enumerate(zip(accuracy_train, accuracy_test, losses)):
        print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, fold_accuracy_train, fold_accuracy_test, fold_loss))

    # Summary across folds.
    mean_accuracy_test = np.mean(accuracy_test)
    min_accuracy_test = np.min(accuracy_test)
    max_accuracy_test = np.max(accuracy_test)

    mean_accuracy_train = np.mean(accuracy_train)
    min_accuracy_train = np.min(accuracy_train)
    max_accuracy_train = np.max(accuracy_train)

    print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
    print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEAN VALUE

Iteration: 0) Accuracy train: 0.804520 - Accuracy test: 0.799740 - Loss: 0.575760

Iteration: 1) Accuracy train: 0.800310 - Accuracy test: 0.802020 - Loss: 0.585639

Iteration: 2) Accuracy train: 0.806710 - Accuracy test: 0.808080 - Loss: 0.572250

Iteration: 3) Accuracy train: 0.804575 - Accuracy test: 0.800720 - Loss: 0.576161

Iteration: 4) Accuracy train: 0.803565 - Accuracy test: 0.804200 - Loss: 0.577485

Accuracy test, mean: 0.802952, min value: 0.799740, max value: 0.808080 

Accuracy train, mean: 0.803936, min value: 0.800310, max value: 0.806710 

RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEDIAN VALUE

Iteration: 0) Accuracy train: 0.805525 - Accuracy test: 0.801640 - Loss: 0.573839

Iteration: 1) Accuracy train: 0.804210 - Accuracy test: 0.807480 - Loss: 0.576331

Iteration: 2) Accuracy train: 0.808065 - Accuracy test: 0.809400 - Loss: 0.569957

Iteration: 3) Accuracy train: 0.805485 

In [70]:
## SUBMIT PREDICTION
test_poly = build_poly(tx_median_test, degree)

y_test_predicted = predict_labels(weights[0], test_poly)
print(y_test_predicted.shape)
print(y_test_predicted)
print(ids_test.shape)

create_csv_submission(ids_test, y_test_predicted, "submission-7")

(568238,)
[-1. -1. -1. ...  1. -1. -1.]
(568238,)


### Ridge regression using normal equations

In [6]:
# Ridge regression (normal equations): for every candidate lambda, average the
# k-fold cross-validation results, once per imputation strategy. Both
# strategies share one code path instead of two copy-pasted halves, and the
# header reports the configured `k_fold` rather than a hard-coded 5.
#
# The result lists are reset at the start of each strategy, so after the cell
# finishes `accuracy_train`, `accuracy_test` and `losses` hold the results of
# the last strategy (MEDIAN).
for imputation, features in (("MEAN", tx_mean), ("MEDIAN", tx_median)):
    print("RESULTS OBTAINED USING k=%d AND REPLACING MISSING VALUES WITH %s VALUE\n" % (k_fold, imputation))

    # One entry per lambda, each the mean over the k folds.
    accuracy_train = []
    accuracy_test = []
    losses = []

    for lambda_ in lambdas:
        accuracy_train_temp = []
        accuracy_test_temp = []
        losses_temp = []
        for k in range(k_fold):
            # The least squares cell unpacks five values from
            # cross_validation, whereas this cell used to unpack four and
            # would raise "too many values to unpack" against that version.
            # `*_` takes the four values needed here and discards any extra
            # trailing ones, so the cell works with either return shape.
            loss, w, single_accuracy_train, single_accuracy_test, *_ = cross_validation(
                y, features, k_indices, k, degree, ridge_regression, lambda_=lambda_)
            accuracy_train_temp.append(single_accuracy_train)
            accuracy_test_temp.append(single_accuracy_test)
            losses_temp.append(loss)
        accuracy_train.append(np.mean(accuracy_train_temp))
        accuracy_test.append(np.mean(accuracy_test_temp))
        losses.append(np.mean(losses_temp))

    # Per-lambda report: the printed "Iteration" number is the index of the
    # lambda in `lambdas`, not a fold number.
    for i, (lambda_accuracy_train, lambda_accuracy_test, lambda_loss) in enumerate(zip(accuracy_train, accuracy_test, losses)):
        print("Iteration: %d) Accuracy train: %f - Accuracy test: %f - Loss: %f\n" % (i, lambda_accuracy_train, lambda_accuracy_test, lambda_loss))

    # Summary across all lambdas.
    mean_accuracy_test = np.mean(accuracy_test)
    min_accuracy_test = np.min(accuracy_test)
    max_accuracy_test = np.max(accuracy_test)

    mean_accuracy_train = np.mean(accuracy_train)
    min_accuracy_train = np.min(accuracy_train)
    max_accuracy_train = np.max(accuracy_train)

    print("Accuracy test, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_test, min_accuracy_test, max_accuracy_test))
    print("Accuracy train, mean: %f, min value: %f, max value: %f \n" %(mean_accuracy_train, min_accuracy_train, max_accuracy_train))


RESULTS OBTAINED USING k=5 AND REPLACING MISSING VALUES WITH MEAN VALUE

Iteration: 0) Accuracy train: 0.804027 - Accuracy test: 0.803536 - Loss: 0.576613

Iteration: 1) Accuracy train: 0.803951 - Accuracy test: 0.803472 - Loss: 0.576737

Iteration: 2) Accuracy train: 0.803887 - Accuracy test: 0.803388 - Loss: 0.576868

Iteration: 3) Accuracy train: 0.803800 - Accuracy test: 0.803276 - Loss: 0.577000

Iteration: 4) Accuracy train: 0.803756 - Accuracy test: 0.803220 - Loss: 0.577129

Iteration: 5) Accuracy train: 0.803686 - Accuracy test: 0.803168 - Loss: 0.577250

Iteration: 6) Accuracy train: 0.803650 - Accuracy test: 0.803116 - Loss: 0.577364

Iteration: 7) Accuracy train: 0.803587 - Accuracy test: 0.803092 - Loss: 0.577471

Iteration: 8) Accuracy train: 0.803514 - Accuracy test: 0.803136 - Loss: 0.577576

Iteration: 9) Accuracy train: 0.803502 - Accuracy test: 0.803104 - Loss: 0.577681

Iteration: 10) Accuracy train: 0.803409 - Accuracy test: 0.803016 - Loss: 0.577790

Iteration: 11

(250000, 30)

In [141]:
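# Disabled debug check: selects the entries of tx equal to the -999. placeholder
# and reports how many there are. Uncomment to re-run.
# print(tx[tx==-999.])
# tx[tx==-999.].shape 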

[-999. -999. -999. ... -999. -999. -999.]


(1580052,)