## Project 1 
Import the helper functions from proj1_helpers.py (the dataset itself is loaded further below).

In [1]:
from proj1_helpers import *

### ML Methods
Import ML methods from implementations.py

In [2]:
from implementations import *

#### Test ML Methods

In [3]:
import numpy as np
# y   -> class labels
# tx  -> features
# ids -> event ids
y, tx, ids = load_csv_data("datas/train.csv", sub_sample=False)
# Show the three loaded arrays, one after the other.
print(y, tx, ids, sep="\n")

[ 1. -1. -1. ...  1. -1. -1.]
[[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]
[100000 100001 100002 ... 349997 349998 349999]


##### Matrix Standardization
Preprocessing step to standardize the data, i.e. subtract the mean and divide by the standard deviation for each dimension. After this processing, each dimension has zero mean and unit variance.

In [4]:
def standardize(x):
    """Standardize each column of ``x`` to zero mean and unit variance.

    Args:
        x: NumPy array whose first axis indexes the samples; the mean and
            standard deviation are taken along axis 0.

    Returns:
        Array with the same shape as ``x`` where every column has had its
        mean subtracted and has been divided by its standard deviation.
        Columns with zero standard deviation (constant columns) come back
        as all zeros instead of NaN.
    """
    centered_data = x - np.mean(x, axis=0)
    std = np.std(centered_data, axis=0)
    # A constant column has zero spread: dividing by it would give 0/0 = NaN.
    # Dividing by 1 instead leaves the (already all-zero) centered column at 0.
    safe_std = np.where(std == 0, 1.0, std)
    return centered_data / safe_std

# Standardize the raw feature matrix column by column and display the result.
# NOTE(review): tx still contains the -999. placeholders at this point, so they
# contribute to every column's mean and standard deviation.
std_data_tx = standardize(tx)
print(std_data_tx)

[[ 0.46141372  0.06833197  0.40768027 ...  1.5668      1.55858439
   0.4125105 ]
 [ 0.51670419  0.55250482  0.54013641 ... -0.63936657 -0.63936694
  -0.27381996]
 [-2.33785898  3.19515553  1.09655998 ... -0.63936657 -0.63936694
  -0.29396985]
 ...
 [ 0.38016991  0.31931645 -0.13086367 ... -0.63936657 -0.63936694
  -0.31701723]
 [ 0.35431502 -0.84532397 -0.30297338 ... -0.63936657 -0.63936694
  -0.74543941]
 [-2.33785898  0.66533608 -0.25352276 ... -0.63936657 -0.63936694
  -0.74543941]]


In [5]:
# def test_our_methods():
#     y_pred = predict_labels()
#     create_csv_submission(ids, y_pred,  "Project1")


### Replace missing values with mean, mode, median

In [6]:
def clean_array(tx):
    """Return ``tx`` as a masked array in which every -999. entry is masked."""
    missing_marker = -999.
    return np.ma.masked_values(tx, missing_marker)

def find_mean(tx):
    """Column-wise mean of ``tx``, computed with the -999. entries masked out."""
    masked_features = clean_array(tx)
    return masked_features.mean(axis=0)

def find_median(tx):
    """Column-wise median of ``tx``, computed with the -999. entries masked out."""
    masked_features = clean_array(tx)
    return np.ma.median(masked_features, axis=0)
    
# Per-column mean and median of tx, both computed with the -999. entries masked out.
mean_array = find_mean(tx)
median_array = find_median(tx)

def replace_missing_values(tx, new_values, missing_value=-999.):
    """Return a copy of ``tx`` with missing entries replaced column by column.

    Args:
        tx: 2-D feature array (rows are samples, columns are features).
        new_values: one replacement value per column of ``tx``; entry ``j``
            is written into every missing cell of column ``j``.
        missing_value: sentinel that marks a missing cell. Defaults to -999.
            Cells are matched with ``==``, so NaN cannot be used as sentinel.

    Returns:
        A new array with the same shape and dtype as ``tx``; ``tx`` itself is
        not modified. Because the dtype is kept, replacement values are cast
        to the dtype of ``tx`` when they are written.
    """
    x = np.copy(tx)
    # Row and column positions of every missing cell.
    rows, cols = np.where(x == missing_value)
    # The column index selects which replacement value each cell receives.
    x[rows, cols] = np.take(new_values, cols)
    return x

# Two imputed copies of tx: the -999. entries are filled with the column mean
# and with the column median respectively; tx itself is left unchanged.
tx_mean = replace_missing_values(tx, mean_array)
tx_median = replace_missing_values(tx, median_array)

In [7]:
# np.where(tx == -999.)
# Display the masked view of tx; masked (-999.) entries are rendered as "--".
clean_array(tx)

masked_array(
  data=[[138.47, 51.655, 97.827, ..., 1.24, -2.475, 113.497],
        [160.937, 68.768, 103.235, ..., --, --, 46.226],
        [--, 162.172, 125.953, ..., --, --, 44.251],
        ...,
        [105.457, 60.526, 75.839, ..., --, --, 41.992],
        [94.951, 19.362, 68.812, ..., --, --, 0.0],
        [--, 72.756, 70.831, ..., --, --, 0.0]],
  mask=[[False, False, False, ..., False, False, False],
        [False, False, False, ...,  True,  True, False],
        [ True, False, False, ...,  True,  True, False],
        ...,
        [False, False, False, ...,  True,  True, False],
        [False, False, False, ...,  True,  True, False],
        [ True, False, False, ...,  True,  True, False]],
  fill_value=-999.0)

In [8]:
# test = np.array([ np.array([1, 4, -999., -2, -2]), np.array([2, -4, -999., -999., -3]) ])
# find_median(test)
# replace_missing_values(test, find_median(test))

# max(test, key = test.count)

## Cross validation 
K-fold cross-validation: the original sample is randomly partitioned into k equal-sized subsamples.
Training and evaluation are repeated k times, each time holding out a different subsample as the test set.

In [9]:
def build_poly(x, degree):
    """Expand ``x`` into polynomial basis features for powers 0 up to ``degree``.

    The result starts with a column of ones (power 0), followed by the
    element-wise powers x**1, x**2, ..., x**degree placed side by side.
    """
    n_samples = len(x)
    blocks = [np.ones((n_samples, 1))]
    blocks.extend(np.power(x, exponent) for exponent in range(1, degree + 1))
    # Stack all blocks along the column axis in a single pass.
    return np.column_stack(blocks)

In [10]:
def build_k_indices(y, k_fold, seed):
    """Split the row indices of ``y`` into ``k_fold`` random folds of equal size.

    NumPy's global random generator is seeded with ``seed`` before the row
    indices are shuffled. Each fold holds ``int(len(y) / k_fold)`` indices, so
    rows left over after the last full fold are not assigned to any fold.
    Returns an array with one row of indices per fold.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)
    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

In [None]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Run one fold of k-fold cross-validation for polynomial ridge regression.

    Fold ``k`` of ``k_indices`` is held out as the test set and all remaining
    folds are merged into the training set.

    Args:
        y: target values, indexed by row.
        x: input features, indexed by row in the same order as ``y``.
        k_indices: 2-D array with one row of sample indices per fold.
        k: index of the fold to hold out for testing.
        lambda_: regularization strength passed to ``ridge_regression``.
        degree: maximum polynomial degree passed to ``build_poly``.

    Returns:
        Tuple ``(loss_tr, loss_te, w)``: the training loss, the test loss and
        the weights fitted on the training folds.
    """
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    is_train_fold = np.arange(k_indices.shape[0]) != k
    tr_indice = k_indices[is_train_fold].reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
    # form data with polynomial degree
    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)
    # ridge regression
    w = ridge_regression(y_tr, tx_tr, lambda_)
    # calculate the loss for train and test data
    # NOTE(review): sqrt(2 * loss) is an RMSE only if compute_loss returns half
    # the mean squared error -- confirm against implementations.py.
    loss_tr = np.sqrt(2 * compute_loss(y_tr, tx_tr, w))
    loss_te = np.sqrt(2 * compute_loss(y_te, tx_te, w))
    return loss_tr, loss_te, w

In [None]:
# Grid search over the ridge penalty: for every candidate lambda, run k-fold
# cross-validation and record the mean train/test loss across the folds.
# NOTE(review): cross_validation is given the raw tx here (still containing the
# -999. placeholders), not tx_mean, tx_median or std_data_tx -- confirm intent.
seed = 19
degree = 7
k_fold = 5
# 30 candidate penalties, log-spaced from 1e-4 to 1
lambdas = np.logspace(-4, 0, 30)
# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)
# define lists to store the loss of training data and test data
rmse_tr = []
rmse_te = []
# cross validation
for lambda_ in lambdas:
    # per-fold losses for the current lambda
    rmse_tr_tmp = []
    rmse_te_tmp = []
    for k in range(k_fold):
        # the fitted weights (third return value) are not needed here
        loss_tr, loss_te,_ = cross_validation(y, tx, k_indices, k, lambda_, degree)
        rmse_tr_tmp.append(loss_tr)
        rmse_te_tmp.append(loss_te)
    # average over the folds: one entry per lambda, in the order of `lambdas`
    rmse_tr.append(np.mean(rmse_tr_tmp))
    rmse_te.append(np.mean(rmse_te_tmp))

(250000, 30)

In [141]:
# print(tx[tx==-999.])
# tx[tx==-999.].shape 

[-999. -999. -999. ... -999. -999. -999.]


(1580052,)