In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import math
import random
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv' # train data path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
print(tX.shape)

(250000, 30)


### Changing the labels to {0,1}

In [4]:
# y == 0 non detected Boson, y == 1 detected Boson
y_ = np.array([0 if l == -1 else 1 for l in y])

### Dividing the features by the number of jets

In [5]:
# dividing the rows of tX by the number of jets, dropping the column Pri_Jet_Num and adding an extra column of np.ones
zero_indices = []
one_indices = []
two_three_indices = []
zero_indices = np.where(tX[:,22]==0)[0]
one_indices = np.where(tX[:,22]==1)[0]
two_three_indices = np.where(np.logical_or(tX[:,22]==2, tX[:,22]==3))[0]
tX_0 = tX[zero_indices, :]
tX_0 = np.delete(tX_0, 22, axis=1)
tX_1 = tX[one_indices, :]
tX_1 = np.delete(tX_1, 22, axis=1)
tX_2_3 = tX[two_three_indices, :]

### Dividing also the output by the type of particle

In [6]:
y_0 = y_[zero_indices]
y_1 = y_[one_indices]
y_2_3 = y_[two_three_indices]

### Adding a column of zeros and ones to detect whether the mass has been measured or not

In [7]:
# take the indices where the mass is not calculated, add the column which has 0 in those indices
# and 1 everywhere else for all matrices 0,1,2_3
zero_indices_0 = np.where(tX_0[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_0 else 1 for i in range(tX_0.shape[0])])
tX_0 = np.insert(tX_0, 0, column_to_add, axis=1)
zero_indices_1 = np.where(tX_1[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_1 else 1 for i in range(tX_1.shape[0])])
tX_1 = np.insert(tX_1, 0, column_to_add, axis=1)
zero_indices_2_3 = np.where(tX_2_3[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_2_3 else 1 for i in range(tX_2_3.shape[0])])
tX_2_3 = np.insert(tX_2_3, 0, column_to_add, axis=1)

### Throwing away the outliers from the training data

In [8]:
for i in range(1, tX_2_3.shape[1]):
    index_column_valid =np.where(tX_2_3[:,i] != -999.)[0]
    column_25_quantile, column_75_quantile = np.quantile(tX_2_3[index_column_valid,i], 
                                                         np.array([0.25, 0.75]))
    interquantile = column_75_quantile-column_25_quantile
    column_15_quantile, column_85_quantile = np.quantile(tX_2_3[index_column_valid,i], 
                                                         np.array([0.15, 0.85]))
    indices_outliers = np.where((column_15_quantile - 1.5 * interquantile >= tX_2_3[index_column_valid,i])
                                             | (tX_2_3[index_column_valid,i] >= 
                                                column_85_quantile + 1.5 * interquantile))[0]
    #indices_outliers = np.argwhere((tX_tilda_2_3[index_column_valid,i] >= column_85_quantile + 1.5 * interquantile) | 
                                  #(column_15_quantile - 1.5 * interquantile >= tX_tilda_2_3[index_column_valid,i]))
    median = np.median(tX_2_3[index_column_valid, i], axis = 0)
    #print(np.sort(tX_tilda_2_3[index_column_valid[indices_outliers],i]).T)
    #print(np.where(tX_tilda_2_3[indices_outliers,i])==-999.)
    #print(median)
    tX_2_3[index_column_valid[indices_outliers],i] =  median

In [9]:
col_to_delete_0 = []
for i in range(1, tX_0.shape[1]):
    index_column_valid =np.where(tX_0[:,i] != -999.)[0]
    if len(index_column_valid)==0:
        #we drop the column (we will have to do the same for the test set as well)
        col_to_delete_0.append(i)
    else :
        column_25_quantile, column_75_quantile = np.quantile(tX_0[index_column_valid,i], 
                                                         np.array([0.25, 0.75]))
        interquantile = column_75_quantile-column_25_quantile
        column_15_quantile, column_85_quantile = np.quantile(tX_0[index_column_valid,i], 
                                                         np.array([0.15, 0.85]))
        indices_outliers = np.where((column_15_quantile - 1.5 * interquantile >= tX_0[index_column_valid,i])
                                             | (tX_0[index_column_valid,i] >= 
                                                column_85_quantile + 1.5 * interquantile))[0]
        #indices_outliers = np.argwhere((tX_tilda_2_3[index_column_valid,i] >= column_85_quantile + 1.5 * interquantile) | 
                                  #(column_15_quantile - 1.5 * interquantile >= tX_tilda_2_3[index_column_valid,i]))
        median = np.median(tX_0[index_column_valid, i], axis = 0)
        #print(np.sort(tX_tilda_2_3[index_column_valid[indices_outliers],i]).T)
        #print(np.where(tX_tilda_2_3[indices_outliers,i])==-999.)
        #print(median)
        tX_0[index_column_valid[indices_outliers],i] =  median
col_to_delete_0.append(tX_0.shape[1]-1)
tX_0 = np.delete(tX_0, col_to_delete_0, axis=1)
print(tX_0.shape)
print(col_to_delete_0)

(99913, 19)
[5, 6, 7, 13, 23, 24, 25, 26, 27, 28, 29]


In [10]:
col_to_delete_1 = []
for i in range(1, tX_1.shape[1]):
    index_column_valid =np.where(tX_1[:,i] != -999.)[0]
    if len(index_column_valid)==0:
        #we drop the column (we will have to do the same for the test set as well)
        col_to_delete_1.append(i)
    else :
        column_25_quantile, column_75_quantile = np.quantile(tX_1[index_column_valid,i], 
                                                         np.array([0.25, 0.75]))
        interquantile = column_75_quantile-column_25_quantile
        column_15_quantile, column_85_quantile = np.quantile(tX_1[index_column_valid,i], 
                                                         np.array([0.15, 0.85]))
        indices_outliers = np.where((column_15_quantile - 1.5 * interquantile >= tX_1[index_column_valid,i])
                                             | (tX_1[index_column_valid,i] >= 
                                                column_85_quantile + 1.5 * interquantile))[0]
        #indices_outliers = np.argwhere((tX_tilda_2_3[index_column_valid,i] >= column_85_quantile + 1.5 * interquantile) | 
                                  #(column_15_quantile - 1.5 * interquantile >= tX_tilda_2_3[index_column_valid,i]))
        median = np.median(tX_1[index_column_valid, i], axis = 0)
        #print(np.sort(tX_tilda_2_3[index_column_valid[indices_outliers],i]).T)
        #print(np.where(tX_tilda_2_3[indices_outliers,i])==-999.)
        #print(median)
        tX_1[index_column_valid[indices_outliers],i] =  median
tX_1 = np.delete(tX_1, col_to_delete_1, axis=1)
print(col_to_delete_1)

[5, 6, 7, 13, 26, 27, 28]


### Now we substitute the -999 values with the median

In [11]:
for i in range(1, tX_2_3.shape[1]):
    index_column_non_valid =np.where(tX_2_3[:,i] == -999.)[0]
    index_column_valid =np.where(tX_2_3[:,i] != -999.)[0]
    median = np.median(tX_2_3[index_column_valid, i], axis = 0)
    tX_2_3[index_column_non_valid,i] =  median

In [12]:
for i in range(1, tX_1.shape[1]):
    index_column_non_valid =np.where(tX_1[:,i] == -999.)[0]
    index_column_valid =np.where(tX_1[:,i] != -999.)[0]
    median = np.median(tX_1[index_column_valid, i], axis = 0)
    tX_1[index_column_non_valid,i] =  median

In [13]:
for i in range(1, tX_0.shape[1]):
    index_column_non_valid =np.where(tX_0[:,i] == -999.)[0]
    index_column_valid =np.where(tX_0[:,i] != -999.)[0]
    median = np.median(tX_0[index_column_valid, i], axis = 0)
    tX_0[index_column_non_valid,i] =  median

### Now we standardize the data

In [14]:
tX_2_3[:,1:], mean_2_3,std_2_3 = standardize(tX_2_3[:,1:]) #we standardize everything a part from the column added manually

In [15]:
print(tX_2_3)
print(np.count_nonzero(tX_2_3 == -999.))

[[ 1.          0.97351249  0.46588281 ...  0.61614788 -1.36131161
  -0.70374641]
 [ 1.         -0.82851663 -0.77157038 ...  0.11608109  1.71034105
   0.2995537 ]
 [ 1.          1.3538447  -0.27431587 ...  0.07030726 -1.52202162
   0.12704911]
 ...
 [ 1.          0.04006392  0.02513449 ...  0.25930888  0.22982758
   0.42357227]
 [ 1.          0.66304099 -1.0843679  ...  0.29031696 -1.21821366
  -0.20699627]
 [ 1.          0.04006392  0.31977857 ... -0.02271698 -0.62490751
   0.05569682]]
0


In [16]:
print(tX_0)

[[ 1.00000e+00  1.43905e+02  8.14170e+01 ...  3.10820e+01  6.00000e-02
   8.60620e+01]
 [ 1.00000e+00  1.75864e+02  1.69150e+01 ...  2.72300e+00 -8.71000e-01
   5.31310e+01]
 [ 1.00000e+00  1.05594e+02  5.05590e+01 ...  3.77910e+01  2.40000e-02
   1.29804e+02]
 ...
 [ 1.00000e+00  1.11452e+02  5.81790e+01 ...  4.67370e+01 -8.67000e-01
   8.04080e+01]
 [ 1.00000e+00  9.49510e+01  1.93620e+01 ...  1.21500e+01  8.11000e-01
   1.12718e+02]
 [ 1.00000e+00  1.11452e+02  7.27560e+01 ...  4.07290e+01 -1.59600e+00
   9.94050e+01]]


In [17]:
tX_0[:,1:],mean_0,std_0 = standardize(tX_0[:,1:]) 

In [18]:
print(tX_0.shape)

(99913, 19)


In [19]:
tX_1[:,1:],mean_1,std_1 = standardize(tX_1[:,1:])

In [20]:
print(tX_1)

[[ 1.00000000e+00  1.55539992e+00  7.27047143e-01 ...  3.98445313e-01
   6.45414781e-01 -4.14297220e-01]
 [ 1.00000000e+00  1.45598722e-03  3.58462301e+00 ...  1.12748232e+00
  -1.10752634e+00 -4.89864560e-01]
 [ 1.00000000e+00  1.36261180e+00 -1.05809645e+00 ... -3.92076740e-01
  -9.40265168e-01 -1.01072441e+00]
 ...
 [ 1.00000000e+00  1.45598722e-03  1.01732036e+00 ... -4.67286130e-01
  -3.80160315e-01  8.39087556e-01]
 [ 1.00000000e+00  6.75509966e-01  9.95415259e-01 ... -6.76994064e-01
   1.39533906e+00  5.32418071e-01]
 [ 1.00000000e+00 -2.21030015e-01  4.74893699e-01 ...  9.88591985e-01
  -8.30516498e-02 -5.76298292e-01]]


### We insert the column for the bias term

In [21]:
tX_tilda_0 = np.insert(tX_0, 0, np.ones(tX_0.shape[0]), axis=1)
tX_tilda_1 = np.insert(tX_1, 0, np.ones(tX_1.shape[0]), axis=1)
tX_tilda_2_3 = np.insert(tX_2_3, 0, np.ones(tX_2_3.shape[0]), axis=1)

In [22]:
# colors = ['red', 'blue']
# x_pos=[]
# x_neg=[]

# for j in range(len(y)):
#  if(y[j]==1):
#       x_pos.insert(0,tX[j])
#    else:
#        x_neg.insert(0,tX[j])
# xpos = np.array(x_pos)
# xneg = np.array(x_neg)
# for i in range(tX.shape[1]):
#  plt.hist(xpos[:,i], alpha = 0.5, color = 'r', bins = 100)
#  plt.hist(xneg[:,i], alpha = 0.5, color = 'b', bins = 100)
#  plt.show()

## Do your thing crazy machine learning thing here :) ...

In [23]:
def compute_loss(y, tx, w):
    N = y.shape[0]
    e = y - tx @ w 
    loss = 1/(2*N) * np.dot(e,e)
    return loss

In [24]:
def compute_gradient(y, tx, w):
    N = y.shape[0]
    e = y - tx @ w
    gradient = -(1/N) * (tx.T) @ (e)
    return gradient

In [25]:
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        loss = compute_loss(y,tx,w)
        gradient = compute_gradient(y,tx,w)
        w = w - gamma * gradient
        ws.append(w)
        losses.append(loss)
        if n_iter %100==0:
            print('gradient descent loss', loss)
        # print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
        # bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    return losses, ws

In [26]:
def compute_stoch_gradient(y, tx, w):
    N = y.shape[0]
    random_number = random.randint(0,N)
    #random_number =1
    xn = tx[random_number,:]
    random_gradient = - np.dot(xn, y[random_number] - np.dot(xn,w))
    return random_gradient

In [27]:
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        loss = compute_loss(y,tx,w)
        stoch_gradient = compute_stoch_gradient(y,tx,w)
        w = w - gamma * stoch_gradient
        ws.append(w)
        losses.append(loss)
        if n_iter %100==0:
            print('gradient descent loss', loss)
        # print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
        #    bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    return losses, ws

In [28]:
from proj1_helpers import *

def least_squares(y, tx):
    """calculate the least squares solution."""
    forcing_term = np.transpose(tx) @ y
    coefficient_matrix = np.transpose(tx) @ tx
    w = np.linalg.solve(coefficient_matrix, forcing_term)
    return w

def test_your_least_squares(y, tx):
    """compare the solution of the normal equations with the weights returned by gradient descent algorithm."""
    w_least_squares = least_squares(y, tx)
    initial_w = np.zeros(tx.shape[1])
    max_iters = 50
    gamma = 0.7
    losses_gradient_descent, w_gradient_descent = gradient_descent(y, tx, initial_w, max_iters, gamma)
    w = w_gradient_descent[-1]
    err = np.linalg.norm(w_least_squares-w)
    return err

In [29]:
def cross_validation_grad_desc(y, x, k_indices, k, degree, gamma):
    """return the loss of ridge regression."""
    N = y.shape[0]
    k_fold = k_indices.shape[0]
    list_ = []
    interval = int(N/k_fold)
    for i in range(k_fold):
        if i != k:
            list_.append(i)
    x_training = np.zeros((int((k_fold-1)/k_fold*N), x.shape[1]))
    y_training = np.zeros(int((k_fold-1)/k_fold*N))
    for j in range(len(list_)):
        x_training[interval*(j):interval*(j+1), :] = x[np.array([k_indices[list_[j]]]), :]
    x_testing = x[k_indices[k], :]
    for j in range(len(list_)):
        y_training[interval*(j):interval*(j+1)] = y[np.array([k_indices[list_[j]]])]
    y_testing = y[k_indices[k]]
    x_training_augmented = build_poly(x_training, degree)
    x_testing_augmented = build_poly(x_testing, degree)
    losses, ws = least_squares_GD(y_training, x_training_augmented, np.zeros(x_training_augmented.shape[1]) , 2000, gamma)
    w_opt_training = ws[-1]
    #loss_tr = compute_loss(y_training, x_training_augmented, w_opt_training)
    #loss_te = compute_loss(y_testing, x_testing_augmented, w_opt_training)
    predictions_test = x_testing_augmented@w_opt_training
    print(predictions_test)
    predictions_test = np.array([0 if el <0.5 else 1 for el in predictions_test])
    print(predictions_test)
    print(y_testing)
    acc_test = compute_accuracy(y_testing, predictions_test)
    return acc_test

In [30]:
def compute_accuracy(y_test, pred):
    N = y_test.shape[0]
    accuracy = (y_test == pred).sum() / N
    return accuracy

In [31]:
def ridge_regression(y, tx, lambda_):
    """implement ridge regression."""
    N = tx.shape
    lambda_prime = 2 * N[0] * lambda_
    coefficient_matrix = np.transpose(tx) @ tx + lambda_prime * np.eye(N[1])
    forcing_term = np.transpose(tx) @ y
    w = np.linalg.solve(coefficient_matrix, forcing_term)
    return w

def debug_ridge(y, tx):
    """debugging the ridge regression by setting lambda=0."""
    w_least_squares = least_squares(y, tx)
    w_0 = ridge_regression(y, tx, 0)
    err = np.linalg.norm(w_least_squares-w_0)
    return err

In [32]:
def sigmoid(t):
    """apply the sigmoid function on t."""
    if t >= 0:
        z = exp(-x)
        return 1 / (1 + z)
    else:
        z = exp(x)
        return z / (1 + z)

In [33]:
def calculate_loss(y, tx, w):
    """compute the loss: negative log likelihood."""
    term1 = sigmoid(tx @ w)
    term1[y == 0] = 1
    term2 = 1 - sigmoid(tx @ w)
    term2[y == 1] = 1
    summands = np.multiply(y, np.log(term1)) + np.multiply(1 - y, np.log(term2))
    # e = - y[i] * (tx[:,i] @ w) + np.log(1 + np.exp(1 + tx @ w))
    # return e.sum()
    # loss = - np.sum(y*np.log(sigmoid(tx@w))+((1-y) *np.log(1-sigmoid(tx@w))))
    # return e.sum()
    return - np.sum(summands) / len(y)

In [34]:
def calculate_gradient(y, tx, w):
    """compute the gradient of loss."""
    return np.transpose(tx) @ (sigmoid(tx @ w) - y)

In [35]:
def learning_by_gradient_descent(y, tx, w_initial, gamma, max_iters, lambda_):
    """
    Do one step of gradient descent using logistic regression.
    Return the loss and the updated w.
    """
    losses = []
    w1 = w_initial
    for iter in range(max_iters):
        grad = calculate_gradient(y, tx, w1)
        w = w1 - gamma * grad
        loss = calculate_loss(y, tx, w) + 2*lambda_*np.linalg.norm(w) ** 2
        losses.append(loss)
        w1 = w
    return losses, w

In [36]:
def calculate_hessian(y, tx, w):
    """return the Hessian of the loss function."""
    diag = sigmoid(tx @ w) * (1 - sigmoid(tx @ w))
    D = diag * np.eye(tx.shape[0])
    return np.transpose(tx) @ D @ tx

In [37]:
def logistic_regression(y, tx, w):
    """return the loss, gradient, and Hessian."""
    grad = calculate_gradient(y, tx, w)
    hess = calculate_hessian(y, tx, w)
    loss = calculate_loss(y, tx, w)
    return loss, grad, hess

In [38]:
def learning_by_newton_method(y, tx, w, gamma):
    """
    Do one step on Newton's method.
    return the loss and updated w.
    """
    loss, grad, hess = logistic_regression(y, tx, w)
    sol = np.linalg.solve(hess, grad)
    w = w - gamma * sol
    return loss, w

In [39]:
def penalized_logistic_regression(y, tx, w, lambda_):
    """return the loss, gradient"""
    loss = calculate_loss(y, tx, w) + lambda_*np.linalg.norm(w) ** 2
    grad = calculate_gradient(y, tx, w) + 2*lambda_*w
    hess = calculate_hessian(y, tx, w) + 2*lambda_*np.eye(w.shape[0])
    return loss, grad, hess

In [40]:
def learning_by_penalized_gradient(y, tx, w_initial, gamma, max_iters, lambda_):
    """
    Do one step of gradient descent, using the penalized logistic regression.
    Return the loss and updated w.
    """
    threshold = 1e-8
    losses = []
    w1 = w_initial
    for iter in range(max_iters):
        grad = calculate_gradient(y, tx, w1) + 2*lambda_*w1
        #sol = np.linalg.solve(hess, grad)
        w = w1 - gamma * grad
        loss = calculate_loss(y, tx, w) + 2*lambda_*np.linalg.norm(w) ** 2
        losses.append(loss)
        w1 = w
        if iter % 25 == 0:
            print("Current iteration={i}, loss={l}".format(i=iter, l=loss))
        # converge criterion
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    return losses, w

In [41]:
def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=1 up to j=degree."""
    powers = np.arange(1, degree + 1)
    #phi = np.column_stack([np.power(x[:,0], exponent) for exponent in powers])
    phi = x[:,0]
    for i in range(1, x.shape[1]):
        phi_i = np.column_stack([np.power(x[:,i], exponent) for exponent in powers])
        phi = np.column_stack([phi, phi_i])
    return phi

In [42]:
def build_k_indices(y, k_fold, seed):
    N = y.shape[0]
    np.random.seed(seed)
    interval = int(np.floor(N / k_fold))
    indices = np.random.permutation(N)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)

In [43]:
def cross_validation_ridge(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    N = y.shape[0]
    k_fold = k_indices.shape[0]
    list_ = []
    interval = int(N/k_fold)
    for i in range(k_fold):
        if i != k:
            list_.append(i)
    x_training = np.zeros((int((k_fold-1)/k_fold*N), x.shape[1]))
    y_training = np.zeros(int((k_fold-1)/k_fold*N))
    for j in range(len(list_)):
        x_training[interval*(j):interval*(j+1), :] = x[np.array([k_indices[list_[j]]]), :]
    x_testing = x[k_indices[k], :]
    for j in range(len(list_)):
        y_training[interval*(j):interval*(j+1)] = y[np.array([k_indices[list_[j]]])]
    y_testing = y[k_indices[k]]
    x_training_augmented = build_poly(x_training, degree)
    x_testing_augmented = build_poly(x_testing, degree)
    w_opt_training = ridge_regression(y_training, x_training_augmented, lambda_)
    loss_tr = compute_loss(y_training, x_training_augmented, w_opt_training)
    loss_te = compute_loss(y_testing, x_testing_augmented, w_opt_training)
    return loss_tr, loss_te

In [44]:
def random_interval(low, high, size):
    sample = np.random.uniform(low, high, size)
    return sample

We will tune hyperparameters for simple models

### Hyperparameters tuning for GD

In [46]:
degrees = np.arange(1, 5)
k_fold = 4
seed = 1
testing_acc = np.zeros(len(degrees))
k_indices = build_k_indices(y_0, k_fold, seed)
for index in range(len(degrees)):
    current_sum_test = 0
    for k in range(k_fold):
        current_test_acc = cross_validation_grad_desc(y_0, tX_tilda_0, k_indices, k, degrees[index], gamma = 5*10e-4)
        print(current_test_acc)
        current_sum_test += current_test_acc
    testing_acc[index] = current_sum_test / k_fold
best_result = np.where(testing_acc == np.amax(testing_acc))
print(testing_acc)
degree_opt = degrees[best_result[0]]
print(degree_opt)

gradient descent loss 0.12732537966744067
gradient descent loss 0.07514296948276802
gradient descent loss 0.06936189627881702
gradient descent loss 0.06841761462572284
gradient descent loss 0.06810393485501035
gradient descent loss 0.06791644525713052
gradient descent loss 0.06777792666186155
gradient descent loss 0.0676692519628826
gradient descent loss 0.06758201240116901
gradient descent loss 0.06751108696409762
gradient descent loss 0.06745293018640557
gradient descent loss 0.06740493987089534
gradient descent loss 0.06736513926657837
gradient descent loss 0.06733199194963059
gradient descent loss 0.06730428464704838
gradient descent loss 0.06728104840752037
gradient descent loss 0.06726150295256433
gradient descent loss 0.0672450159057109
gradient descent loss 0.06723107209065948
gradient descent loss 0.06721924995054254
[ 0.61847606  0.06020022  0.006644   ...  0.5805158   0.28062325
 -0.01899064]
[1 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]
0.8145968452237969
gradient descent loss 0.1268

  gradient = -(1/N) * (tx.T) @ (e)
  w = w - gamma * gradient


gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
[nan nan nan ... nan nan nan]
[1 1 1 ... 1 1 1]
[0 0 0 ... 1 0 0]
0.25658579550004
gradient descent loss 0.12689166466490512
gradient descent loss 2.288521501703905e+39
gradient descent loss 4.368213387358128e+81
gradient descent loss 8.337823430229431e+123
gradient descent loss 1.5914813080076442e+166
gradient descent loss 3.037738535640932e+208
gradient descent loss 5.798280736622684e+250
gradient descent loss 1.1067463215232017e+293
gradient descent loss inf
gradient descent loss inf
gradient descent loss inf
gradient descent loss inf
gradient descent loss inf
gradient descent loss inf
gradient descent loss inf
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
[nan nan nan ... nan nan nan]
[1 1 1 ... 1 1 1]
[1 0 0 ... 1 0 0]
0.2591880855152534
gradient descent loss 0.12839298582752823
gradient descent 

  e = y - tx @ w


gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
[nan nan nan ... nan nan nan]
[1 1 1 ... 1 1 1]
[0 0 0 ... 1 0 0]
0.25658579550004
gradient descent loss 0.12689166466490512
gradient descent loss inf
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent loss nan
gradient descent 

degree=2

In [None]:
degrees = np.arange(1, 5)
k_fold = 4
seed = 1
testing_acc = np.zeros(len(degrees))
k_indices = build_k_indices(y_1, k_fold, seed)
for index in range(len(degrees)):
    current_sum_test = 0
    for k in range(k_fold):
        current_test_acc = cross_validation_grad_desc(y_1, tX_tilda_1, k_indices, k, degrees[index], gamma = 5*10e-4)
        print(current_test_acc)
        current_sum_test += current_test_acc
    testing_acc[index] = current_sum_test / k_fold
best_result = np.where(testing_acc == np.amax(testing_acc))
print(testing_acc)
degree_opt = degrees[best_result[0]]
print(degree_opt)

In [None]:
# trained on colab 
#[0.73121067 0.77632054 0.7937687  0.35734551]
# [3]

In [48]:
degrees = np.arange(1, 5)
k_fold = 4
seed = 1
testing_acc = np.zeros(len(degrees))
k_indices = build_k_indices(y_2_3, k_fold, seed)
for index in range(len(degrees)):
    current_sum_test = 0
    for k in range(k_fold):
        current_test_acc = cross_validation_grad_desc(y_2_3, tX_tilda_2_3, k_indices, k, degrees[index], gamma = 5*10e-4)
        print(current_test_acc)
        current_sum_test += current_test_acc
    testing_acc[index] = current_sum_test / k_fold
best_result = np.where(testing_acc == np.amax(testing_acc))
print(testing_acc)
degree_opt = degrees[best_result[0]]
print(degree_opt)

gradient descent loss 0.2236201224107192
gradient descent loss 0.10262907520410727
gradient descent loss 0.08726719153685297
gradient descent loss 0.0846866825602424
gradient descent loss 0.08398919737630413
gradient descent loss 0.0836628176426034
gradient descent loss 0.0834558852571109
gradient descent loss 0.08330955837237403
gradient descent loss 0.083201796179067
gradient descent loss 0.08312084167429568
gradient descent loss 0.08305926949627276
gradient descent loss 0.08301202536538124
gradient descent loss 0.08297553119102864
gradient descent loss 0.08294719051001828
gradient descent loss 0.08292508594823815
gradient descent loss 0.08290778282908313
gradient descent loss 0.08289419654678115
gradient descent loss 0.08288350037316669
gradient descent loss 0.08287505991832408
gradient descent loss 0.08286838569533451
[ 0.00424628  0.09774473  0.34984029 ... -0.05067441  0.28867761
  0.22763769]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
0.7666390956713537
gradient descent loss 0.22333523

KeyboardInterrupt: 

In [None]:
#[0.76582575 0.8072236  0.81856907 0.44751861]
#[3]

In [None]:
#degree = 5, lr = 0.0001

### Hyperparameters tuning for Ridge Regression

In [None]:
degrees = np.arange(2, 7)
lambdas = np.logspace(-5,0,15)
k_fold = 5
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees)))
testing_loss = np.zeros((len(lambdas), len(degrees)))
k_indices = build_k_indices(y_0, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        train_loss = 0
        test_loss = 0
        for k in range(k_fold):
            loss_tr, loss_te = cross_validation_ridge(y_0, tX_tilda_0, k_indices, k,
                                                lambdas[index1], degrees[index2])
            train_loss += loss_tr
            test_loss += loss_te
        training_loss[index1, index2] = train_loss / k_fold
        testing_loss[index1, index2] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
print(testing_loss)
lambda_opt, degree_opt = lambdas[best_result[0]],degrees[best_result[1]]
print(lambda_opt, degree_opt)

In [None]:
degrees = np.arange(2, 7)
lambdas = np.logspace(-5,0,15)
k_fold = 5
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees)))
testing_loss = np.zeros((len(lambdas), len(degrees)))
k_indices = build_k_indices(y_1, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        train_loss = 0
        test_loss = 0
        for k in range(k_fold):
            loss_tr, loss_te = cross_validation_ridge(y_1, tX_tilda_1, k_indices, k, 
                                                lambdas[index1], degrees[index2])
            train_loss += loss_tr
            test_loss += loss_te
        training_loss[index1, index2] = train_loss / k_fold
        testing_loss[index1, index2] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
print(testing_loss)
lambda_opt, degree_opt = lambdas[best_result[0]], degrees[best_result[1]]
print(lambda_opt, degree_opt)

In [None]:
degrees = np.arange(2, 7)
lambdas = np.logspace(-5,0,15)
k_fold = 5
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees)))
testing_loss = np.zeros((len(lambdas), len(degrees)))
k_indices = build_k_indices(y_2_3, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        train_loss = 0
        test_loss = 0
        for k in range(k_fold):
            loss_tr, loss_te = cross_validation_ridge(y_2_3, tX_tilda_2_3, k_indices, k,
                                            lambdas[index1], degrees[index2])
            train_loss += loss_tr
            test_loss += loss_te
        training_loss[index1, index2] = train_loss / k_fold
        testing_loss[index1, index2] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
lambda_opt, degree_opt = lambdas[best_result[0]], degrees[best_result[1]]
print(testing_loss)
print(lambda_opt, degree_opt) 

We train the model for ridge regression with the best hyperparameters

In [None]:
tX_tilda_0_augmented = build_poly(tX_tilda_0, degree = 6)
losses_0, ws_0 = least_squares_GD(y_0, tX_tilda_0_augmented, np.zeros(tX_tilda_0_augmented.shape[1]) , 2000, 5*10e-4)
w_opt_training = ws_0[-1]

In [None]:
tX_tilda_1_augmented = build_poly(tX_tilda_1, degree=6)
w_ridge_1 = ridge_regression(y_1, tX_tilda_1_augmented, lambda_= 5.17947468e-05)
#print(w_ridge_1)

In [None]:
tX_tilda_2_3_augmented = build_poly(tX_tilda_2_3, degree=6)
w_ridge_2_3 = ridge_regression(y_2_3, tX_tilda_2_3_augmented, lambda_= 0.00026827)
#print(w_ridge_2_3)

We train the model for gradient descend

In [49]:
tX_tilda_0_augmented = build_poly(tX_tilda_0, degree = 2)
losses_0, ws_0 = least_squares_GD(y_0, tX_tilda_0_augmented, np.zeros(tX_tilda_0_augmented.shape[1]) , 2000, 5*10e-4)
w_opt_training_0 = ws_0[-1]

gradient descent loss 0.12757098675847986
gradient descent loss 0.06693729315965935
gradient descent loss 0.06423988053222848
gradient descent loss 0.06349600121068293
gradient descent loss 0.06317784862147327
gradient descent loss 0.06300341150092016
gradient descent loss 0.06288906199676206
gradient descent loss 0.0628047769813995
gradient descent loss 0.0627380400167114
gradient descent loss 0.06268287144006163
gradient descent loss 0.06263604502702481
gradient descent loss 0.06259562738149475
gradient descent loss 0.06256035290731772
gradient descent loss 0.06252933119056518
gradient descent loss 0.06250189909053598
gradient descent loss 0.062477540811954696
gradient descent loss 0.06245584199224412
gradient descent loss 0.06243646175207266
gradient descent loss 0.06241911471243574
gradient descent loss 0.06240355881388663


In [51]:
tX_tilda_1_augmented = build_poly(tX_tilda_1, degree = 3)
losses_1, ws_1 = least_squares_GD(y_1, tX_tilda_1_augmented, np.zeros(tX_tilda_1_augmented.shape[1]) , 2000, 5*10e-4)
w_opt_training_1= ws_1[-1]

gradient descent loss 0.17867275353347778
gradient descent loss 0.08789943780458429
gradient descent loss 0.08252041452103928
gradient descent loss 0.0803449184060501
gradient descent loss 0.07916736832316855
gradient descent loss 0.07841798061295603
gradient descent loss 0.07788462300948416
gradient descent loss 0.07747528776071969
gradient descent loss 0.07714512111338145
gradient descent loss 0.07686991708105946
gradient descent loss 0.07663541615435647
gradient descent loss 0.07643255855078472
gradient descent loss 0.0762551995663821
gradient descent loss 0.07609893304282081
gradient descent loss 0.07596045029452171
gradient descent loss 0.0758371729669143
gradient descent loss 0.07572703272459488
gradient descent loss 0.07562833291373816
gradient descent loss 0.07553965777832575
gradient descent loss 0.07545981032904109


In [52]:
tX_tilda_2_3_augmented = build_poly(tX_tilda_2_3, degree=3)
losses_2_3, ws_2_3 = least_squares_GD(y_2_3, tX_tilda_2_3_augmented, np.zeros(tX_tilda_2_3_augmented.shape[1]) , 2000, 5*10e-4)
w_opt_training_2_3 = ws_2_3[-1]

gradient descent loss 0.22376383662103855
gradient descent loss 0.08451346904069876
gradient descent loss 0.07782752986711347
gradient descent loss 0.07478524180658218
gradient descent loss 0.07306438656955777
gradient descent loss 0.07197319267478261
gradient descent loss 0.0712203720913397
gradient descent loss 0.07066568518289455
gradient descent loss 0.07023572564349713
gradient descent loss 0.06988939848806114
gradient descent loss 0.0696023115260533
gradient descent loss 0.0693591987810031
gradient descent loss 0.06915002612973116
gradient descent loss 0.06896789250973727
gradient descent loss 0.06880785188808104
gradient descent loss 0.06866622732844295
gradient descent loss 0.06854019747655961
gradient descent loss 0.06842753869095734
gradient descent loss 0.0683264587989534
gradient descent loss 0.06823548639681055


We will now try with logistic regression

In [None]:
def cross_validation_logistic(y, x, k_indices, k, lambda_, degree, gamma):
    """return the loss of ridge regression."""
    N = y.shape[0]
    k_fold = k_indices.shape[0]
    list_ = []
    interval = int(N/k_fold)
    for i in range(k_fold):
        if i != k:
            list_.append(i)
    x_training = np.zeros((int((k_fold-1)/k_fold*N), x.shape[1]))
    y_training = np.zeros(int((k_fold-1)/k_fold*N))
    for j in range(len(list_)):
        x_training[interval*(j):interval*(j+1), :] = x[np.array([k_indices[list_[j]]]), :]
    x_testing = x[k_indices[k], :]
    for j in range(len(list_)):
        y_training[interval*(j):interval*(j+1)] = y[np.array([k_indices[list_[j]]])]
    y_testing = y[k_indices[k]]
    x_training_augmented = build_poly(x_training, degree)
    x_testing_augmented = build_poly(x_testing, degree)
    #w_opt_training = ridge_regression(y_training, x_training_augmented, lambda_)
    _,  w_opt_training = learning_by_penalized_gradient(y_training, x_training_augmented,
                                                        np.zeros(x_training_augmented.shape[1]), gamma, 10, lambda_)
    loss_tr = compute_loss(y_training, x_training_augmented, w_opt_training) + lambda_*np.linalg.norm(w_opt_training) ** 2
    loss_te = compute_loss(y_testing, x_testing_augmented, w_opt_training) + lambda_*np.linalg.norm(w_opt_training) ** 2
    return loss_tr, loss_te

We perform cross validation in order to find the best parameters degree, lamdba and gamma caracterizing logistic regression

In [None]:
degrees = np.arange(2, 6)
lambdas = np.arange(2, 6) / 10
gammas = np.logspace(0.01, 10, 10)
k_fold = 3
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees), len(gammas)))
testing_loss = np.zeros((len(lambdas), len(degrees), len(gammas)))
k_indices = build_k_indices(y_0, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        for index3 in range(len(gammas)):
            train_loss = 0
            test_loss = 0
            for k in range(k_fold):
                loss_tr, loss_te = cross_validation_logistic(y_0, tX_tilda_0, k_indices, k,
                                                lambdas[index1], degrees[index2], gammas[index3])
                train_loss += loss_tr
                test_loss += loss_te
            training_loss[index1, index2, index3] = train_loss / k_fold
            testing_loss[index1, index2, index3] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
lambda_opt, degree_opt, gamma_opt = lambdas[best_result[0]],degrees[best_result[1]], gammas[best_result[2]]
print(lambda_opt, degree_opt, gamma_opt)

In [None]:
degrees = np.arange(2, 6)
lambdas = np.arange(2, 6) / 10
gammas = np.logspace(0.01, 10, 10)
k_fold = 3
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees), len(gammas)))
testing_loss = np.zeros((len(lambdas), len(degrees), len(gammas)))
k_indices = build_k_indices(y_0, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        for index3 in range(len(gammas)):
            train_loss = 0
            test_loss = 0
            for k in range(k_fold):
                loss_tr, loss_te = cross_validation_logistic(y_0, tX_tilda_1, k_indices, k,
                                                lambdas[index1], degrees[index2], gammas[index3])
                train_loss += loss_tr
                test_loss += loss_te
            training_loss[index1, index2, index3] = train_loss / k_fold
            testing_loss[index1, index2, index3] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
lambda_opt, degree_opt, gamma_opt = lambdas[best_result[0]],degrees[best_result[1]], gammas[best_result[2]]
print(lambda_opt, degree_opt, gamma_opt)

In [None]:
degrees = np.arange(2, 6)
lambdas = np.arange(2, 6) / 10
gammas = np.logspace(0.01, 10, 10)
k_fold = 3
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees), len(gammas)))
testing_loss = np.zeros((len(lambdas), len(degrees), len(gammas)))
k_indices = build_k_indices(y_0, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        for index3 in range(len(gammas)):
            train_loss = 0
            test_loss = 0
            for k in range(k_fold):
                loss_tr, loss_te = cross_validation_logistic(y_0, tX_tilda_2_3, k_indices, k,
                                                lambdas[index1], degrees[index2], gammas[index3])
                train_loss += loss_tr
                test_loss += loss_te
            training_loss[index1, index2, index3] = train_loss / k_fold
            testing_loss[index1, index2, index3] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
lambda_opt, degree_opt, gamma_opt = lambdas[best_result[0]],degrees[best_result[1]], gammas[best_result[2]]
print(lambda_opt, degree_opt, gamma_opt)

In [None]:
### Generate predictions and save ouput in csv format for submission:

In [53]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [54]:
print(tX_test.shape)

(568238, 30)


We will now format the tX_test as we did for tX_train

### we split the test into the three subgroups

In [55]:
zero_indices = []
one_indices = []
two_three_indices = []
zero_indices = np.where(tX_test[:,22]==0)[0]
one_indices = np.where(tX_test[:,22]==1)[0]
two_three_indices = np.where(np.logical_or(tX_test[:,22]==2, tX_test[:,22]==3))[0]
tX_test_0 = tX_test[zero_indices, :]
tX_test_0 = np.delete(tX_test_0, 22, axis=1)
tX_test_1 = tX_test[one_indices, :]
tX_test_1 = np.delete(tX_test_1, 22, axis=1)
tX_test_2_3 = tX_test[two_three_indices, :]

### Adding a column of zeros and ones to detect whether the mass has been measured or not

In [56]:
# take the indices where the mass is not calculated, add the column which has 0 in those indices
# and 1 everywhere else for all matrices 0,1,2_3
zero_indices_0 = np.where(tX_test_0[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_0 else 1 for i in range(tX_test_0.shape[0])])
tX_test_0 = np.insert(tX_test_0, 0, column_to_add, axis=1)
zero_indices_1 = np.where(tX_test_1[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_1 else 1 for i in range(tX_test_1.shape[0])])
tX_test_1 = np.insert(tX_test_1, 0, column_to_add, axis=1)
zero_indices_2_3 = np.where(tX_test_2_3[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_2_3 else 1 for i in range(tX_test_2_3.shape[0])])
tX_test_2_3 = np.insert(tX_test_2_3, 0, column_to_add, axis=1)

### We drop the same columns we have dropped for the X training

In [57]:
tX_test_0 = np.delete(tX_test_0, col_to_delete_0, axis=1)
tX_test_1 = np.delete(tX_test_1, col_to_delete_1, axis=1)

### Now we substitute the -999 values with the median

In [58]:
for i in range(1, tX_test_2_3.shape[1]):
    index_column_non_valid =np.where(tX_test_2_3[:,i] == -999.)[0]
    index_column_valid =np.where(tX_test_2_3[:,i] != -999.)[0]
    median = np.median(tX_test_2_3[index_column_valid, i], axis = 0)
    tX_test_2_3[index_column_non_valid,i] =  median

In [59]:
for i in range(1, tX_test_1.shape[1]):
    index_column_non_valid =np.where(tX_test_1[:,i] == -999.)[0]
    index_column_valid =np.where(tX_test_1[:,i] != -999.)[0]
    median = np.median(tX_test_1[index_column_valid, i], axis = 0)
    tX_test_1[index_column_non_valid,i] =  median

In [60]:
for i in range(1, tX_test_0.shape[1]):
    index_column_non_valid =np.where(tX_test_0[:,i] == -999.)[0]
    index_column_valid =np.where(tX_test_0[:,i] != -999.)[0]
    median = np.median(tX_test_0[index_column_valid, i], axis = 0)
    tX_test_0[index_column_non_valid,i] =  median

### We standardize the test set using the mean and the standard deviation of the training

In [61]:
print(tX_test_0.shape)

(227458, 19)


In [62]:
print(tX_0.shape)

(99913, 19)


In [63]:
def standardize_test(x, mean, std):
    """Standardize the test set."""
    x = x - mean
    x = x / std
    return x

In [64]:
tX_test_0[:,1:] = standardize_test(tX_test_0[:,1:], mean_0, std_0)
tX_test_1[:,1:] = standardize_test(tX_test_1[:,1:], mean_1, std_1)
tX_test_2_3[:,1:]= standardize_test(tX_test_2_3[:,1:], mean_2_3, std_2_3) #we standardize everything a part from the column added manually

### We insert the column for the bias term

In [65]:
tX_tilda_test_0 = np.insert(tX_test_0, 0, np.ones(tX_test_0.shape[0]), axis=1)
tX_tilda_test_1 = np.insert(tX_test_1, 0, np.ones(tX_test_1.shape[0]), axis=1)
tX_tilda_test_2_3 = np.insert(tX_test_2_3, 0, np.ones(tX_test_2_3.shape[0]), axis=1)

### We make the predictions ridge

In [None]:
tX_tilda_test_2_3_augmented = build_poly(tX_tilda_test_2_3, degree=6)
predictions_2_3 = tX_tilda_test_2_3_augmented @ w_ridge_2_3
print(predictions_2_3.shape)

In [None]:
tX_tilda_test_0_augmented = build_poly(tX_tilda_test_0, degree = 6)
predictions_0 = tX_tilda_test_0_augmented @ w_ridge_0
print(predictions_0.shape)

In [None]:
tX_tilda_test_1_augmented = build_poly(tX_tilda_test_1, degree=6)
predictions_1 = tX_tilda_test_1_augmented @ w_ridge_1
print(predictions_1.shape)

In [None]:
print(len(zero_indices))
print(len(one_indices))
print(len(two_three_indices))
print()

Now we have to reconstruct a single vector of predictions

In [None]:
#jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [None]:
stacked_predictions = []
count_0 =0
count_1 =0
count_2_3 =0
for index_row in range(tX_test.shape[0]):
    if index_row in zero_indices:
        stacked_predictions.append(predictions_0[count_0])
        count_0 = count_0 + 1
    elif index_row in one_indices:
        stacked_predictions.append(predictions_1[count_1])
        count_1 = count_1 +1
    else:
        stacked_predictions.append(predictions_2_3[count_2_3])
        count_2_3 = count_2_3 +1

In [None]:
final_predictions = np.array([-1 if el <0.5 else 1 for el in stacked_predictions])

In [None]:
print(final_predictions)

### we make predictions GD

In [66]:
tX_tilda_test_2_3_augmented = build_poly(tX_tilda_test_2_3, degree=3)
predictions_2_3 = tX_tilda_test_2_3_augmented @ w_opt_training_2_3
print(predictions_2_3.shape)
tX_tilda_test_0_augmented = build_poly(tX_tilda_test_0, degree = 2)
predictions_0 = tX_tilda_test_0_augmented @ w_opt_training_0
print(predictions_0.shape)
tX_tilda_test_1_augmented = build_poly(tX_tilda_test_1, degree=3)
predictions_1 = tX_tilda_test_1_augmented @ w_opt_training_1
print(predictions_1.shape)

(165442,)
(227458,)
(175338,)


In [67]:
stacked_predictions = []
count_0 =0
count_1 =0
count_2_3 =0
for index_row in range(tX_test.shape[0]):
    if index_row in zero_indices:
        stacked_predictions.append(predictions_0[count_0])
        count_0 = count_0 + 1
    elif index_row in one_indices:
        stacked_predictions.append(predictions_1[count_1])
        count_1 = count_1 +1
    else:
        stacked_predictions.append(predictions_2_3[count_2_3])
        count_2_3 = count_2_3 +1

In [68]:
final_predictions = np.array([-1 if el <0.5 else 1 for el in stacked_predictions])

In [69]:
def predict_labels(weights, tX_test):
    y = np.array(tX_test) @ np.array(weights)
    labels = [1 if l > 0 else -1 for l in y]
    return labels

In [70]:
OUTPUT_PATH = 'submission_GD.csv' # TODO: fill in desired name of output file for submission
#y_pred = predict_labels(weights, tX_test)
y_pred = final_predictions
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [71]:
print(final_predictions)

[-1 -1 -1 ...  1 -1 -1]
