In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import math
import random
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv' # train data path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
print(tX.shape)

(250000, 30)


### Changing the labels to {0,1}

In [4]:
# change the array from -1 1 to 0 1
# label simplification
# y == 0 non detected Boson, y == 1 detected Boson
y_ = np.array([0 if l == -1 else 1 for l in y])

# All of the following parts are the same for the test set, should be functions

# less code = better code (in this case)

### Dividing the features by the number of jets

In [5]:
# dividing the rows of tX by the number of jets, dropping the column Pri_Jet_Num and adding an extra column of np.ones
zero_indices = []
one_indices = []
two_three_indices = []
zero_indices = np.where(tX[:,22]==0)[0]
one_indices = np.where(tX[:,22]==1)[0]
two_three_indices = np.where(np.logical_or(tX[:,22]==2, tX[:,22]==3))[0]
#Use indices to seperate the testing samples into the respective arrays
tX_0 = tX[zero_indices, :]
tX_0 = np.delete(tX_0, 22, axis=1)
tX_1 = tX[one_indices, :]
tX_1 = np.delete(tX_1, 22, axis=1)
tX_2_3 = tX[two_three_indices, :]

### Dividing also the output by the type of particle

In [6]:
y_0 = y_[zero_indices]
y_1 = y_[one_indices]
y_2_3 = y_[two_three_indices]

### Adding a column of zeros and ones to detect whether the mass has been measured or not

In [7]:
# take the indices where the mass is not calculated, add the column which has 0 in those indices
# and 1 everywhere else for all matrices 0,1,2_3
zero_indices_0 = np.where(tX_0[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_0 else 1 for i in range(tX_0.shape[0])])
tX_0 = np.insert(tX_0, 0, column_to_add, axis=1)
zero_indices_1 = np.where(tX_1[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_1 else 1 for i in range(tX_1.shape[0])])
tX_1 = np.insert(tX_1, 0, column_to_add, axis=1)
zero_indices_2_3 = np.where(tX_2_3[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_2_3 else 1 for i in range(tX_2_3.shape[0])])
tX_2_3 = np.insert(tX_2_3, 0, column_to_add, axis=1)

### Throwing away the outliers from the training data
Is this necessary? For logistic regression, should be done only for Linear regression, imo.

In [8]:
for i in range(1, tX_2_3.shape[1]):
    index_column_valid =np.where(tX_2_3[:,i] != -999.)[0] # cut out all the wrong measurements
    # take the center of the sampledistribution
    column_25_quantile, column_75_quantile = np.quantile(tX_2_3[index_column_valid,i], 
                                                         np.array([0.25, 0.75]))
    interquantile = column_75_quantile-column_25_quantile
    column_15_quantile, column_85_quantile = np.quantile(tX_2_3[index_column_valid,i], 
                                                         np.array([0.15, 0.85]))
    #determine the outliears given that they have a large divergance from the rest of the values
    indices_outliers = np.where((column_15_quantile - 1.5 * interquantile >= tX_2_3[index_column_valid,i])
                                             | (tX_2_3[index_column_valid,i] >= 
                                                column_85_quantile + 1.5 * interquantile))[0]
    #indices_outliers = np.argwhere((tX_tilda_2_3[index_column_valid,i] >= column_85_quantile + 1.5 * interquantile) | 
                                  #(column_15_quantile - 1.5 * interquantile >= tX_tilda_2_3[index_column_valid,i]))
    #median of the valid indices
    median = np.median(tX_2_3[index_column_valid, i], axis = 0)
    #print(np.sort(tX_tilda_2_3[index_column_valid[indices_outliers],i]).T)
    #print(np.where(tX_tilda_2_3[indices_outliers,i])==-999.)
    #print(median)
    #replace the outliers with the calculated median
    tX_2_3[index_column_valid[indices_outliers],i] =  median

In [9]:
#apart from replacing the outliers in valid columns like in the 2_3 case we erase columns that are invalid
# invalid means that all their values are -999. since such columns have no meaning
#initialize the list of columns that need to be deleted
col_to_delete_0 = []
for i in range(1, tX_0.shape[1]):
    #check if the column has any valid indices 
    index_column_valid =np.where(tX_0[:,i] != -999.)[0]
    if len(index_column_valid)==0:
        #we drop the column (we will have to do the same for the test set as well)
        col_to_delete_0.append(i)
    else :
        column_25_quantile, column_75_quantile = np.quantile(tX_0[index_column_valid,i], 
                                                         np.array([0.25, 0.75]))
        interquantile = column_75_quantile-column_25_quantile
        column_15_quantile, column_85_quantile = np.quantile(tX_0[index_column_valid,i], 
                                                         np.array([0.15, 0.85]))
        indices_outliers = np.where((column_15_quantile - 1.5 * interquantile >= tX_0[index_column_valid,i])
                                             | (tX_0[index_column_valid,i] >= 
                                                column_85_quantile + 1.5 * interquantile))[0]
        #indices_outliers = np.argwhere((tX_tilda_2_3[index_column_valid,i] >= column_85_quantile + 1.5 * interquantile) | 
                                  #(column_15_quantile - 1.5 * interquantile >= tX_tilda_2_3[index_column_valid,i]))
        median = np.median(tX_0[index_column_valid, i], axis = 0)
        #print(np.sort(tX_tilda_2_3[index_column_valid[indices_outliers],i]).T)
        #print(np.where(tX_tilda_2_3[indices_outliers,i])==-999.)
        #print(median)
        tX_0[index_column_valid[indices_outliers],i] =  median
col_to_delete_0.append(tX_0.shape[1]-1)
tX_0 = np.delete(tX_0, col_to_delete_0, axis=1)
print(tX_0.shape)
print(col_to_delete_0)

(99913, 19)
[5, 6, 7, 13, 23, 24, 25, 26, 27, 28, 29]


In [10]:
# we follow the same procedure as for the Pjet_In 0
col_to_delete_1 = []
for i in range(1, tX_1.shape[1]):
    index_column_valid =np.where(tX_1[:,i] != -999.)[0]
    if len(index_column_valid)==0:
        #we drop the column (we will have to do the same for the test set as well)
        col_to_delete_1.append(i)
    else :
        column_25_quantile, column_75_quantile = np.quantile(tX_1[index_column_valid,i], 
                                                         np.array([0.25, 0.75]))
        interquantile = column_75_quantile-column_25_quantile
        column_15_quantile, column_85_quantile = np.quantile(tX_1[index_column_valid,i], 
                                                         np.array([0.15, 0.85]))
        indices_outliers = np.where((column_15_quantile - 1.5 * interquantile >= tX_1[index_column_valid,i])
                                             | (tX_1[index_column_valid,i] >= 
                                                column_85_quantile + 1.5 * interquantile))[0]
        #indices_outliers = np.argwhere((tX_tilda_2_3[index_column_valid,i] >= column_85_quantile + 1.5 * interquantile) | 
                                  #(column_15_quantile - 1.5 * interquantile >= tX_tilda_2_3[index_column_valid,i]))
        median = np.median(tX_1[index_column_valid, i], axis = 0)
        #print(np.sort(tX_tilda_2_3[index_column_valid[indices_outliers],i]).T)
        #print(np.where(tX_tilda_2_3[indices_outliers,i])==-999.)
        #print(median)
        tX_1[index_column_valid[indices_outliers],i] =  median
tX_1 = np.delete(tX_1, col_to_delete_1, axis=1)
print(col_to_delete_1)

[5, 6, 7, 13, 26, 27, 28]


### Now we substitute the -999 values with the median

In [11]:
for i in range(1, tX_2_3.shape[1]):
    index_column_non_valid =np.where(tX_2_3[:,i] == -999.)[0] #find invalid indices
    index_column_valid =np.where(tX_2_3[:,i] != -999.)[0] #find valid indices
    median = np.median(tX_2_3[index_column_valid, i], axis = 0) #calculate the median of the valid indices
    tX_2_3[index_column_non_valid,i] =  median

In [12]:
for i in range(1, tX_1.shape[1]):
    index_column_non_valid =np.where(tX_1[:,i] == -999.)[0] #find invalid indices
    index_column_valid =np.where(tX_1[:,i] != -999.)[0] #find valid indices
    median = np.median(tX_1[index_column_valid, i], axis = 0) #calculate the median of the valid indices
    tX_1[index_column_non_valid,i] =  median

In [13]:
for i in range(1, tX_0.shape[1]):
    index_column_non_valid =np.where(tX_0[:,i] == -999.)[0] #find invalid indices
    index_column_valid =np.where(tX_0[:,i] != -999.)[0] #find valid indices
    median = np.median(tX_0[index_column_valid, i], axis = 0) #calculate the median of the valid indices
    tX_0[index_column_non_valid,i] =  median

### Now we standardize the data

In [14]:
tX_2_3[:,1:], mean_2_3,std_2_3 = standardize(tX_2_3[:,1:]) #we standardize everything a part from the column added manually

In [15]:
print(tX_2_3)
print(np.count_nonzero(tX_2_3 == -999.))

[[ 1.          0.97351249  0.46588281 ...  0.61614788 -1.36131161
  -0.70374641]
 [ 1.         -0.82851663 -0.77157038 ...  0.11608109  1.71034105
   0.2995537 ]
 [ 1.          1.3538447  -0.27431587 ...  0.07030726 -1.52202162
   0.12704911]
 ...
 [ 1.          0.04006392  0.02513449 ...  0.25930888  0.22982758
   0.42357227]
 [ 1.          0.66304099 -1.0843679  ...  0.29031696 -1.21821366
  -0.20699627]
 [ 1.          0.04006392  0.31977857 ... -0.02271698 -0.62490751
   0.05569682]]
0


In [16]:
print(tX_0)

[[ 1.00000e+00  1.43905e+02  8.14170e+01 ...  3.10820e+01  6.00000e-02
   8.60620e+01]
 [ 1.00000e+00  1.75864e+02  1.69150e+01 ...  2.72300e+00 -8.71000e-01
   5.31310e+01]
 [ 1.00000e+00  1.05594e+02  5.05590e+01 ...  3.77910e+01  2.40000e-02
   1.29804e+02]
 ...
 [ 1.00000e+00  1.11452e+02  5.81790e+01 ...  4.67370e+01 -8.67000e-01
   8.04080e+01]
 [ 1.00000e+00  9.49510e+01  1.93620e+01 ...  1.21500e+01  8.11000e-01
   1.12718e+02]
 [ 1.00000e+00  1.11452e+02  7.27560e+01 ...  4.07290e+01 -1.59600e+00
   9.94050e+01]]


In [17]:
tX_0[:,1:],mean_0,std_0 = standardize(tX_0[:,1:]) 

In [18]:
print(tX_0)

[[ 1.          1.05744907  0.7827665  ...  0.01825038  0.04662815
  -0.7724943 ]
 [ 1.          2.14505538 -1.37519852 ... -1.71504715 -0.4674532
  -1.44727052]
 [ 1.         -0.24632406 -0.2496121  ...  0.42830338  0.0267496
   0.12380588]
 ...
 [ 1.         -0.0469687   0.00532098 ...  0.97508147 -0.46524447
  -0.8883482 ]
 [ 1.         -0.60851918 -1.29333222 ... -1.13887042  0.46131675
  -0.22629665]
 [ 1.         -0.0469687   0.49300595 ...  0.60787347 -0.86778508
  -0.49908812]]


In [19]:
tX_1[:,1:],mean_1,std_1 = standardize(tX_1[:,1:])

In [20]:
print(tX_1)

[[ 1.00000000e+00  1.55539992e+00  7.27047143e-01 ...  3.98445313e-01
   6.45414781e-01 -4.14297220e-01]
 [ 1.00000000e+00  1.45598722e-03  3.58462301e+00 ...  1.12748232e+00
  -1.10752634e+00 -4.89864560e-01]
 [ 1.00000000e+00  1.36261180e+00 -1.05809645e+00 ... -3.92076740e-01
  -9.40265168e-01 -1.01072441e+00]
 ...
 [ 1.00000000e+00  1.45598722e-03  1.01732036e+00 ... -4.67286130e-01
  -3.80160315e-01  8.39087556e-01]
 [ 1.00000000e+00  6.75509966e-01  9.95415259e-01 ... -6.76994064e-01
   1.39533906e+00  5.32418071e-01]
 [ 1.00000000e+00 -2.21030015e-01  4.74893699e-01 ...  9.88591985e-01
  -8.30516498e-02 -5.76298292e-01]]


### We insert the column for the bias term

In [21]:
tX_tilda_0 = np.insert(tX_0, 0, np.ones(tX_0.shape[0]), axis=1)
tX_tilda_1 = np.insert(tX_1, 0, np.ones(tX_1.shape[0]), axis=1)
tX_tilda_2_3 = np.insert(tX_2_3, 0, np.ones(tX_2_3.shape[0]), axis=1)

In [22]:
print(tX_tilda_0)

[[ 1.          1.          1.05744907 ...  0.01825038  0.04662815
  -0.7724943 ]
 [ 1.          1.          2.14505538 ... -1.71504715 -0.4674532
  -1.44727052]
 [ 1.          1.         -0.24632406 ...  0.42830338  0.0267496
   0.12380588]
 ...
 [ 1.          1.         -0.0469687  ...  0.97508147 -0.46524447
  -0.8883482 ]
 [ 1.          1.         -0.60851918 ... -1.13887042  0.46131675
  -0.22629665]
 [ 1.          1.         -0.0469687  ...  0.60787347 -0.86778508
  -0.49908812]]


In [23]:
# colors = ['red', 'blue']
# x_pos=[]
# x_neg=[]

# for j in range(len(y)):
#  if(y[j]==1):
#       x_pos.insert(0,tX[j])
#    else:
#        x_neg.insert(0,tX[j])
# xpos = np.array(x_pos)
# xneg = np.array(x_neg)
# for i in range(tX.shape[1]):
#  plt.hist(xpos[:,i], alpha = 0.5, color = 'r', bins = 100)
#  plt.hist(xneg[:,i], alpha = 0.5, color = 'b', bins = 100)
#  plt.show()

## Do your thing crazy machine learning thing here :) ...

## The following functions have to do with the linear regression model

In [24]:
# compute the MSE Loss
def compute_loss(y, tx, w):
    N = y.shape[0] # N = Number of samples
    e = y - tx @ w # e = error vector (truth - prediction)
    loss = 1/(2*N) * np.dot(e,e) # calculate the average loss
    return loss

In [25]:
# compute the gradient for the MSE loss function
def compute_gradient(y, tx, w):
    N = y.shape[0] # N = Number of samples
    e = y - tx @ w # e = error vector (truth - prediction)
    gradient = -(1/N) * (tx.T) @ (e) # calculate the gradient
    return gradient

# Should we replace ws? We dont need it anymore and it is reduntant

# Less things allow for more readable code

In [26]:
# do the gradient descent algorithm
# max_iters = the maximum number of repetitions the algorithm is allowed to do 
# gamma = the step of the function in the direction of the gradient
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    ws = [initial_w]# A list of all the weights
    losses = []                                                              
    w = initial_w # Initialization of the weights
    for n_iter in range(max_iters):
        loss = compute_loss(y,tx,w) # calculate the MSE loss
        gradient = compute_gradient(y,tx,w) # calculate the gradient 
        w = w - gamma * gradient # conduct a step of gradient descent
        ws.append(w) # append the current weight
        losses.append(loss) # compute the next loss
        # print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
        # bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    return losses, ws

In [27]:
# Use cross validation to determine optimal data augmenttion
# cross validation function for gradient descent
# k-indices = random subsets of the original samples
# k the set that we will use as the test set out of the subsets
# degree = the degree up to which we will exponentiate each feature
def cross_validation_GD(y, x, k_indices, k, degree, gamma = 3.0e-02):
    N = y.shape[0] # N = number of samples
    k_fold = k_indices.shape[0] # number of seperated sets
    list_ = []
    interval = int(N/k_fold) # the length of each subset
    # this is not well written and should be changed it is very hard to understand
    # Create a list of the indices of subsets that are supposed to be used as a training set
    for i in range(k_fold):
        if i != k:
            list_.append(i)
    x_training = np.zeros((int((k_fold-1)/k_fold*N), x.shape[1]))
    y_training = np.zeros(int((k_fold-1)/k_fold*N))
    for j in range(len(list_)):
        x_training[interval*(j):interval*(j+1), :] = x[np.array([k_indices[list_[j]]]), :]
    for j in range(len(list_)):
        y_training[interval*(j):interval*(j+1)] = y[np.array([k_indices[list_[j]]])]
    # get the testing set out of the remaining set
    x_testing = x[k_indices[k], :]
    y_testing = y[k_indices[k]]
    # augment the testing and training set feature vectors
    x_training_augmented = build_poly(x_training, degree)
    x_testing_augmented = build_poly(x_testing, degree)
    #w_opt_training = ridge_regression(y_training, x_training_augmented, lambda_)
    # calculate the optimal weights
    _,  w_opt_training = least_squares_GD(y_training, x_training_augmented,
                                                        np.zeros(x_training_augmented.shape[1]), 1000, gamma)
    # calculate losses for the training and test set respectively and return them
    loss_tr = calculate_loss(y_training, x_training_augmented, w_opt_training[-1])
    loss_te = calculate_loss(y_testing, x_testing_augmented, w_opt_training[-1])
    return loss_tr, loss_te

In [28]:
# compute the stochastic gradient of a random sample
def compute_stoch_gradient(y, tx, w):
    N = y.shape[0]# number of samples
    random_number = random.randint(0,N)# generate random index
    xn = tx[random_number,:]# get sample of that index
    random_gradient = - np.dot(xn, y[random_number] - np.dot(xn,w))# calculate the stochastic gradient
    return random_gradient

In [29]:
# Implementation of the stochastic gradient descent algorithm
# max_iters = the maximum number of repetitions the algorithm is allowed to do 
# gamma = the step of the function in the direction of the gradient
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    ws = [initial_w]                                                      #initialize weight list
    losses = [] # initialize list of losses
    w = initial_w # intialize the weights for the first iteration
    for n_iter in range(max_iters):
        loss = compute_loss(y,tx,w) # compute MSE loss for all samples
        stoch_gradient = compute_stoch_gradient(y,tx,w) # calculate stochastic gradient
        w = w - gamma * stoch_gradient # update the weights using the stochastic gradient 
        ws.append(w) # append the next weight
        losses.append(loss) # append the current loss to the list
        # print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
        #    bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    return losses, ws

In [30]:
from proj1_helpers import *

# least squares minimizer using normal equations.
def least_squares(y, tx):
    # calculate the forcing term and the coefficient matrix respectively
    forcing_term = np.transpose(tx) @ y 
    coefficient_matrix = np.transpose(tx) @ tx 
    
    w = np.linalg.solve(coefficient_matrix, forcing_term) # solve the linear equation for w
    return w

# do we need this?
def test_your_least_squares(y, tx):
    """compare the solution of the normal equations with the weights returned by gradient descent algorithm."""
    w_least_squares = least_squares(y, tx)
    initial_w = np.zeros(tx.shape[1])
    max_iters = 50
    gamma = 0.7
    losses_gradient_descent, w_gradient_descent = gradient_descent(y, tx, initial_w, max_iters, gamma)
    w = w_gradient_descent[-1]
    err = np.linalg.norm(w_least_squares-w)
    return err

In [31]:
# Use the ridge regression formula
def ridge_regression(y, tx, lambda_):
    N = tx.shape #get the dimensions of x
    lambda_prime = 2 * N[0] * lambda_ #calculate the new lambda of the gradient formula
    
    # calculate the coefficient matrix and the forcing term respectively
    coefficient_matrix = np.transpose(tx) @ tx + lambda_prime * np.eye(N[1]) 
    forcing_term = np.transpose(tx) @ y
    
    w = np.linalg.solve(coefficient_matrix, forcing_term) #calculate the w with the ridge normal equation
    return w

#do we need this?
def debug_ridge(y, tx):
    """debugging the ridge regression by setting lambda=0."""
    w_least_squares = least_squares(y, tx)
    w_0 = ridge_regression(y, tx, 0)
    err = np.linalg.norm(w_least_squares-w_0)
    return err

## The Following functions correspond to the logistic regression model

In [32]:
#commpute the sigmoid function for a vector t
def sigmoid(t):
    # seperate the indices where t is negative from the ones where t is non-negative
    positive_indices = np.where(t >= 0)[0]
    negative_indices = np.where(t < 0)[0]
    
    #calculate the output seperately for the positive values of t and for the negative ones
    # this is done in order to avoid numerical problems s.a. overflow,division by 0 e.t.c. 
    z = np.zeros(len(t))
    z[positive_indices] = 1 / (1+np.exp(-t[positive_indices]))
    z[negative_indices] = np.exp(t[negative_indices]) / (1 + np.exp(t[negative_indices]))
    return z

In [33]:
#calculate the loss for the losistic 
def calculate_loss(y, tx, w):
    #seperate the indices where the prediction is positive and where the prediction is negative 
    pos_ind = np.where(tx @ w >=0)[0]
    neg_ind = np.where(tx @ w <0)[0]
    #calculate the loss for both the positive and the negative indices.
    loss_pos = - y[pos_ind] * (tx @ w)[pos_ind] + (tx @ w)[pos_ind] + np.log(1+np.exp(-(tx @ w)[pos_ind]))# I am a little curious about th minus here
    loss_neg = - y[neg_ind] * (tx @ w)[neg_ind] - (tx @ w)[neg_ind] + np.log(1+np.exp((tx @ w)[neg_ind]))
    return loss_pos.sum() + loss_neg.sum()

In [34]:
# calculate the gradient for logistic regression
def calculate_gradient(y, tx, w):
    return np.transpose(tx) @ (sigmoid(tx @ w) - y)

In [35]:
# Use the gradient descent method for the 
def learning_by_gradient_descent(y, tx, w_initial, gamma, max_iters):
    losses = [] #list of losses
    w = w_initial #set initial weight for the first iteration
    for iter in range(max_iters):
        grad = calculate_gradient(y, tx, w) # calculate the gradient for a single iteration of the algorithm
        w = w - gamma * grad # upate the weights given the gradient and the step
        if iter %25 == 0:
            gamma = gamma/2 # decrease the gamma after some iterations pass to increase the accuracy
        loss = calculate_loss(y, tx, w) # calculate loss
        losses.append(loss) # append loss to the list of losses
    return losses, w

In [36]:
# not needed now right?
losses1, w1 = learning_by_gradient_descent(y_0, tX_tilda_0, np.zeros(tX_tilda_0.shape[1]), 0.8, 10000)
#print(losses1)

[5179248944.798143, 4548292501.191404, 3873264747.21586, 3204189882.428973, 2558766497.952185, 1925840421.0113559, 1358134428.651399, 806750192.3189511, 2000410682.432347, 1601256172.1407375, 2113524821.032508, 1606047298.1298578, 1990415844.7863722, 1640858641.9755054, 1934131711.4022388, 1285499978.8239312, 2039587553.5306728, 1427677293.0631826, 1887233645.3011854, 1447488537.8697572, 1922910135.0623963, 1269966212.241833, 1955256377.1864235, 1400327379.5162978, 1845506548.8768008, 1264731822.7937446, 1931658652.6046593, 1314650368.8541703, 1862692520.7874825, 1330058160.1396334, 1877959970.337535, 1229045186.8782659, 1902392318.7812443, 1308156948.8224394, 1844261214.6759772, 1241157611.7495239, 1894851670.650667, 1255749330.3854282, 1860264536.3924847, 1274839497.8584266, 1859682719.8045282, 1218658685.3278322, 1880581668.5445318, 1258713077.5692606, 1850203797.983286, 1235051327.6527336, 1873359564.5719733, 1230356581.4292586, 1859083288.6187131, 1244041475.3110273, 1856766909.60

In [37]:
# I believe that the second order is used nowhere and should be removed since it is also arithmetically unstable
def calculate_hessian(y, tx, w):
    """return the Hessian of the loss function."""
    diag = sigmoid(tx @ w) * (1 - sigmoid(tx @ w))
    D = diag * np.eye(tx.shape[0])
    return np.transpose(tx) @ D @ tx

In [38]:
#calculate the values of the logistic regression
def logistic_regression(y, tx, w):
    grad = calculate_gradient(y, tx, w) #calculate the gradient
    hess = calculate_hessian(y, tx, w) #
    loss = calculate_loss(y, tx, w) # calcualte the loss
    return loss, grad, hess

In [39]:
# Should this be removed as well since we only have to use th gradient method
def learning_by_newton_method(y, tx, w, gamma):
    loss, grad, hess = logistic_regression(y, tx, w)
    sol = np.linalg.solve(hess, grad)
    w = w - gamma * sol
    return loss, w

In [40]:
# Compute the logistic regression with an L2 regularizer
def penalized_logistic_regression(y, tx, w, lambda_):
    """return the loss, gradient"""
    loss = calculate_loss(y, tx, w) + lambda_*np.linalg.norm(w) ** 2
    grad = calculate_gradient(y, tx, w) + 2*lambda_*w
    hess = calculate_hessian(y, tx, w) + 2*lambda_*np.eye(w.shape[0])
    return loss, grad, hess

In [41]:
# Execute one step of regularized logistic regression
def learning_by_penalized_gradient(y, tx, w_initial, gamma, max_iters, lambda_):
    threshold = 1e-8 # threshold to stop execution
    losses = []
    w = w_initial # initialize weights for the first iteration of the algorithm
    for iter in range(max_iters):
        grad = calculate_gradient(y, tx, w) + 2*lambda_*w # calcualte gradient
        w = w - gamma * grad # update the weights given the gradient and the step
        loss = calculate_loss(y, tx, w) + lambda_*np.linalg.norm(w) ** 2 # compute loss
        losses.append(loss) # append loss to the list of losses
        if iter % 25 == 0:
            gamma = gamma / 2 # decrease gamma after some iterations to increase the fitting of the parameters 
            print("Current iteration={i}, loss={l}".format(i=iter, l=loss))
        # converge criterion
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    return losses, w

In [42]:
# augment the feauture vector X
def build_poly(x, degree):
    powers = np.arange(1, degree + 1) #degrees of polynomial 
    phi = np.column_stack([np.power(x[:,0], exponent) for exponent in powers]) #extend the vector by putting the power
                                                                               # of each feature of each sample
    for i in range(1, x.shape[1]):
        phi_i = np.column_stack([np.power(x[:,i], exponent) for exponent in powers]) 
        phi = np.column_stack([phi, phi_i])
    return phi

In [43]:
# build the k vector of shuffled indices
def build_k_indices(y, k_fold, seed):
    N = y.shape[0] # number of samples
    np.random.seed(seed) # initialize random seed
    interval = int(np.floor(N / k_fold)) # number of samples in each subset
    indices = np.random.permutation(N) # return an array of rnadomly order indices in [0,N)
    k_indices = [indices[k * interval: (k + 1) * interval] 
                 for k in range(k_fold)] # crete the sub arrays of indices in k-intervals
    return np.array(k_indices)

In [44]:
# cross validation function for ridge regression
# k-indices = random subsets of the original samples
# k the set that we will use as the test set out of the subsets
# lambda_ = the lambda for the regularization function
# degree = the degree up to which we will exponentiate each feature
def cross_validation_ridge(y, x, k_indices, k, lambda_, degree):
    N = y.shape[0] # number of samples 
    k_fold = k_indices.shape[0] # number of seperated sets
    list_ = []
    interval = int(N/k_fold) # the length of each subset
    # this is not well written and should be changed it is very hard to understand
    # Create a list of the indices of subsets that are supposed to be used as a training set
    for i in range(k_fold):
        if i != k:
            list_.append(i)
    # create the training set out of these indices
    x_training = np.zeros((int((k_fold-1)/k_fold*N), x.shape[1])) 
    y_training = np.zeros(int((k_fold-1)/k_fold*N))
    for j in range(len(list_)):
        x_training[interval*(j):interval*(j+1), :] = x[np.array([k_indices[list_[j]]]), :]
    for j in range(len(list_)):
        y_training[interval*(j):interval*(j+1)] = y[np.array([k_indices[list_[j]]])]
    # get the testing set out of the remaining set
    x_testing = x[k_indices[k], :]
    y_testing = y[k_indices[k]]
    # augment the testing and training set feature vectors
    x_training_augmented = build_poly(x_training, degree)
    x_testing_augmented = build_poly(x_testing, degree)
    # get optimal weights
    w_opt_training = ridge_regression(y_training, x_training_augmented, lambda_)
    # calculate losses for the training and test set respectively and return them
    loss_tr = compute_loss(y_training, x_training_augmented, w_opt_training)
    loss_te = compute_loss(y_testing, x_testing_augmented, w_opt_training)
    return loss_tr, loss_te

## The following cells of code aim at finetuning the hyperparameters of the code
Each cell finetunes the hyperparameters for each case where Pri_Jet_Num is 0 or 1 or 2_3 so they are identical apart from the change in this parameters.
## Create one fucntion and call it thrice

In [45]:
degrees = np.arange(2, 7) # create array of degrees
lambdas = np.logspace(-5,0,15) #create array of lambdas
k_fold = 5 # number of subsets to split the training set into
seed = 1 # initialise the seed for the randomizer
training_loss = np.zeros((len(lambdas), len(degrees)))# initial 2-d array for the grid search or the lambads and the degrees
testing_loss = np.zeros((len(lambdas), len(degrees))) # initial 2-d array for the grid search or the lambads and the degrees
k_indices = build_k_indices(y_0, k_fold, seed) #create the subarrays for the cross_validation
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        train_loss = 0 # initialize the training loss for each repetition
        test_loss = 0 # initialize the test loss for each repetition
        
        #run the cross validation for each possible split into test-train
        for k in range(k_fold):
            loss_tr, loss_te = cross_validation_ridge(y_0, tX_tilda_0, k_indices, k,
                                                lambdas[index1], degrees[index2])
            train_loss += loss_tr# increase the training loss for this execution
            test_loss += loss_te# increase the test loss for this eecution
        training_loss[index1, index2] = train_loss / k_fold # save the average of the training loss
        testing_loss[index1, index2] = test_loss / k_fold # save the average of the testing loss
best_result = np.where(testing_loss == np.amin(testing_loss)) # get the optimal index for the hyper parameters
print(testing_loss)
# get and print the optimal values
lambda_opt, degree_opt = lambdas[best_result[0]],degrees[best_result[1]] 
print(lambda_opt, degree_opt) 

[[0.06229007 0.06054168 0.05997957 0.05969034 0.05950786]
 [0.06229007 0.06054167 0.05997956 0.05969032 0.05950778]
 [0.06229006 0.06054165 0.05997954 0.05969026 0.05950758]
 [0.06229003 0.06054161 0.0599795  0.05969015 0.05950716]
 [0.06228999 0.06054157 0.05997947 0.05968998 0.05950642]
 [0.06228999 0.06054165 0.05997968 0.05969003 0.05950573]
 [0.06229041 0.06054265 0.05998142 0.05969194 0.05950806]
 [0.06229306 0.06054829 0.05999042 0.05970297 0.05952563]
 [0.06230481 0.06057272 0.06002762 0.05974795 0.05959274]
 [0.06234758 0.06065782 0.06015417 0.0598943  0.05977394]
 [0.06248079 0.06089653 0.06050353 0.06027633 0.0601499 ]
 [0.06283664 0.0614592  0.06125489 0.06105903 0.06079715]
 [0.06371731 0.06264497 0.0624934  0.06231165 0.06185065]
 [0.06585003 0.0648964  0.06417547 0.06401799 0.06359233]
 [0.07042634 0.06869078 0.06642577 0.0662836  0.06617371]]
[0.00061054] [6]


In [46]:
degrees = np.arange(2, 7)
lambdas = np.logspace(-5,0,15)
k_fold = 5
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees)))
testing_loss = np.zeros((len(lambdas), len(degrees)))
k_indices = build_k_indices(y_1, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        train_loss = 0
        test_loss = 0
        for k in range(k_fold):
            loss_tr, loss_te = cross_validation_ridge(y_1, tX_tilda_1, k_indices, k, 
                                                lambdas[index1], degrees[index2])
            train_loss += loss_tr
            test_loss += loss_te
        training_loss[index1, index2] = train_loss / k_fold
        testing_loss[index1, index2] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
print(testing_loss)
lambda_opt, degree_opt = lambdas[best_result[0]], degrees[best_result[1]]
print(lambda_opt, degree_opt)

[[0.08022969 0.07472501 0.07384662 0.07246893 0.07232976]
 [0.08022968 0.07472499 0.0738466  0.0724689  0.07232973]
 [0.08022967 0.07472497 0.07384655 0.07246883 0.07232964]
 [0.08022965 0.07472492 0.07384647 0.07246878 0.07232975]
 [0.08022963 0.07472489 0.07384646 0.07246911 0.07233081]
 [0.08022972 0.07472521 0.07384724 0.07247147 0.0723349 ]
 [0.08023057 0.07472777 0.07385246 0.07248172 0.07234905]
 [0.08023556 0.07474199 0.07387736 0.07252124 0.07240095]
 [0.08025961 0.07480703 0.07397579 0.07266608 0.07257992]
 [0.08035669 0.07505001 0.07430176 0.07312224 0.0730842 ]
 [0.08067315 0.0757488  0.07515953 0.07424121 0.07419037]
 [0.08150696 0.07726097 0.07689265 0.07633714 0.07609521]
 [0.08339738 0.0799469  0.07970283 0.07941736 0.07874686]
 [0.08726569 0.08438512 0.08365394 0.08323829 0.08203457]
 [0.09413339 0.09118387 0.08869685 0.08773981 0.08631216]]
[5.17947468e-05] [6]


In [None]:
degrees = np.arange(2, 7)
lambdas = np.logspace(-5,0,15)
k_fold = 5
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees)))
testing_loss = np.zeros((len(lambdas), len(degrees)))
k_indices = build_k_indices(y_2_3, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        train_loss = 0
        test_loss = 0
        for k in range(k_fold):
            loss_tr, loss_te = cross_validation_ridge(y_2_3, tX_tilda_2_3, k_indices, k,
                                            lambdas[index1], degrees[index2])
            train_loss += loss_tr
            test_loss += loss_te
        training_loss[index1, index2] = train_loss / k_fold
        testing_loss[index1, index2] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
lambda_opt, degree_opt = lambdas[best_result[0]], degrees[best_result[1]]
print(testing_loss)
print(lambda_opt, degree_opt) 

We train the model for ridge regression with the best hyperparameters

In [None]:
#Augment the feauture vector and calculate the optimal weights for ridge regression
# for Pjet_In 0
tX_tilda_0_augmented = build_poly(tX_tilda_0, degree = 6)
w_ridge_0 = ridge_regression(y_0, tX_tilda_0_augmented, lambda_= 0.00061054)
#print(w_ridge_0)

In [None]:
# for Pjet_In 1
tX_tilda_1_augmented = build_poly(tX_tilda_1, degree=6)
w_ridge_1 = ridge_regression(y_1, tX_tilda_1_augmented, lambda_= 5.17947468e-05)
#print(w_ridge_1)

In [None]:
# for Pjet_In 2_3
tX_tilda_2_3_augmented = build_poly(tX_tilda_2_3, degree=6)
w_ridge_2_3 = ridge_regression(y_2_3, tX_tilda_2_3_augmented, lambda_= 0.00026827)
#print(w_ridge_2_3)

We will now try with logistic regression

In [None]:
# cross validation function for logistic regression
# k-indices = random subsets of the original samples
# k the set that we will use as the test set out of the subsets
# lambda_ = the lambda for the regularization function
# degree = the degree up to which we will exponentiate each feature
def cross_validation_logistic(y, x, k_indices, k, lambda_, degree, gamma = 3.0e-02):
    """return the loss of ridge regression."""
    N = y.shape[0] # the number of features
    k_fold = k_indices.shape[0]
    list_ = []
    interval = int(N/k_fold) # the length of each subset
    # this is not well written and should be changed it is very hard to understand
    # Create a list of the indices of subsets that are supposed to be used as a training set
    for i in range(k_fold):
        if i != k:
            list_.append(i)
    x_training = np.zeros((int((k_fold-1)/k_fold*N), x.shape[1]))
    y_training = np.zeros(int((k_fold-1)/k_fold*N))
    for j in range(len(list_)):
        x_training[interval*(j):interval*(j+1), :] = x[np.array([k_indices[list_[j]]]), :]
    for j in range(len(list_)):
        y_training[interval*(j):interval*(j+1)] = y[np.array([k_indices[list_[j]]])]
    # get the testing set out of the remaining set
    x_testing = x[k_indices[k], :]
    y_testing = y[k_indices[k]]
    # augment the testing and training set feature vectors
    x_training_augmented = build_poly(x_training, degree)
    x_testing_augmented = build_poly(x_testing, degree)
    #w_opt_training = ridge_regression(y_training, x_training_augmented, lambda_)
    # get optimal weights
    _,  w_opt_training = learning_by_penalized_gradient(y_training, x_training_augmented,
                                                        np.ones(x_training_augmented.shape[1]), gamma, 1000, lambda_)
    # calculate losses for the training and test set respectively and return them
    loss_tr = calculate_loss(y_training, x_training_augmented, w_opt_training)
    loss_te = calculate_loss(y_testing, x_testing_augmented, w_opt_training)
    return loss_tr, loss_te

We perform cross validation in order to find the best parameters degree, lamdba and gamma caracterizing logistic regression

In [None]:
print(tX_tilda_0)

In [None]:
degrees = np.arange(2, 6) # create array of degrees
lambdas = np.logspace(-2, 0, 5)#create array of lambdas
gamma = 3.0e-02 # learning rate
#Why 3 and not 5?
k_fold = 3 # number of subsets to split the training set into
seed = 1 
training_loss = np.zeros((len(lambdas), len(degrees)))# initial 2-d array for the grid search or the lambdas and the degrees
testing_loss = np.zeros((len(lambdas), len(degrees)))# initial 2-d array for the grid search or the lambdas and the degrees
k_indices = build_k_indices(y_0, k_fold, seed)#create the subarrays for the cross_validation
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
        train_loss = 0 # initialize the training loss for each repetition
        test_loss = 0 # initialize the test loss for each repetition
        #run the cross validation for each possible split into test-train
        for k in range(k_fold):
            loss_tr, loss_te = cross_validation_logistic(y_0, tX_tilda_0, k_indices, k,
                                                lambdas[index1], degrees[index2], gamma)
        train_loss += loss_tr # increase the training loss for this execution
        test_loss += loss_te # increase the test loss for this execution
        training_loss[index1, index2] = train_loss / k_fold # save the average of the training loss
        testing_loss[index1, index2] = test_loss / k_fold # save the average of the testing loss
best_result = np.where(testing_loss == np.amin(testing_loss)) # get the optimal index for the hyper parameters
# get and print the optimal values
lambda_opt, degree_opt = lambdas[best_result[0]], degrees[best_result[1]]
print(lambda_opt, degree_opt)

In [None]:
print(np.logspace(-2,0,5))

In [None]:
degrees = np.arange(2, 6)
lambdas = np.logspace(-5, 0, 5)
gamma = 3.0e-02
k_fold = 3
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees)))
testing_loss = np.zeros((len(lambdas), len(degrees)))
k_indices = build_k_indices(y_1, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
            train_loss = 0
            test_loss = 0
            for k in range(k_fold):
                loss_tr, loss_te = cross_validation_logistic(y_1, tX_tilda_1, k_indices, k,
                                                lambdas[index1], degrees[index2], gamma)
                train_loss += loss_tr
                test_loss += loss_te
            training_loss[index1, index2] = train_loss / k_fold
            testing_loss[index1, index2] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
lambda_opt, degree_opt = lambdas[best_result[0]], degrees[best_result[1]]
print(lambda_opt, degree_opt)

In [None]:
degrees = np.arange(2, 6)
lambdas = np.logspace(-5, 0, 5)
gamma = 3.0e-02
k_fold = 3
seed = 1
training_loss = np.zeros((len(lambdas), len(degrees)))
testing_loss = np.zeros((len(lambdas), len(degrees)))
k_indices = build_k_indices(y_2_3, k_fold, seed)
for index1 in range(len(lambdas)):
    for index2 in range(len(degrees)):
            train_loss = 0
            test_loss = 0
            for k in range(k_fold):
                loss_tr, loss_te = cross_validation_logistic(y_2_3, tX_tilda_2_3, k_indices, k,
                                                lambdas[index1], degrees[index2], gamma)
                train_loss += loss_tr
                test_loss += loss_te
            training_loss[index1, index2] = train_loss / k_fold
            testing_loss[index1, index2] = test_loss / k_fold
best_result = np.where(testing_loss == np.amin(testing_loss))
lambda_opt, degree_opt = lambdas[best_result[0]],degrees[best_result[1]]
print(lambda_opt, degree_opt)

In [None]:
#Augment the feauture vector and calculate the optimal weights for logistic regression
# for Pjet_In 0
tX_tilda_0_augmented = build_poly(tX_tilda_0, degree = 2)
_, w_logistic_0 = learning_by_penalized_gradient(y_0, tX_tilda_0_augmented, np.zeros(tX_tilda_0_augmented.shape[1]), 3.0e-02,
                                              1000, lambda_= 1)

In [None]:
# for Pjet_In 1
tX_tilda_1_augmented = build_poly(tX_tilda_1, degree = 2)
_, w_logistic_1 = learning_by_penalized_gradient(y_1, tX_tilda_1_augmented, np.zeros(tX_tilda_1_augmented.shape[1]), 3.0e-02,
                                              1000, lambda_= 1)

In [None]:
# for Pjet_In 2_3
tX_tilda_2_3_augmented = build_poly(tX_tilda_2_3, degree = 2)
_, w_logistic_2_3 = learning_by_penalized_gradient(y_2_3, tX_tilda_2_3_augmented, np.zeros(tX_tilda_2_3_augmented.shape[1]),
                                                3.0e-02, 1000, lambda_= 1)

### Generate predictions and save ouput in csv format for submission:

In [None]:
# open the test file
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
print(tX_test.shape)

We will now format the tX_test as we did for tX_train

### we split the test into the three subgroups

In [None]:
# create index lists of elemets with 0,1, 2_3 Pjet_In value
zero_indices = []
one_indices = []
two_three_indices = []
zero_indices = np.where(tX_test[:,22]==0)[0]
one_indices = np.where(tX_test[:,22]==1)[0]
two_three_indices = np.where(np.logical_or(tX_test[:,22]==2, tX_test[:,22]==3))[0]
#Use indices to seperate the testing samples into the respective arrays
tX_test_0 = tX_test[zero_indices, :]
tX_test_0 = np.delete(tX_test_0, 22, axis=1)
tX_test_1 = tX_test[one_indices, :]
tX_test_1 = np.delete(tX_test_1, 22, axis=1)
tX_test_2_3 = tX_test[two_three_indices, :]

### Adding a column of zeros and ones to detect whether the mass has been measured or not
This should be done prior to splitting it is the same procedure and just wastes space 

In [None]:
# take the indices where the mass is not calculated, add the column which has 0 in those indices
# and 1 everywhere else for all matrices 0,1,2_3
zero_indices_0 = np.where(tX_test_0[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_0 else 1 for i in range(tX_test_0.shape[0])])
tX_test_0 = np.insert(tX_test_0, 0, column_to_add, axis=1)
zero_indices_1 = np.where(tX_test_1[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_1 else 1 for i in range(tX_test_1.shape[0])])
tX_test_1 = np.insert(tX_test_1, 0, column_to_add, axis=1)
zero_indices_2_3 = np.where(tX_test_2_3[:,1] == -999.)[0]
column_to_add = np.array([0 if i in zero_indices_2_3 else 1 for i in range(tX_test_2_3.shape[0])])
tX_test_2_3 = np.insert(tX_test_2_3, 0, column_to_add, axis=1)

### We drop the same columns we have dropped for the X training

In [None]:
tX_test_0 = np.delete(tX_test_0, col_to_delete_0, axis=1)
tX_test_1 = np.delete(tX_test_1, col_to_delete_1, axis=1)

### Now we substitute the -999 values with the median
This should also be done with a function it is the same thing repeated thrice

In [None]:
for i in range(1, tX_test_2_3.shape[1]):
    index_column_non_valid =np.where(tX_test_2_3[:,i] == -999.)[0] # calculate invalid indices 
    index_column_valid =np.where(tX_test_2_3[:,i] != -999.)[0] # calculate valid indices
    median = np.median(tX_test_2_3[index_column_valid, i], axis = 0) # calculate the median
    tX_test_2_3[index_column_non_valid,i] =  median # substitute the invalid values with the median

In [None]:
for i in range(1, tX_test_1.shape[1]):
    index_column_non_valid =np.where(tX_test_1[:,i] == -999.)[0] # calculate invalid indices
    index_column_valid =np.where(tX_test_1[:,i] != -999.)[0] # calculate valid indices
    median = np.median(tX_test_1[index_column_valid, i], axis = 0) # calculate the median
    tX_test_1[index_column_non_valid,i] =  median # substitute the invalid values with the median

In [None]:
for i in range(1, tX_test_0.shape[1]):
    index_column_non_valid =np.where(tX_test_0[:,i] == -999.)[0] # calculate invalid indices
    index_column_valid =np.where(tX_test_0[:,i] != -999.)[0] # calculate valid indices
    median = np.median(tX_test_0[index_column_valid, i], axis = 0) # calculate the median
    tX_test_0[index_column_non_valid,i] =  median # substitute the invalid values with the median

### We standardize the test set using the mean and the standard deviation of the training

In [None]:
print(tX_test_0.shape)

In [None]:
print(tX_0.shape)

In [None]:
# standardize the data in the test set
# should have used the same function both here and on the training part same process this is reduntant
def standardize_test(x, mean, std):
    """Standardize the test set."""
    x = x - mean 
    x = x / std
    return x

In [None]:
tX_test_0[:,1:] = standardize_test(tX_test_0[:,1:], mean_0, std_0)  #we standardize everything a part from the column added manually
tX_test_1[:,1:] = standardize_test(tX_test_1[:,1:], mean_1, std_1)  #we standardize everything a part from the column added manually
tX_test_2_3[:,1:]= standardize_test(tX_test_2_3[:,1:], mean_2_3, std_2_3) #we standardize everything a part from the column added manually

### We insert the column for the bias term

In [None]:
tX_tilda_test_0 = np.insert(tX_test_0, 0, np.ones(tX_test_0.shape[0]), axis=1) #the first column now is all ones and is used for bias
tX_tilda_test_1 = np.insert(tX_test_1, 0, np.ones(tX_test_1.shape[0]), axis=1) #the first column now is all ones and is used for bias
tX_tilda_test_2_3 = np.insert(tX_test_2_3, 0, np.ones(tX_test_2_3.shape[0]), axis=1) #the first column now is all ones and is used for bias

### We make the predictions

In [None]:
#since we trained the model in augmented data, we augment the test set
tX_tilda_test_2_3_augmented = build_poly(tX_tilda_test_2_3, degree=6)
# make the predictions with the augmented test set and ridge resgression
predictions_ridge_2_3 = tX_tilda_test_2_3_augmented @ w_ridge_2_3
# print(predictions_2_3.shape)

In [None]:
#since we trained the model in augmented data, we augment the test set
tX_tilda_test_0_augmented = build_poly(tX_tilda_test_0, degree = 6)
# make the predictions with the augmented test set and ridge regression
predictions_ridge_0 = tX_tilda_test_0_augmented @ w_ridge_0
# print(predictions_0.shape)

In [None]:
#since we trained the model in augmented data, we augment the test set
tX_tilda_test_1_augmented = build_poly(tX_tilda_test_1, degree = 6)
# make the predictions with the augmented test set and ridge regression
predictions_ridge_1 = tX_tilda_test_1_augmented @ w_ridge_1
# print(predictions_1.shape)

In [None]:
print(len(zero_indices))
print(len(one_indices))
print(len(two_three_indices))
print()

In [None]:
### Predictions with logistic regression

In [None]:
# make the predictions with the augmented test set
#since we trained the model in augmented data, we augment the test set
tX_tilda_test_2_3_augmented = build_poly(tX_tilda_test_2_3, degree = 2)
# make the predictions with the augmented test set and logistic regression
predictions_logistic_2_3 = sigmoid(tX_tilda_test_2_3_augmented @ w_logistic_2_3)
# print(predictions_2_3.shape)

In [None]:
#since we trained the model in augmented data, we augment the test set
tX_tilda_test_1_augmented = build_poly(tX_tilda_test_1, degree = 2)
# make the predictions with the augmented test set and logistic regression
predictions_logistic_1 = sigmoid(tX_tilda_test_1_augmented @ w_logistic_1)
# print(predictions_1.shape)

In [None]:
#since we trained the model in augmented data, we augment the test set
tX_tilda_test_0_augmented = build_poly(tX_tilda_test_0, degree = 2)
# make the predictions with the augmented test set and logistic regression
predictions_logistic_0 = sigmoid(tX_tilda_test_0_augmented @ w_logistic_0)
# print(predictions_0.shape)

Now we have to reconstruct a single vector of predictions

In [None]:
#jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [None]:
# create a list with all of the predictions stacked from the 3 case from 0,1,2_3
# also the counts seem to be reduntant if so please remove them
# it should also be done into a function with the shell below to be called for all models arbitrarily
stacked_predictions = []
count_0 = 0
count_1 = 0
count_2_3 = 0
for index_row in range(tX_test.shape[0]):
    if index_row in zero_indices:
        stacked_predictions.append(predictions_logistic_0[count_0])
        count_0 = count_0 + 1
    elif index_row in one_indices:
        stacked_predictions.append(predictions_logistic_1[count_1])
        count_1 = count_1 +1
    else:
        stacked_predictions.append(predictions_logistic_2_3[count_2_3])
        count_2_3 = count_2_3 + 1

In [None]:
# apply a decision boundary to generate -1 or 1 ou of the results of the model
# transform the list to np array
# parameter 0.5 needs to be finetuned for optimal performance
final_predictions = np.array([-1 if el < 0.5 else 1 for el in stacked_predictions])

In [None]:
print(final_predictions)

In [None]:
# this does not seem to be used and should be removed in such a case
def predict_labels(weights, tX_test):
    y = np.array(tX_test) @ np.array(weights)
    labels = [1 if l > 0 else -1 for l in y]
    return labels

In [None]:
# I changed the output path so that all the data will be in the same file
OUTPUT_PATH = '../data/submission.csv' # name towards the destination file to be written 
#y_pred = predict_labels(weights, tX_test)
y_pred = final_predictions # seems reduntant 
create_csv_submission(ids_test, y_pred, OUTPUT_PATH) # print csv file according to results