In [14]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import math
import random
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Our libraries
from proj1_helpers import *
from proj1_input_man import *
from proj1_linear_model import *
from proj1_ridge_regress import *
from proj1_logistic import *

## Load the training data into feature matrix, class labels, and event ids:

In [16]:
# Relative path to the training CSV.
DATA_TRAIN_PATH = '../data/train.csv'
# The project loader returns three values, unpacked here as class labels,
# feature matrix and event ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [17]:
# Sanity check: dimensions of the raw training feature matrix.
print(tX.shape)

(250000, 30)


### Changing the labels to {0,1}

In [18]:
# Remap the class labels from {-1, 1} to {0, 1}:
#   y_ == 0 -> no Boson detected, y_ == 1 -> Boson detected.
# Vectorised with np.where instead of a Python-level loop over every label;
# exactly as before, any label that is not -1 is mapped to 1.
y_ = np.where(np.asarray(y) == -1, 0, 1)

# All of the following parts are the same for the test set, should be functions

# less code = better code (in this case)

### Dividing the features by the number of jets

In [19]:
# Partition the training features into three groups, named after jet number 0, 1 and 2-3.
tX_0,tX_1,tX_2_3 = split_to_Jet_Num(tX)

### Dividing the labels by the number of jets as well

In [20]:
# Split the {0,1} labels into the same three groups as the features.
# The unsplit feature matrix is passed alongside the labels
# (presumably so the helper can read each row's jet number — TODO confirm).
y_0,y_1,y_2_3 = split_labels_to_Jet_Num(y_,tX)

### Adding a column of zeros and ones to detect whether the mass has been measured or not

In [21]:
# For every jet group, let find_mass add the indicator column described in the
# heading above: 0 at the rows where the mass was not calculated, 1 elsewhere.
tX_0, tX_1, tX_2_3 = (find_mass(group) for group in (tX_0, tX_1, tX_2_3))

### Throwing away the outliers from the training data

In [22]:
# Run fix_array on each training group: groups 0 and 1 pass their group index
# as the second argument, group 2-3 relies on the helper's default.
# The shape of each result is printed to check what is left after the step.
tX_0 = fix_array(tX_0,0)
print(tX_0.shape)
tX_1 = fix_array(tX_1,1)
print(tX_1.shape)
tX_2_3 = fix_array(tX_2_3)
print(tX_2_3.shape)

[5, 6, 7, 13, 23, 24, 25, 26, 27, 28, 29]
(99913, 19)
(99913, 19)
[5, 6, 7, 13, 26, 27, 28]
(77544, 23)
(77544, 23)
(72543, 31)
(72543, 31)


### Now we substitute the -999 values with the median

In [23]:
# Replace the -999 placeholder values in every training group via fix_mean.
tX_0, tX_1, tX_2_3 = (fix_mean(group) for group in (tX_0, tX_1, tX_2_3))

### Now we standardize the data

In [24]:
# Standardize every column except the first one (the manually added indicator
# column), writing the result back in place. The mean and standard deviation of
# each group are kept so the test set can be scaled with the training statistics.
tX_2_3[:,1:], mean_2_3,std_2_3 = standardize(tX_2_3[:,1:])
tX_0[:,1:],mean_0,std_0 = standardize(tX_0[:,1:])
tX_1[:,1:],mean_1,std_1 = standardize(tX_1[:,1:])

### We insert the column for the bias term

In [25]:
# Prepend a column of ones (the bias term) to each training group.
tX_tilda_0, tX_tilda_1, tX_tilda_2_3 = (
    np.insert(group, 0, np.ones(group.shape[0]), axis=1)
    for group in (tX_0, tX_1, tX_2_3)
)

## Do your crazy machine learning thing here :) ...

## The following cells of code aim at finetuning the hyperparameters of the code

### Calculate the optimal degree for gradient descent

In [27]:
# Search for the best degree for gradient descent, once per jet group.
# NOTE: this is slow; the stored output of this cell ends in a KeyboardInterrupt,
# so the degree_opt_*_GD names may not all have been assigned in that run.
degree_opt_0_GD = finetune_GD(tX_tilda_0,y_0)
degree_opt_1_GD = finetune_GD(tX_tilda_1,y_1)
degree_opt_2_3_GD = finetune_GD(tX_tilda_2_3,y_2_3)

[ 0.61847606  0.06020022  0.006644   ...  0.5805158   0.28062325
 -0.01899064]
[1 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]
[ 0.15864768 -0.03560335  0.20053426 ...  0.608775   -0.01836511
  0.03172052]
[0 0 0 ... 1 0 0]
[1 0 0 ... 1 0 0]
[ 0.26937014 -0.00129739  0.03183365 ...  0.67090778  0.36748447
  0.81767221]
[0 0 0 ... 1 0 1]
[0 0 0 ... 1 0 1]
[0.11828094 0.27527221 0.22687442 ... 0.43692667 0.2788262  0.20996465]
[0 0 0 ... 0 0 0]
[0 0 0 ... 1 1 1]
[ 0.82239773 -0.01319973  0.0410209  ...  0.55932944  0.23118963
 -0.00646542]
[1 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]
[ 0.47410793 -0.00442422  0.19485567 ...  0.63209243  0.13035331
  0.03630025]
[0 0 0 ... 1 0 0]
[1 0 0 ... 1 0 0]
[-0.16364379  0.00518823  0.01600962 ...  0.83116894  0.39815189
  0.8920483 ]
[0 0 0 ... 1 0 1]
[0 0 0 ... 1 0 1]


KeyboardInterrupt: 

In [None]:
# The gradient-descent tuning cell above takes a long time to execute; the hyperparameters it found are:
# degree = 2 for tX_tilda_0, with an accuracy of 0.82694771 on the validation set
# degree = 3 for tX_tilda_1, with an accuracy of 0.7937687 on the validation set
# degree = 3 for tX_tilda_2_3, with an accuracy of 0.81856907 on the validation set

### Calculate the optimal lambda for ridge regression and the optimal degree for feature augmentation

In [None]:
# Tune ridge regression per jet group; each call returns a (lambda, degree) pair.
lambda_opt_0,degree_opt_0 = finetune_ridge(tX_tilda_0,y_0)
lambda_opt_1,degree_opt_1 = finetune_ridge(tX_tilda_1,y_1)
lambda_opt_2_3,degree_opt_2_3 = finetune_ridge(tX_tilda_2_3,y_2_3)

### Get the optimal weights with the calculated hyper parameters 

In [None]:
# Fit the ridge weights of every jet group with the degree and lambda
# currently stored in degree_opt_* / lambda_opt_*.
w_ridge_0, w_ridge_1, w_ridge_2_3 = (
    optimal_weights_ridge(features, labels, degree, lambda_)
    for features, labels, degree, lambda_ in (
        (tX_tilda_0, y_0, degree_opt_0, lambda_opt_0),
        (tX_tilda_1, y_1, degree_opt_1, lambda_opt_1),
        (tX_tilda_2_3, y_2_3, degree_opt_2_3, lambda_opt_2_3),
    )
)

### Calculate the optimal lambda for logistic regression and the optimal degree for feature augmentation

In [None]:
# Tune logistic regression per jet group; each call returns a (lambda, degree) pair.
# NOTE: these assignments reuse the lambda_opt_* / degree_opt_* names filled in by
# the ridge tuning cell, so from here on those names hold the logistic values and
# the ridge values are no longer available.
lambda_opt_0,degree_opt_0 = finetune_logistic(tX_tilda_0,y_0)
lambda_opt_1,degree_opt_1 = finetune_logistic(tX_tilda_1,y_1)
lambda_opt_2_3,degree_opt_2_3 = finetune_logistic(tX_tilda_2_3,y_2_3)

### Get the optimal weights with the calculated hyper parameters 

In [None]:
# Fit the logistic-regression weights of every jet group with the degree and
# lambda currently stored in degree_opt_* / lambda_opt_*.
w_logistic_0, w_logistic_1, w_logistic_2_3 = (
    optimal_weights_logistic(features, labels, degree, lambda_)
    for features, labels, degree, lambda_ in (
        (tX_tilda_0, y_0, degree_opt_0, lambda_opt_0),
        (tX_tilda_1, y_1, degree_opt_1, lambda_opt_1),
        (tX_tilda_2_3, y_2_3, degree_opt_2_3, lambda_opt_2_3),
    )
)

### Generate predictions and save output in csv format for submission:

In [None]:
# Load the test file.
# Relative path to the test CSV.
DATA_TEST_PATH = '../data/test.csv'
# The first returned value (the labels slot) is discarded; only the feature
# matrix and the event ids are kept for the test set.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Sanity check: dimensions of the raw test feature matrix.
print(tX_test.shape)

We will now format the tX_test as we did for tX_train

### we split the test into the three subgroups

In [None]:
# Partition the test features with the same helper used for the training set.
tX_test_0,tX_test_1,tX_test_2_3 = split_to_Jet_Num(tX_test)

### Adding a column of zeros and ones to detect whether the mass has been measured or not
This should be done before splitting: it is the same procedure for every group, and repeating it just wastes space.

In [None]:
# For every test group, let find_mass add the indicator column described in the
# heading above: 0 at the rows where the mass was not calculated, 1 elsewhere.
tX_test_0, tX_test_1, tX_test_2_3 = (
    find_mass(group) for group in (tX_test_0, tX_test_1, tX_test_2_3)
)

### We drop the same columns we have dropped for the X training

In [None]:
# Run fix_array on every test group with the same arguments that were used for
# the corresponding training group, so both sets go through identical steps.
# Group 2-3 was previously skipped here although the training pipeline
# processes it (with the helper's default second argument).
tX_test_0 = fix_array(tX_test_0,0)
print(tX_test_0.shape)
tX_test_1 = fix_array(tX_test_1,1)
print(tX_test_1.shape)
tX_test_2_3 = fix_array(tX_test_2_3)
print(tX_test_2_3.shape)

### Now we substitute the -999 values with the median
This should also be done with a function: it is the same thing repeated three times.

In [None]:
# Replace the -999 placeholder values in the TEST groups.
# This cell used to call fix_mean on the training matrices (tX_0, tX_1, tX_2_3)
# again, which left the -999 values in the test set untouched.
tX_test_0 = fix_mean(tX_test_0)
tX_test_1 = fix_mean(tX_test_1)
tX_test_2_3 = fix_mean(tX_test_2_3)

### We standardize the test set using the mean and the standard deviation of the training

In [None]:
# Shape of the group-0 test matrix, to compare against the training one.
print(tX_test_0.shape)

In [None]:
# Shape of the group-0 training matrix, for comparison with the test one.
print(tX_0.shape)

In [None]:
# Standardize the test set with statistics computed elsewhere.
# Ideally one shared function would serve both the training and the test part,
# since the process is the same and this definition is redundant.
def standardize_test(x, mean, std):
    """Return ``x`` shifted by ``mean`` and scaled by ``std``.

    Args:
        x: values to standardize.
        mean: value subtracted from ``x``.
        std: value the centred data is divided by.

    Returns:
        ``(x - mean) / std``; the input ``x`` itself is not modified.
    """
    centred = x - mean
    return centred / std

In [None]:
# Scale each test group with the mean and standard deviation kept from the
# matching training group. Column 0 (the manually added indicator column) is
# left out of the standardization.
tX_test_0[:,1:] = standardize_test(tX_test_0[:,1:], mean_0, std_0)
tX_test_1[:,1:] = standardize_test(tX_test_1[:,1:], mean_1, std_1)
tX_test_2_3[:,1:]= standardize_test(tX_test_2_3[:,1:], mean_2_3, std_2_3)

### We insert the column for the bias term

In [None]:
# Prepend a column of ones to each test group; this first column is the bias term.
tX_tilda_test_0, tX_tilda_test_1, tX_tilda_test_2_3 = (
    np.insert(group, 0, np.ones(group.shape[0]), axis=1)
    for group in (tX_test_0, tX_test_1, tX_test_2_3)
)

### We make the predictions

In [None]:
# Ridge predictions for each jet group of the test set.
# NOTE(review): the third argument is hard-coded to 6, while w_ridge_* were fitted
# with degree_opt_* from the ridge tuning. Those names are reassigned by the
# logistic tuning cell before this point, so they cannot simply be reused here;
# confirm that 6 matches the degree used to fit w_ridge_*.
predictions_ridge_0 = predict_ridge(tX_tilda_test_0,w_ridge_0,6)
predictions_ridge_1 = predict_ridge(tX_tilda_test_1,w_ridge_1,6)
predictions_ridge_2_3 = predict_ridge(tX_tilda_test_2_3,w_ridge_2_3,6)

### Predictions with logistic regression

In [None]:
# Logistic-regression predictions for each jet group of the test set.
# The weights w_logistic_* were fitted with degree_opt_* (as set by
# finetune_logistic), so the same per-group values are passed here instead of a
# hard-coded 2 that only works when the tuning happens to return 2.
# Assumes the third argument of predict_logistic is that degree — TODO confirm.
predictions_logistic_0 = predict_logistic(tX_tilda_test_0,w_logistic_0,degree_opt_0)
predictions_logistic_1 = predict_logistic(tX_tilda_test_1,w_logistic_1,degree_opt_1)
predictions_logistic_2_3 = predict_logistic(tX_tilda_test_2_3,w_logistic_2_3,degree_opt_2_3)

Now we have to reconstruct a single vector of predictions

### Generate final prediction list and print it in the file

In [None]:
# Merge the three per-group ridge predictions into a single vector; the unsplit
# test features are passed along (presumably to restore the original row order — TODO confirm).
final_predictions_ridge = create_output(tX_test,predictions_ridge_0,predictions_ridge_1,predictions_ridge_2_3)

In [None]:
# Merge the three per-group logistic predictions into a single vector; the unsplit
# test features are passed along (presumably to restore the original row order — TODO confirm).
final_predictions_logistic = create_output(tX_test,predictions_logistic_0,predictions_logistic_1,predictions_logistic_2_3)

In [None]:
# Quick visual check of the merged ridge predictions.
print(final_predictions_ridge)

In [None]:
# this does not seem to be used and should be removed in such a case
#def predict_labels(weights, tX_test):
#    y = np.array(tX_test) @ np.array(weights)
#    labels = [1 if l > 0 else -1 for l in y]
#    return labels

In [None]:
# Destination files for the two submissions.
OUTPUT_PATH_RIDGE = '../data/submission_ridge.csv' # ridge regression output
OUTPUT_PATH_LOGISTIC = '../data/submission_logistic.csv' # logistic regression output
# Write one CSV per model from the test ids and the merged predictions.
# (An earlier draft went through predict_labels / an intermediate y_pred; both are redundant now.)
create_csv_submission(ids_test, final_predictions_ridge, OUTPUT_PATH_RIDGE)
create_csv_submission(ids_test, final_predictions_logistic, OUTPUT_PATH_LOGISTIC)