In [73]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import math
import random
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
# Our libraries
from proj1_helpers import *
from proj1_input_man import *
from proj1_linear_model import *
from proj1_ridge_regress import *
from proj1_logistic import *

## Load the training data into feature matrix, class labels, and event ids:

In [27]:
DATA_TRAIN_PATH = '../data/train.csv' # train data path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [28]:
print(tX.shape)

(250000, 30)


### Changing the labels to {0,1}

In [29]:
# change the array from -1 1 to 0 1
# label simplification
# y == 0 non detected Boson, y == 1 detected Boson
y_ = np.array([0 if l == -1 else 1 for l in y])

# All of the following parts are the same for the test set, should be functions

# less code = better code (in this case)

### Dividing the features by the number of jets

In [6]:
tX_0, tX_1, tX_2_3 = split_to_Jet_Num(tX)

In [30]:
tX_0, tX_1, tX_2, tX_3 = alternative_split_to_Jet_Num(tX)

In [8]:
for i in range(tX_1.shape[1]):
    print(np.std(tX_3[:,i]))

288.1303027432567
38.07417940661156
48.78977837574543
83.81996788992959
1.4698926707450346
317.2726868423743
2.8672697710964288
0.8272411967910458
32.29848326337556
150.13499581029197
1.2471331404825152
0.9433373230483033
0.3876270487619129
32.43010841851103
1.2047095881208543
1.8241134002031094
32.96582686964663
1.2176561938554824
1.8180600465624461
49.697288657467055
1.8123415757149903
158.8254499542401
76.3120342898988
1.5732280712204414
1.8067332213365404
40.12542350677477
1.775313755049286
1.8147406038684843
128.45133643867


### Dividing also the output by the type of particle

In [9]:
y_0,y_1,y_2_3 = split_labels_to_Jet_Num(y_,tX)

In [31]:
y_0, y_1, y_2, y_3 = alternative_split_labels_to_Jet_Num(y_, tX)

### Adding a column of zeros and ones to detect whether the mass has been measured or not

In [32]:
# take the indices where the mass is not calculated, add the column which has 0 in those indices
# and 1 everywhere else for all matrices 0,1,2_3
tX_0 = find_mass(tX_0)
tX_1 = find_mass(tX_1)
# tX_2_3 = find_mass(tX_2_3)
tX_2 = find_mass(tX_2)
tX_3 = find_mass(tX_3)

### Throwing away the outliers from the training data

In [33]:
tX_0, col_to_delete_0 = fix_array(tX_0, 0)
print(tX_0.shape)
tX_1, col_to_delete_1 = fix_array(tX_1, 1)
print(tX_1.shape)
tX_2, _ = fix_array(tX_2, 2)
tX_3, _ = fix_array(tX_3, 3)
print(tX_2)
print(tX_3)
# print(tX_2_3.shape)

[5, 6, 7, 13, 23, 24, 25, 26, 27, 28, 29]
(99913, 19)
[5, 6, 7, 13, 26, 27, 28]
(77544, 23)
[]
[]
[[ 1.00000e+00  1.38470e+02  5.16550e+01 ...  1.24000e+00 -2.47500e+00
   1.13497e+02]
 [ 1.00000e+00  1.48754e+02  2.88620e+01 ...  1.31000e-01 -2.76700e+00
   1.79877e+02]
 [ 1.00000e+00  1.41481e+02  7.36000e-01 ... -7.98000e-01 -2.78500e+00
   2.78009e+02]
 ...
 [ 1.00000e+00  1.19934e+02  2.00780e+01 ... -1.72500e+00 -2.75600e+00
   1.12938e+02]
 [ 1.00000e+00  1.26151e+02  2.90230e+01 ... -5.99000e-01 -2.52500e+00
   1.93099e+02]
 [ 1.00000e+00  1.15254e+02  4.71560e+01 ... -5.80000e-02 -1.13700e+00
   1.74176e+02]]
[[ 1.00000e+00  8.97440e+01  1.35500e+01 ...  2.24000e-01  3.10600e+00
   1.93660e+02]
 [ 1.00000e+00  1.14744e+02  1.02860e+01 ...  1.77300e+00 -2.07900e+00
   1.65640e+02]
 [ 1.00000e+00  1.21681e+02  6.04100e+00 ... -1.25700e+00 -6.09000e-01
   2.53461e+02]
 ...
 [ 1.00000e+00 -9.99000e+02  8.38710e+01 ...  3.07000e+00  1.61200e+00
   2.71833e+02]
 [ 1.00000e+00 -9.990

### Now we substitute the -999 values with the median

In [34]:
tX_0, column_median_0 = fix_median(tX_0)
tX_1, column_median_1 = fix_median(tX_1)
# tX_2_3 = fix_median(tX_2_3)
tX_2, column_median_2 = fix_median(tX_2)
tX_3, column_median_3 = fix_median(tX_3)

### Now we standardize the data

In [35]:
# tX_2_3[:,1:], mean_2_3, std_2_3 = standardize(tX_2_3[:,1:])
tX_3[:,1:], mean_3, std_3 = standardize(tX_3[:,1:])
tX_2[:,1:], mean_2, std_2 = standardize(tX_2[:,1:]) #we standardize everything a part from the column added manually
tX_0[:,1:], mean_0, std_0 = standardize(tX_0[:,1:])
tX_1[:,1:], mean_1, std_1 = standardize(tX_1[:,1:])

### We insert the column for the bias term

In [36]:
tX_tilda_0 = np.insert(tX_0, 0, np.ones(tX_0.shape[0]), axis=1)
tX_tilda_1 = np.insert(tX_1, 0, np.ones(tX_1.shape[0]), axis=1)
# tX_tilda_2_3 = np.insert(tX_2_3, 0, np.ones(tX_2_3.shape[0]), axis=1)
tX_tilda_2 = np.insert(tX_2, 0, np.ones(tX_2.shape[0]), axis=1)
tX_tilda_3 = np.insert(tX_3, 0, np.ones(tX_3.shape[0]), axis=1)

## Do your thing crazy machine learning thing here :) ...

## The following cells of code aim at finetuning the hyperparameters of the code

### Calculate the optimal degree for gradient descent

In [16]:
degree_opt_0_GD = finetune_GD(tX_tilda_0, y_0)
degree_opt_1_GD = finetune_GD(tX_tilda_1, y_1)
degree_opt_2_3_GD = finetune_GD(tX_tilda_2_3, y_2_3)

KeyboardInterrupt: 

In [None]:
degree_opt_1_GD = finetune_GD(tX_tilda_1, y_1)

In [None]:
# This cell takes a long time to execute, the hyperparameters found are:
# degree =2 for tX_tilda_0 with an accuracy of 0.82694771 on the validation set
# degree =3 for tX_tilda_1 with an accuracy of 0.7937687 on the validation set
# degree =3 for tX_tilda_2_3 with an accuracy of 0.81856907 on the validation set

In [None]:
w_GD_0 = optimal_weights_GD(tX_tilda_0,y_0,degree_opt_0_GD,lambda_opt_0_GD)
w_GD_1 = optimal_weights_GD(tX_tilda_1,y_1,degree_opt_1_GD,lambda_opt_1_GD)
w_GD_2_3 = optimal_weights_GD(tX_tilda_2_3,y_2_3,degree_opt_2_3_GD,lambda_opt_2_3_GD)

### Calculate the optimal degree for stochastic gradient descent for feature augmentation

In [None]:
degree_opt_0_SGD = finetune_SGD(tX_tilda_0, y_0)
degree_opt_1_SGD = finetune_SGD(tX_tilda_1, y_1)
degree_opt_2_SGD = finetune_SGD(tX_tilda_2, y_2)
degree_opt_3_SGD = finetune_SGD(tX_tilda_3, y_3)

### Calculate the optimal lambda for ridge regression and the optimal degree for feature augmentation

In [17]:
lambda_opt_0_ridge, degree_opt_0_ridge = finetune_ridge(tX_tilda_0, y_0, lambdas = random_interval(0.001, 0.1, 10))
lambda_opt_1_ridge, degree_opt_1_ridge = finetune_ridge(tX_tilda_1, y_1, lambdas = random_interval(0.001, 0.1, 10))
lambda_opt_2_ridge, degree_opt_2_ridge = finetune_ridge(tX_tilda_2, y_2, lambdas = random_interval(10e-6, 10e-3, 10))
lambda_opt_3_ridge, degree_opt_3_ridge = finetune_ridge(tX_tilda_3, y_3, lambdas = random_interval(10e-6, 10e-3, 10))

[[0.82453208 0.82971675 0.83003703 0.83069763 0.83225903]
 [0.82674407 0.83309979 0.83450105 0.83571214 0.83584226]
 [0.8267741  0.83317986 0.83463117 0.83598238 0.83606246]
 [0.82385147 0.82861575 0.82860575 0.82975678 0.831108  ]
 [0.82386148 0.8285557  0.82854569 0.82957662 0.83100791]
 [0.82619357 0.83258933 0.83333    0.83472125 0.83492143]
 [0.82753478 0.83435092 0.83613252 0.83751376 0.83768392]
 [0.82670403 0.83314983 0.83462116 0.83584226 0.83601241]
 [0.82399159 0.82879592 0.82897608 0.83000701 0.83127815]
 [0.82492243 0.8302272  0.83026724 0.83119808 0.83253929]]
[0.00774608] [6]
[[0.76926747 0.78622646 0.78794171 0.7898633  0.79083054]
 [0.77648955 0.79468661 0.79745938 0.79971628 0.80032241]
 [0.77695383 0.79518958 0.79802682 0.80030952 0.80060614]
 [0.76648182 0.78403405 0.78503998 0.78678102 0.78828991]
 [0.76632706 0.78407274 0.78492391 0.78672943 0.78823833]
 [0.7740521  0.79275213 0.79527986 0.797382   0.79789786]
 [0.7778179  0.79740779 0.79969048 0.80255352 0.803133

In [None]:
lambda_opt_1_ridge, degree_opt_1_ridge = finetune_ridge(tX_tilda_1, y_1, lambdas = random_interval(0.001, 0.1, 10))

In [None]:
# to check
print(lambda_opt_0_ridge, degree_opt_0_ridge)
print(lambda_opt_1_ridge, degree_opt_1_ridge)
# print(lambda_opt_2_3_ridge, degree_opt_2_3_ridge)
print(lambda_opt_2_ridge, degree_opt_2_ridge)
print(lambda_opt_3_ridge, degree_opt_3_ridge)

In [None]:
#Summarizing the best parameters found are
# lambda = 0.00316228 and degree=6 for tX_tilda_0 with a validation accuracy of 0.83847463
# [0.00558459] [6] gives accuracy = 0.83814433 for tX_tilda_0
# lambda = 0.00316228 and degree= 6 for tX_tilda_1 with a validation accuracy of 0.80354656
# lambda = 0.0047263 and degree = 6 for tX_tilda_3 gives 0.83759025 approximately
# lambda = 0.00546996 and degree = 5 for tX_tilda_2 gives 0.82747395
# lambda=1e-5 and degree=6 for tX_tilda_2_3 with a validation accuracy of 0.83021781

### Get the optimal weights with the calculated hyper parameters 

In [37]:
# w_ridge_0 = optimal_weights_ridge(tX_tilda_0, y_0, degree_opt_0_ridge, lambda_opt_0_ridge)
# w_ridge_1 = optimal_weights_ridge(tX_tilda_1, y_1, degree_opt_1_ridge, lambda_opt_1_ridge)
w_ridge_1 = optimal_weights_ridge(tX_tilda_1, y_1, 6, 0.00774608)
# w_ridge_2_3 = optimal_weights_ridge(tX_tilda_2_3, y_2_3, degree_opt_2_3_ridge, lambda_opt_2_3_ridge)
# w_ridge_2 = optimal_weights_ridge(tX_tilda_2, y_2, degree_opt_2_ridge, lambda_opt_2_ridge)
# w_ridge_2_3 = optimal_weights_ridge(tX_tilda_3, y_3, degree_opt_3_ridge, lambda_opt_3_ridge)

### Calculate the optimal lambda for logistic regression and the optimal degree for feature augmentation

In [38]:
lambda_opt_0_logistic, degree_opt_0_logistic = finetune_logistic(tX_tilda_0, y_0, gamma = 1.7783e-04, degrees =np.arange(1, 4) , lambdas=np.logspace(-2, 1, 7) )
lambda_opt_1_logistic, degree_opt_1_logistic = finetune_logistic(tX_tilda_1, y_1, gamma = 3.16228e-03, degrees = np.arange(1, 4), lambdas = np.logspace(-2, 1, 7) )
# lambda_opt_2_3_logistic,degree_opt_2_3_logistic = finetune_logistic(tX_tilda_2_3, y_2_3, gamma =0.00017783, degrees = np.arange(1, 3), lambdas = np.logspace(-2, 1, 7) )
lambda_opt_2_logistic, degree_opt_2_logistic = finetune_logistic(tX_tilda_2, y_2, gamma = 1.7783e-04, degrees = np.arange(1, 4), lambdas = np.logspace(-3, 1, 7))
lambda_opt_3_logistic, degree_opt_3_logistic = finetune_logistic(tX_tilda_3, y_3, gamma = 1.7783e-04, degrees = np.arange(1, 4), lambdas = np.logspace(-3, 1, 7))   

[10.] [2]
0.8266374409480343
[1.] [2]
0.79204064789023
[10.] [2]
0.8153088772431316
[0.001] [2]
0.8231366179390002


In [20]:
lambda_opt_3_logistic, degree_opt_3_logistic = finetune_logistic(tX_tilda_3, y_3, gamma = 1.7783e-04, degrees = np.arange(1, 4), lambdas = np.logspace(-3, 1, 7))   

[0.001] [2]
0.8231366179390002


In [None]:
lambda_opt_1_logistic,degree_opt_1_logistic = finetune_logistic(tX_tilda_1, y_1, gamma = 1.7783e-04, degrees = np.arange(1, 4), lambdas = random_interval(1.0e-07, 1.0e-02, 4))

In [None]:
#Summarizing the best parameters found are
# lambda=1.95096 and degree=2 for tX_tilda_0 with a validation accuracy of 0.827668 #lambda =9.64683289, acc=0.82664
# lambda=1.6672 and degree=2 for tX_tilda_1 with a validation accuracy of 0.79229 #lamda = 13.00571 acc = 0.792466 gamma = 0.0004
# lambda=0.6835 and degree=2 for tX_tilda_2_3 with a validation accuracy of 0.8151
# lambda = 10 and degree = 2 for tX_tilda_2 gives an accuracy 0.8153088 
# lambda = 0.001 and degree = 2 for tX_tilda_3 gives an accuracy 0.8231366 

### Get the optimal weights with the calculated hyper parameters 

In [61]:
gamma = 0.00017783
w_logistic_0 = optimal_weights_logistic(tX_tilda_0, y_0, gamma, degree_opt_0_logistic, lambda_opt_0_logistic)
w_logistic_1 = optimal_weights_logistic(tX_tilda_1, y_1, gamma, degree_opt_1_logistic, lambda_opt_1_logistic)
w_logistic_2 = optimal_weights_logistic(tX_tilda_2, y_2, gamma, degree_opt_2_logistic, lambda_opt_2_logistic)
w_logistic_3 = optimal_weights_logistic(tX_tilda_3, y_3, gamma, degree_opt_3_logistic, lambda_opt_3_logistic)

### calculate the optimal lambda for logistic regression with batch

In [None]:
lambda_opt_0_logistic_batch,degree_opt_0_logistic_batch = finetune_batch_logistic(tX_tilda_0, y_0, gamma = 1.7783e-04, degrees =np.arange(1, 4) , lambdas=random_interval(0.8, 1.5, 5) )

# Generate predictions and save ouput in csv format for submission:

In [42]:
# open the test file
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [43]:
print(tX_test.shape)

(568238, 30)


We will now format the tX_test as we did for tX_train

### we split the test into the three subgroups

In [44]:
tX_test_0, tX_test_1, tX_test_2, tX_test_3 = alternative_split_to_Jet_Num(tX_test)

### Adding a column of zeros and ones to detect whether the mass has been measured or not
This should be done prior to splitting it is the same procedure and just wastes space 

In [45]:
# take the indices where the mass is not calculated, add the column which has 0 in those indices
# and 1 everywhere else for all matrices 0,1,2_3
tX_test_0 = find_mass(tX_test_0)
tX_test_1 = find_mass(tX_test_1)
tX_test_2 = find_mass(tX_test_2)
tX_test_3 = find_mass(tX_test_3)

### We drop the same columns we have dropped for the X training

In [None]:
# tX_test_0 = fix_array(tX_test_0, 0)
# print(tX_test_0.shape)
# tX_test_1 = fix_array(tX_test_1, 1)
# print(tX_test_1.shape)

In [46]:
tX_test_0 = np.delete(tX_test_0, col_to_delete_0, axis=1)
tX_test_1 = np.delete(tX_test_1, col_to_delete_1, axis=1)

### Now we substitute the -999 values with the median
This should also be done with a function it is the same thing repeated thrice

In [47]:
tX_test_0 = fix_median_test(tX_test_0, column_median_0)
tX_test_1 = fix_median_test(tX_test_1, column_median_1)
tX_test_2 = fix_median_test(tX_test_2, column_median_2)
tX_test_3 = fix_median_test(tX_test_3, column_median_3)

In [63]:
print(tX_test_0.shape)
print(w_logistic_0.shape)

(227458, 19)
(39,)


### We standardize the test set using the mean and the standard deviation of the training

In [None]:
print(tX_test_0.shape)

In [None]:
print(tX_0.shape)

In [64]:
# standardize the data in the test set
# should have used the same function both here and on the training part same process this is reduntant
def standardize_test(x, mean, std):
    """Standardize the test set."""
    x = x - mean 
    x = x / std
    return x

In [65]:
tX_test_0[:,1:] = standardize_test(tX_test_0[:,1:], mean_0, std_0)  #we standardize everything a part from the column added manually
tX_test_1[:,1:] = standardize_test(tX_test_1[:,1:], mean_1, std_1)  #we standardize everything a part from the column added manually
tX_test_2[:,1:] = standardize_test(tX_test_2[:,1:], mean_2, std_2) #we standardize everything a part from the column added manually
tX_test_3[:,1:] = standardize_test(tX_test_3[:,1:], mean_3, std_3)

### We insert the column for the bias term

In [66]:
tX_tilda_test_0 = np.insert(tX_test_0, 0, np.ones(tX_test_0.shape[0]), axis=1) #the first column now is all ones and is used for bias
tX_tilda_test_1 = np.insert(tX_test_1, 0, np.ones(tX_test_1.shape[0]), axis=1) #the first column now is all ones and is used for bias
tX_tilda_test_2 = np.insert(tX_test_2, 0, np.ones(tX_test_2.shape[0]), axis=1) #the first column now is all ones and is used for bias
tX_tilda_test_3 = np.insert(tX_test_3, 0, np.ones(tX_test_3.shape[0]), axis=1)

### We make the predictions with GD

In [None]:
predictions_GD_0 = predict_GD(tX_tilda_test_0,w_GD_0,degree_opt_0_GD)
predictions_GD_1 = predict_GD(tX_tilda_test_1,w_GD_1,degree_opt_1_GD)
predictions_GD_2_3 = predict_GD(tX_tilda_test_2_3,w_GD_2_3,degree_opt_2_3_GD)

### We make the predictions with ridge regression

In [67]:
# predictions_ridge_0 = predict_ridge(tX_tilda_test_0, w_ridge_0,degree_opt_0_ridge)
# predictions_ridge_1 = predict_ridge(tX_tilda_test_1, w_ridge_1,degree_opt_1_ridge)
predictions_ridge_1 = predict_ridge(tX_tilda_test_1, w_ridge_1, 6)
# predictions_ridge_2_3 = predict_ridge(tX_tilda_test_2_3, w_ridge_2_3, degree_opt_2_3_ridge)
# predictions_ridge_2 = predict_ridge(tX_tilda_test_2, w_ridge_2, degree_opt_2_ridge)
# predictions_ridge_2 = predict_ridge(tX_tilda_test_3, w_ridge_3, degree_opt_3_ridge)

### Predictions with logistic regression

In [83]:
predictions_logistic_0 = predict_logistic(tX_tilda_test_0, w_logistic_0, degree_opt_0_logistic)
predictions_logistic_1 = predict_logistic(tX_tilda_test_1, w_logistic_1, degree_opt_1_logistic)
predictions_logistic_2 = predict_logistic(tX_tilda_test_2, w_logistic_2, degree_opt_2_logistic)
predictions_logistic_3 = predict_logistic(tX_tilda_test_3, w_logistic_3, degree_opt_3_logistic)

Now we have to reconstruct a single vector of predictions

### Generate final prediction list and print it in the file

In [None]:
# final_predictions_GD = create_output(tX_test,predictions_GD_0,predictions_GD_1,predictions_GD_2_3)

In [None]:
# final_predictions_ridge = create_output(tX_test,predictions_ridge_0,predictions_ridge_1,predictions_ridge_2_3)

In [None]:
# final_predictions_logistic = create_output(tX_test,predictions_logistic_0,predictions_logistic_1,predictions_logistic_2_3)

In [86]:
final_mixed_predictions = create_output(tX_test, predictions_logistic_0, predictions_logistic_1, predictions_logistic_2, predictions_logistic_3)

In [87]:
print(final_mixed_predictions)
print(1 / len(final_mixed_predictions) * np.count_nonzero(final_mixed_predictions == 1))

[-1 -1 -1 ... -1 -1 -1]
5.279477965218799e-06


In [None]:
# this does not seem to be used and should be removed in such a case
#def predict_labels(weights, tX_test):
#    y = np.array(tX_test) @ np.array(weights)
#    labels = [1 if l > 0 else -1 for l in y]
#    return labels

In [79]:
OUTPUT_PATH_GD = '../data/submission_GD.csv' # name towards GD output 
OUTPUT_PATH_RIDGE = '../data/submission_ridge.csv' # name towards ridge output 
OUTPUT_PATH_LOGISTIC = '../data/submission_logistic.csv' # name towards logistic output 
#create_csv_submission(ids_test, final_predictions_GD, OUTPUT_PATH_GD) # print csv file according to results
# create_csv_submission(ids_test, final_predictions_ridge, OUTPUT_PATH_RIDGE) # print csv file according to results
#create_csv_submission(ids_test, final_predictions_logistic, OUTPUT_PATH_LOGISTIC)
create_csv_submission(ids_test, final_mixed_predictions, OUTPUT_PATH_LOGISTIC)