# Python notebook used to tune the model for the Higgs Boson Challenge

#### EPFL - Machine Learning - Autumn 2019

## 1) Header

In [77]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from proj1_helpers import *
from implementations import *
from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Randomisation

In [2]:
# Seed NumPy's global random generator once, up front, so that the random
# draws made through np.random in the cells below are repeatable.
seed = 374534
np.random.seed(seed)

In [71]:
# Small sandbox for the boolean row-mask pattern: draw an 8x6 matrix of
# uniform values in [0, 1) and keep only the rows that contain at least one
# entry strictly greater than 0.9.
e = np.random.random((8, 6))
print(e)

# Row-wise reduction: a row is selected as soon as one of its columns
# exceeds the threshold.
meaningfull_ind = np.any(e > 0.9, axis=1)
print(meaningfull_ind)

# Boolean indexing keeps the selected rows; as the last expression of the
# cell it is what the notebook displays.
e[meaningfull_ind]

[[0.23676309 0.6913665  0.72123764 0.77355978 0.29272284 0.0921879 ]
 [0.13874637 0.9578856  0.76160123 0.89739508 0.8193797  0.40526392]
 [0.33890697 0.82765583 0.27155213 0.08125309 0.10757594 0.43955623]
 [0.24514069 0.45463584 0.39520304 0.63212849 0.64717466 0.99200813]
 [0.18115957 0.30649788 0.12005232 0.81795219 0.55313438 0.34055915]
 [0.79159044 0.26880083 0.5878979  0.03571236 0.24928637 0.29290182]
 [0.98049819 0.11542089 0.28694477 0.49493764 0.11096529 0.27820023]
 [0.00483833 0.99375106 0.48376429 0.65549112 0.1059759  0.95543885]]
[False  True False  True False False  True  True]


array([[0.13874637, 0.9578856 , 0.76160123, 0.89739508, 0.8193797 ,
        0.40526392],
       [0.24514069, 0.45463584, 0.39520304, 0.63212849, 0.64717466,
        0.99200813],
       [0.98049819, 0.11542089, 0.28694477, 0.49493764, 0.11096529,
        0.27820023],
       [0.00483833, 0.99375106, 0.48376429, 0.65549112, 0.1059759 ,
        0.95543885]])

## 2) Data loading

In [3]:
# Locations of the two challenge CSV files, relative to this notebook.
DATA_TRAIN_PATH = '../data/train.csv'  # TODO: download the train data and supply its path here
DATA_TEST_PATH = '../data/test.csv'  # TODO: download the test data and supply its path here

# Read the training file with sub-sampling explicitly switched off.
# load_csv_data hands back three values, kept here under "*_raw" names so
# later cells can work on copies.
y_train_raw, x_train_raw, ids_train = load_csv_data(
    DATA_TRAIN_PATH,
    sub_sample=False,
)

# Same loader for the test file. Its first returned value is parked in `_`
# (presumably placeholder labels - confirm in proj1_helpers); note that the
# preprocessing cell further down passes this `_` on, so it must stay bound.
_, x_test_raw, ids_test = load_csv_data(DATA_TEST_PATH)

In [42]:
# Start every experiment from fresh copies of the loaded arrays so the
# "*_raw" versions are never modified by the cells below.
x_train = np.copy(x_train_raw)
y_train = np.copy(y_train_raw)
x_test = np.copy(x_test_raw)

# Shape report for the training arrays, then the test arrays.
print("The dimensions of x_train are ", x_train.shape)
print("The dimension of y_train is ", y_train.shape)
print("The dimension of ids_train is ", ids_train.shape, "\n")
print("The dimensions of x_test are ", x_test.shape)
print("The dimension of ids_test is ", ids_test.shape)

# Mask of the training rows in which no column equals -999
# (presumably the dataset's missing-value marker - confirm).
meaningfull_ind = np.all(x_train != -999, axis=1)
print(meaningfull_ind)

The dimensions of x_train are  (250000, 30)
The dimension of y_train is  (250000,)
The dimension of ids_train is  (68114,) 

The dimensions of x_test are  (568238, 30)
The dimension of ids_test is  (568238,)
[ True False False ... False False False]


## 3) Data preprocessing

In [46]:
# Run the shared preprocessing helper on both sets. The last argument picks
# the helper's mode: "discard" for the training data, "zero" for the test data
# (exact semantics live in data_preprocessing - see helpers/implementations).
# NOTE(review): this cell overwrites its own inputs, so running it twice
# preprocesses already-preprocessed arrays - re-run the copy cell first.
y_train, x_train, ids_train = data_preprocessing(
    y_train, x_train, ids_train, "discard"
)

# `_` is the value left over from loading the test file; it is passed in as
# the first argument and the first returned value is thrown away again.
_, x_test, ids_test = data_preprocessing(
    _, x_test, ids_test, "zero"
)


# Shape report after preprocessing.
print("The dimensions of x_train are ", x_train.shape)
print("The dimension of y_train is ", y_train.shape)
print("The dimension of ids_train is ", ids_train.shape, "\n")
print("The dimensions of x_test are ", x_test.shape)
print("The dimension of ids_test is ", ids_test.shape)


The dimensions of x_train are  (250000, 30)
The dimension of y_train is  (250000,)
The dimension of ids_train is  (68114,) 

The dimensions of x_test are  (568238, 30)
The dimension of ids_test is  (568238,)


## 4) Model tuning

In [82]:
# Search grid for the cross-validation run.
k_fold = 3
degrees = [8]
# Five regularisation strengths, log-spaced between 1e-10 and 1e-5.
lambdas = np.logspace(start=-10, stop=-5, num=5)

# One cross-validation run per candidate degree, each over the full lambda grid.
# NOTE(review): 657 is passed as the last positional argument - presumably a
# random seed for the fold split; confirm against cross_validation_demo.
for degree in degrees:
    cross_validation_demo(
        y_train,
        x_train,
        k_fold,
        lambdas,
        degree,
        657,
    )

KeyboardInterrupt: 

## 5) Prediction of the test data labels

In [53]:
# Hyper-parameters of the final model used for the submission.
degree = 9
minusloglambda = 2

# lambda_ = 10^(-minusloglambda).
# The exponent must be read from `minusloglambda`: no variable called
# `loglambda` is defined anywhere in this notebook, so the previous
# expression only worked with leftover kernel state and raised NameError
# after Restart & Run All.
lambda_ = 10 ** (-minusloglambda)

# Fit on the training set and get predictions for both the train and test sets.
y_tr_pd, y_te_pd = prediction(x_train, y_train, x_test, degree, lambda_)

# The submission file name records the hyper-parameters that produced it.
name = "submission_{0}_{1}.csv".format(degree, minusloglambda)
create_csv_submission(ids_test, y_te_pd, name)

The train data accuracy of the model is  0.8246322341956133 
The train data f1 score of the model is  0.8158142260188426
