# Python notebook used to tune the model for the Higgs Boson Challenge

#### EPFL - Machine Learning - Autumn 2019

## 1) Header

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from proj1_helpers import *
from implementations import *
from helpers import *

%load_ext autoreload
%autoreload 2

#### Randomisation

In [2]:
seed=374534
np.random.seed(seed)

## 2) Data loading

In [3]:
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download train data and supply path here 
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 

y_train_raw, x_train_raw, ids_train = load_csv_data(DATA_TRAIN_PATH,sub_sample=False)
_, x_test_raw, ids_test = load_csv_data(DATA_TEST_PATH)

In [4]:
y_train = np.copy(y_train_raw)
x_train = np.copy(x_train_raw)
x_test = np.copy(x_test_raw)

print("The dimensions of x_train are ",x_train.shape)
print("The dimension of y_train is ",y_train.shape)
print("The dimension of ids_train is ",ids_train.shape, "\n")
print("The dimensions of x_test are ",x_test.shape)
print("The dimension of ids_test is ",ids_test.shape)

The dimensions of x_train are  (250000, 30)
The dimension of y_train is  (250000,)
The dimension of ids_train is  (250000,) 

The dimensions of x_test are  (568238, 30)
The dimension of ids_test is  (568238,)


## 3) Functions needed to perform the model tuning

Skip to part 4) for the actual model tuning

In [5]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold cross-validation."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)

def cross_validation(y, x, k_indices, k, lambda_, degrees):
    """return the accuracy of ridge regression."""
    y_test=y[k_indices[k,:]]
    x_test=x[k_indices[k,:]]   
    y_train=np.delete(y,k)
    x_train=np.delete(x,k,0)
    
    y_pred_train, y_pred_test = prediction(x_train, y_train, x_test, degrees, lambda_)
    accuracy_train, F1_train = check_accuracy(y_pred_train, y_train)
    accuracy_test, F1_test = check_accuracy(y_pred_test, y_test)
    return accuracy_train, accuracy_test, F1_train, F1_test

def cross_validation_visualization(lambdas, acc_tr, acc_te, f1_tr, f1_te):
    """visualization of the accuracy and the f1 score for the train data and the test data."""
    fig = plt.figure()
    fig.set_size_inches(12,4)
    ax_acc = fig.add_subplot(1, 2, 1)
    ax_f1 = fig.add_subplot(1, 2, 2)
    
    ax_acc.set_xlabel('lambda')
    ax_acc.set_ylabel('accuracy')
    ax_acc.semilogx(lambdas, acc_tr, marker=".", color='b', label='train accuracy')
    ax_acc.semilogx(lambdas, acc_te, marker=".", color='r', label='test accuracy')
    ax_acc.set_title('Accuracy')           
    ax_acc.grid(True)
    ax_acc.legend(loc=2)
    
    ax_f1.set_xlabel('lambda')
    ax_f1.set_ylabel('f1 score')
    ax_f1.semilogx(lambdas, f1_tr, marker=".", color='b', label='train f1 score')
    ax_f1.semilogx(lambdas, f1_te, marker=".", color='r', label='test f1 score')
    ax_f1.set_title('F1 score')           
    ax_f1.grid(True)
    ax_f1.legend(loc=2)
    
    fig.savefig('cross_validation')


def cross_validation_demo(y, x, k_fold, lambdas, degrees,seed=1):
    """to do"""
    k_indices = build_k_indices(y, k_fold,seed)
    acc_tr = []
    acc_te = []
    f1_tr = []
    f1_te = []
    for lambda_ in lambdas:
        acc_tr_lambda=0;
        acc_te_lambda=0;
        f1_tr_lambda=0;
        f1_te_lambda=0;
        for k in range(k_fold):
            accuracy_train, accuracy_test, f1_train, f1_test = cross_validation(y, x, k_indices, k, lambda_, degrees)
            
            acc_tr_lambda += accuracy_train/k_fold
            acc_te_lambda += accuracy_test/k_fold
            f1_tr_lambda += f1_train/k_fold
            f1_te_lambda += f1_test/k_fold
            
        acc_tr.append(acc_tr_lambda)
        acc_te.append(acc_te_lambda)
        f1_tr.append(f1_tr_lambda)
        f1_te.append(f1_te_lambda)
       
    cross_validation_visualization(lambdas, acc_tr, acc_te, f1_tr, f1_te)



## 4) Data preprocessing

In [51]:
y_train, x_train, ids_train = data_preprocessing(y_train, x_train, ids_train,"discard")
_, x_test, ids_test = data_preprocessing(_, x_test, ids_test,"zero")

print("The dimensions of x_train are ",x_train.shape)
print("The dimension of y_train is ",y_train.shape)
print("The dimension of ids_train is ",ids_train.shape, "\n")
print("The dimensions of x_test are ",x_test.shape)
print("The dimension of ids_test is ",ids_test.shape)


The dimensions of x_train are  (68114, 30)
The dimension of y_train is  (68114,)
The dimension of ids_train is  (68114,) 

The dimensions of x_test are  (568238, 30)
The dimension of ids_test is  (568238,)


## 5) Model tuning

In [57]:
degrees = [5,6,7,8,9,10]
k_fold = 4
lambdas = np.logspace(-15, 0, 15)

for degree in degrees:
    cross_validation_demo(y_train, x_train, k_fold, lambdas, degree, 657)

The train data accuracy of the model is  0.8006401127538062 
The train data f1 score of the model is  0.7874595000704346
The train data accuracy of the model is  0.8006254312686272 
The train data f1 score of the model is  0.7874538283353157
The train data accuracy of the model is  0.8006694757241642 
The train data f1 score of the model is  0.7874908043638184
The train data accuracy of the model is  0.8006694757241642 
The train data f1 score of the model is  0.7874841517053547
The train data accuracy of the model is  0.8006401127538062 
The train data f1 score of the model is  0.7874595000704346
The train data accuracy of the model is  0.8006254312686272 
The train data f1 score of the model is  0.7874538283353157
The train data accuracy of the model is  0.8006694757241642 
The train data f1 score of the model is  0.7874908043638184
The train data accuracy of the model is  0.8006694757241642 
The train data f1 score of the model is  0.7874841517053547
The train data accuracy of the m

KeyboardInterrupt: 

## 6) Prediction of the test data labels

In [53]:
degree=9
minusloglambda=2
lambda_=10**(-loglambda)
y_tr_pd, y_te_pd = prediction(x_train, y_train, x_test, degree, lambda_)
name="submission_{0}_{1}.csv".format(degree,minusloglambda)
create_csv_submission(ids_test, y_te_pd, name)

The train data accuracy of the model is  0.8246322341956133 
The train data f1 score of the model is  0.8158142260188426
