# Higgs Boson - ML Challenge

In [30]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from implementation import *
from model_helpers import *
from data_helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the dataset

In [31]:
# Locations of the two input CSV files and of the file handed to
# create_csv_submission at the end of the notebook.
# NOTE(review): the 'submission' entry points at the sample submission file,
# which presumably gets overwritten when the submission is created - confirm.
paths = dict(
    train='data/train.csv',
    test='data/test.csv',
    submission='data/sample-submission.csv',
)

# Number of subsets; used as the loop bound when splitting and pre-processing.
N = 4

y_tr, tx_tr, ids_tr = load_csv_data(paths['train'], sub_sample=False)
y_te, tx_te, ids_te = load_csv_data(paths['test'])

In [32]:
# Number of test rows; sizes the prediction buffer below.
len_test = len(y_te)

# Turn the training labels into a column vector, shape (n, 1).
# reshape(-1, 1) is used instead of indexing with np.newaxis so that running
# this cell a second time leaves the shape unchanged rather than adding yet
# another axis. Assumes load_csv_data returns 1-D labels - TODO confirm.
y_tr = y_tr.reshape(-1, 1)

# Flat buffer that generate_predictions fills in place, one subset at a time.
y_pred = np.zeros(len_test)

## Divide into 4 different subsets depending on jet value

In [33]:
# One mask per subset for each dataset; mask_*[i] is used to index the rows of
# subset i (presumably grouped by jet value, per the section title - confirm
# against get_mask).
mask_tr = get_mask(tx_tr)
mask_te = get_mask(tx_te)

# Training features and labels, split subset by subset.
x_tr_subsamples = [tx_tr[mask_tr[subset]] for subset in range(N)]
y_tr_subsamples = [y_tr[mask_tr[subset]] for subset in range(N)]

# Test features, split with the test masks.
x_te_subsamples = [tx_te[mask_te[subset]] for subset in range(N)]

## Process the dataset

In [34]:
# Pre-process each train/test subset pair together; the subset index is passed
# along as the third argument.
# NOTE: the lists are overwritten in place, so re-running this cell feeds
# already processed data back into pre_processing.
for j, (x_tr_j, x_te_j) in enumerate(zip(x_tr_subsamples, x_te_subsamples)):
    x_tr_subsamples[j], x_te_subsamples[j] = pre_processing(x_tr_j, x_te_j, j)

## Cross Validation

In [41]:
def cross_validation(y, x, k_indices, k, lambda_, degree, gamma, function, max_iters=500):
    """
    Return the accuracy of one model on the k-th cross-validation fold.

    The rows listed in k_indices[k] are held out. The model selected by
    `function` is fitted on all remaining rows after polynomial expansion with
    build_poly, and the held-out rows are labelled with predict_labels.

    Args:
        y:          shape=(N, 1)
        x:          shape=(N, D)
        k_indices:  2D array returned by build_k_indices()
        k:          scalar, the k-th fold
        lambda_:    scalar, regularization parameter; only passed to
                    'RidgeRegression' and 'RegLogisticRegression'
        degree:     scalar, used by build poly
        gamma:      scalar, stepsize; only passed to 'LogisticRegression'
                    and 'RegLogisticRegression'
        function:   string, one of 'RidgeRegression', 'LeastSquares',
                    'LogisticRegression', 'RegLogisticRegression'
        max_iters:  scalar, iteration count passed to the two logistic variants

    Returns:
        fraction of held-out labels that are equal to the predicted labels

    Raises:
        ValueError: if `function` is not one of the supported names
    """

    supported = ('RidgeRegression', 'LeastSquares',
                 'LogisticRegression', 'RegLogisticRegression')

    # Fail fast with a clear message; otherwise an unknown name would fall
    # through every branch below and crash later because `w` was never bound.
    if function not in supported:
        raise ValueError(f"Unknown function {function!r}; expected one of {supported}")

    # Fold k is the validation set, every other fold is training data.
    train_id = np.delete(k_indices, k, axis=0).ravel()
    test_id = k_indices[k]

    x_tr, y_tr = x[train_id], y[train_id]
    x_te, y_te = x[test_id], y[test_id]

    x_tr, x_te = build_poly(x_tr, degree), build_poly(x_te, degree)

    if function == 'RidgeRegression':

        w, _ = ridge_regression(y_tr, x_tr, lambda_)

    elif function == 'LeastSquares':

        w, _ = least_squares(y_tr, x_tr)

    else:
        # Both logistic variants are iterative and start from zero weights,
        # one weight per expanded feature column.
        initial_w = np.zeros((x_tr.shape[1], 1))

        if function == 'LogisticRegression':

            w, _ = logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma)

        else:

            w, _ = reg_logistic_regression(y_tr, x_tr, lambda_, initial_w, max_iters, gamma)

    return (y_te == predict_labels(x_te, w)).mean()

In [42]:
def cross_validation_grid_search(txs, ys, func):
    """
    Runs cross validation on the data with different values of hyperparameters to compare accuracy

    For every subset, each (lambda, degree, gamma) combination is scored with
    k-fold cross validation and the combination with the highest mean accuracy
    is kept. A line is printed every time a subset's best accuracy improves.

    Args:
        txs: subsets of train dataset
        ys:  labels of the different subsets
        func: string, types of function (forwarded to cross_validation)

    Returns:
        list with one (lambda, degree, gamma, accuracy) tuple per subset; a
        subset for which no combination scores above 0 keeps (-1, -1, -1, -1)
    """
    # Hyperparameters

    seed = 51
    k_fold = 4

    # Lambda: regularization parameter
    lambdas = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]

    # Degree: feature augmentation
    degrees = range(2, 10, 1)

    # Gamma: stepsize
    gammas = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

    # cross_validation does not pass lambda and/or gamma to every model. Looping
    # over a value the model never receives repeats the exact same fit, and
    # because only a strictly better accuracy replaces the current best, the
    # first value of such a grid is the one that ends up reported anyway.
    # Collapsing those grids keeps the result and the printed lines unchanged
    # (assuming the fits are deterministic - TODO confirm for the iterative
    # models) while skipping the redundant fits.
    known = ('RidgeRegression', 'LeastSquares',
             'LogisticRegression', 'RegLogisticRegression')
    uses_lambda = ('RidgeRegression', 'RegLogisticRegression')
    uses_gamma = ('LogisticRegression', 'RegLogisticRegression')
    if func in known:
        if func not in uses_lambda:
            lambdas = lambdas[:1]
        if func not in uses_gamma:
            gammas = gammas[:1]

    # split data in k fold
    k_indices = [build_k_indices(ys[i].shape[0], k_fold, seed) for i in range(len(txs))]

    print(f"Function: {func}")

    # One sentinel entry per subset (previously hard-coded to 4 entries).
    best_tuple = [(-1, -1, -1, -1)] * len(txs)

    # cross validation
    for i in range(len(txs)):

        max_acc = 0
        print(f"->Subset {i}:")

        for l in lambdas:
            for d in degrees:
                for g in gammas:

                    # Mean accuracy over the k folds for this combination.
                    pct = np.mean([
                        cross_validation(ys[i], txs[i], k_indices[i], k, l, d, g, func)
                        for k in range(k_fold)
                    ])

                    if pct > max_acc:
                        max_acc = pct
                        best_tuple[i] = (l, d, g, pct)
                        print(f"- Set {i}/lamdba={l}/deg={d}/gamma={g}/ACC={np.around(pct, 3)}")

    return best_tuple

In [43]:
functions = ['RidgeRegression', 'LeastSquares', 'RegLogisticRegression']

# The [:-2] slice drops the last two names, so with the list above only
# 'RidgeRegression' is grid-searched. Results are stored per model name.
best_tuple = {
    name: cross_validation_grid_search(x_tr_subsamples, y_tr_subsamples, name)
    for name in functions[:-2]
}

Function: RidgeRegression
->Subset 0:
- Set 0/lamdba=1e-08/deg=2/gamma=1e-06/ACC=0.836
- Set 0/lamdba=1e-08/deg=3/gamma=1e-06/ACC=0.841
- Set 0/lamdba=1e-08/deg=4/gamma=1e-06/ACC=0.841
- Set 0/lamdba=1e-08/deg=5/gamma=1e-06/ACC=0.843
- Set 0/lamdba=1e-08/deg=6/gamma=1e-06/ACC=0.843
- Set 0/lamdba=1e-08/deg=7/gamma=1e-06/ACC=0.844
- Set 0/lamdba=1e-08/deg=8/gamma=1e-06/ACC=0.844
- Set 0/lamdba=1e-06/deg=9/gamma=1e-06/ACC=0.844
- Set 0/lamdba=1e-05/deg=9/gamma=1e-06/ACC=0.844
->Subset 1:
- Set 1/lamdba=1e-08/deg=2/gamma=1e-06/ACC=0.784
- Set 1/lamdba=1e-08/deg=3/gamma=1e-06/ACC=0.803
- Set 1/lamdba=1e-08/deg=4/gamma=1e-06/ACC=0.805
- Set 1/lamdba=1e-08/deg=5/gamma=1e-06/ACC=0.807
- Set 1/lamdba=1e-08/deg=6/gamma=1e-06/ACC=0.807
- Set 1/lamdba=1e-08/deg=7/gamma=1e-06/ACC=0.808
- Set 1/lamdba=1e-08/deg=8/gamma=1e-06/ACC=0.808
- Set 1/lamdba=0.0001/deg=9/gamma=1e-06/ACC=0.808
->Subset 2:
- Set 2/lamdba=1e-08/deg=2/gamma=1e-06/ACC=0.819
- Set 2/lamdba=1e-08/deg=3/gamma=1e-06/ACC=0.832
- Set 

In [44]:
# Show the (lambda, degree, gamma, accuracy) tuple kept for each subset by the ridge regression grid search.
best_tuple['RidgeRegression']

[(1e-05, 9, 1e-06, 0.8436824405476819),
 (0.0001, 9, 1e-06, 0.8078123388011967),
 (1e-05, 9, 1e-06, 0.835159599809433),
 (0.001, 6, 1e-06, 0.8399205919509115)]

## Train the model with the best parameters

In [45]:
def train_model(txs, ys, params):
    """Trains one ridge regression model per subset

    Each subset is expanded with build_poly using its own degree and then
    fitted with ridge_regression using its own lambda. The gamma and accuracy
    entries of `params` are not used here.

    Args:
        txs: training data split into subsets
        ys: labels of training data, split into the same subsets
        params: one (lambda, degree, gamma, accuracy) tuple per subset, as
            returned by cross_validation_grid_search

    Returns:
        ws: weights of each subsets.
    """

    ws = []

    for i in range(len(txs)):

        lambda_, degree, _, _ = params[i]
        x_poly = build_poly(txs[i], degree)

        # ridge_regression returns a pair; only its first element (the
        # weights) is kept.
        ws.append(ridge_regression(ys[i], x_poly, lambda_=lambda_)[0])

    return ws

In [46]:
# Model name whose grid-search results are used for training and prediction.
chosen_function = 'RidgeRegression'

# One set of weights per subset, fitted with that subset's best hyperparameters.
ws = train_model(
    txs=x_tr_subsamples,
    ys=y_tr_subsamples,
    params=best_tuple[chosen_function],
)

## Generate submission

In [47]:
def generate_predictions(txs_te, ws, mask_test, y_pred, params, ids=None, submission_path=None):
    """Generate the predictions and save ouput

    Every test subset is expanded with build_poly using its own degree,
    labelled with predict_labels, and the labels are written into `y_pred` at
    the positions selected by that subset's mask. The filled `y_pred` is then
    passed to create_csv_submission.

    Args:
        txs_te: subsets of test dataset
        ws: weights of the different subsets
        mask_test: one mask per subset, selecting that subset's entries in `y_pred`
        y_pred: 1-D prediction buffer covering the whole test set; modified in place
        params: one tuple per subset whose second entry is the polynomial degree
        ids: ids passed to create_csv_submission; defaults to the notebook-level
            `ids_te` when omitted
        submission_path: output path passed to create_csv_submission; defaults to
            the notebook-level `paths['submission']` when omitted
    """

    # Fall back to the notebook globals so existing five-argument calls keep working.
    if ids is None:
        ids = ids_te
    if submission_path is None:
        submission_path = paths['submission']

    for j in range(len(txs_te)):
        degree = params[j][1]
        labels = predict_labels(build_poly(txs_te[j], degree), ws[j])
        # Keep the first entry of every row, i.e. flatten the label column into
        # the 1-D buffer (assumes predict_labels returns a 2-D array - TODO confirm).
        y_pred[mask_test[j]] = np.asarray(labels)[:, 0]

    create_csv_submission(ids, y_pred, submission_path)

In [48]:
# Fill y_pred subset by subset and hand the result to create_csv_submission.
generate_predictions(
    txs_te=x_te_subsamples,
    ws=ws,
    mask_test=mask_te,
    y_pred=y_pred,
    params=best_tuple[chosen_function],
)