# Higgs Boson - ML Challenge

In [45]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from implementation import *
from model_helpers import *
from data_helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the dataset

In [46]:
paths = { 
    'train' : 'data/train.csv',
     'test' : 'data/test.csv',
     'submission' : 'data/sample-submission.csv'
        }

y_tr, tx_tr, ids_tr = load_csv_data(paths['train'], sub_sample=False)
y_te, tx_te, ids_te = load_csv_data(paths['test'])

In [47]:
len_test = len(y_te)

y_tr = y_tr[:, np.newaxis]
y_pred = np.zeros(len_test)

## Divide into 4 different subsets depending on jet value

In [48]:
mask_tr = get_mask(tx_tr)
mask_te = get_mask(tx_te)

x_tr_subsamples = []
y_tr_subsamples = []

x_te_subsamples = []

# create subsamples
for i in range(4):
    x_tr_subsamples.append(tx_tr[mask_tr[i]])
    y_tr_subsamples.append(y_tr[mask_tr[i]])
    x_te_subsamples.append(tx_te[mask_te[i]])

## Process the dataset

In [49]:
# pre-process each subsample
for j in range(4):
    x_tr_subsamples[j] = pre_processing(x_tr_subsamples[j], j)
    x_te_subsamples[j] = pre_processing(x_te_subsamples[j], j)

## Cross Validation

In [6]:
def cross_validation(y, x, k_indices, k, lambda_, degree, gamma, function='',max_iters=1000):
    """
    Return the loss of ridge regression for a fold corresponding to k_indices
    
    Args:
        y:          shape=(N, 1)
        x:          shape=(N, D)
        k_indices:  2D array returned by build_k_indices()
        k:          scalar, the k-th fold
        lambda_:    scalar, used by ridge regression
        degree:     scalar, used by build poly
        gamma:      scalar, stepsize

    Returns:
        test loss: probability of predicting correct values
    """
    
    train_id = np.delete(k_indices, k, axis=0).ravel()
    test_id = k_indices[k]
    
    x_tr, y_tr = x[train_id], y[train_id]
    x_te, y_te = x[test_id], y[test_id]
    
    x_tr, x_te = build_poly(x_tr, degree), build_poly(x_te, degree)
    
    initial_w = np.zeros((x_tr.shape[1], 1))
    
    if function == 'LinearRegression':
        
        raise NotImplemented
    
    elif function == 'RidgeRegression':
        
        w, _ = ridge_regression(y_tr, x_tr, lambda_)
    
    elif function == 'LeastSquares':
        
        w, _ = least_squares(y_tr, x_tr)
        
    elif function == 'LogisticRegression':
        
        w, _ = logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma)
        
    elif function == 'RegLogisticRegression':
        
        w, _ = reg_logistic_regression(y_tr, x_tr, lambda_, initial_w, max_iters, gamma)
    
    return (y_te == predict_labels(x_te, w)).mean()

In [9]:
def cross_validation_grid_search(txs, ys, func):
    """Runs cross validation on the data with different values of hyperparameters to compare accuracy
    
    Args:
        txs: subsets of train dataset
        ys:  labels of the different subsets
        func: string, types of function
        
    """
    # Hyperparameters
    
    seed = 51
    k_fold = 4
    
    # Lambda: regularization parameter
    lambdas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    
    # Degree: feature augmentation
    degrees = range(2, 8, 2)
    
    # Gamma: stepsize
    gammas = [1e-4, 1e-3, 1e-2, 1e-1]
    
    # split data in k fold
    k_indices = []
    for i in range(len(txs)):
        k_indices.append(build_k_indices(ys[i].shape[0], k_fold, seed))
        
    print("Function: " + func)
        
    # cross validation
    for i in range(len(txs)):
        
        max_acc = 0
        print(f"*Set {i}")
        
        for l in lambdas:
            for d in degrees:
                for g in gammas:
                    
                    pred_pcts = []
                    
                    for k in range(k_fold):
                        pred_pct = cross_validation(ys[i], txs[i], k_indices[i], k, l, d, g, func)
                        pred_pcts.append(pred_pct)
                        
                    pct = np.mean(pred_pcts)
                    if pct > max_acc:
                        max_acc = pct
                        print(f">>>>Set {i}/lamdba={l}/deg={d}/gamma={g}/ACC={np.around(pct, 3)}")

In [10]:
functions = ['LinearRegression', 'RidgeRegression', 'LeastSquares', \
             'LogisticRegression', 'RegLogisticRegression']

for func in functions[1:]:
    cross_validation_grid_search(x_tr_subsamples,y_tr_subsamples,func)

Function: RidgeRegression
*Set 0
>>>>Set 0/lamdba=1e-05/deg=2/gamma=0.0001/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=2/gamma=0.001/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=2/gamma=0.01/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=2/gamma=0.1/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=4/gamma=0.0001/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=4/gamma=0.001/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=4/gamma=0.01/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=4/gamma=0.1/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=6/gamma=0.0001/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=6/gamma=0.001/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=6/gamma=0.01/ACC=0.0
>>>>Set 0/lamdba=1e-05/deg=6/gamma=0.1/ACC=0.0
>>>>Set 0/lamdba=0.0001/deg=2/gamma=0.0001/ACC=0.0
>>>>Set 0/lamdba=0.0001/deg=2/gamma=0.001/ACC=0.0
>>>>Set 0/lamdba=0.0001/deg=2/gamma=0.01/ACC=0.0
>>>>Set 0/lamdba=0.0001/deg=2/gamma=0.1/ACC=0.0
>>>>Set 0/lamdba=0.0001/deg=4/gamma=0.0001/ACC=0.0
>>>>Set 0/lamdba=0.0001/deg=4/gamma=0.001/ACC=0.0
>>>>Set 0/lamdba=0.0001/deg=4/gamma=0.01/ACC=0.0
>>>>Set 0/lamdba=0.0001/deg=4/gamma=0

>>>>Set 2/lamdba=0.1/deg=2/gamma=0.0001/ACC=0.674
>>>>Set 2/lamdba=0.1/deg=2/gamma=0.001/ACC=0.674
>>>>Set 2/lamdba=0.1/deg=2/gamma=0.01/ACC=0.674
>>>>Set 2/lamdba=0.1/deg=2/gamma=0.1/ACC=0.674
>>>>Set 2/lamdba=0.1/deg=4/gamma=0.0001/ACC=0.677
>>>>Set 2/lamdba=0.1/deg=4/gamma=0.001/ACC=0.677
>>>>Set 2/lamdba=0.1/deg=4/gamma=0.01/ACC=0.677
>>>>Set 2/lamdba=0.1/deg=4/gamma=0.1/ACC=0.677
>>>>Set 2/lamdba=0.1/deg=6/gamma=0.0001/ACC=0.677
>>>>Set 2/lamdba=0.1/deg=6/gamma=0.001/ACC=0.677
>>>>Set 2/lamdba=0.1/deg=6/gamma=0.01/ACC=0.677
>>>>Set 2/lamdba=0.1/deg=6/gamma=0.1/ACC=0.677
*Set 3
>>>>Set 3/lamdba=1e-05/deg=2/gamma=0.0001/ACC=0.747
>>>>Set 3/lamdba=1e-05/deg=2/gamma=0.001/ACC=0.747
>>>>Set 3/lamdba=1e-05/deg=2/gamma=0.01/ACC=0.747
>>>>Set 3/lamdba=1e-05/deg=2/gamma=0.1/ACC=0.747
>>>>Set 3/lamdba=1e-05/deg=4/gamma=0.0001/ACC=0.751
>>>>Set 3/lamdba=1e-05/deg=4/gamma=0.001/ACC=0.751
>>>>Set 3/lamdba=1e-05/deg=4/gamma=0.01/ACC=0.751
>>>>Set 3/lamdba=1e-05/deg=4/gamma=0.1/ACC=0.751
>>>>Set

>>>>Set 1/lamdba=0.001/deg=6/gamma=0.1/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=2/gamma=0.0001/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=2/gamma=0.001/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=2/gamma=0.01/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=2/gamma=0.1/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=4/gamma=0.0001/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=4/gamma=0.001/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=4/gamma=0.01/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=4/gamma=0.1/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=6/gamma=0.0001/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=6/gamma=0.001/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=6/gamma=0.01/ACC=0.0
>>>>Set 1/lamdba=0.01/deg=6/gamma=0.1/ACC=0.0
>>>>Set 1/lamdba=0.1/deg=2/gamma=0.0001/ACC=0.0
>>>>Set 1/lamdba=0.1/deg=2/gamma=0.001/ACC=0.0
>>>>Set 1/lamdba=0.1/deg=2/gamma=0.01/ACC=0.0
>>>>Set 1/lamdba=0.1/deg=2/gamma=0.1/ACC=0.0
>>>>Set 1/lamdba=0.1/deg=4/gamma=0.0001/ACC=0.0
>>>>Set 1/lamdba=0.1/deg=4/gamma=0.001/ACC=0.0
>>>>Set 1/lamdba=0.1/deg=4/gamma=0.01/ACC=0.0
>>>>Set 1/lamdba=0.1/deg=4/gamma=0.1/ACC=0.0
>>>>Set 1/l

KeyboardInterrupt: 

## Train the model with the best parameters

In [None]:
def train_model(txs, ys, lambda_, gamma, degree):
    """Trains the classifier model
    
    Args:
        txs: training data split into three subsets
        y: labels of training data split into three subsets
    
    Returns:
        ws: weights of each subsets.
    """
    
    ws = []
    
    for i in range(len(txs)):
        x_poly = build_poly(txs[i],degree)
        weights, loss = reg_logistic_regression(ys[i], x_poly, lambda_=lambda_, initial_w=np.zeros((x_poly.shape[1], 1)), max_iters=1000, gamma=gamma)
        ws.append(weights)
        
    return ws

In [None]:
lambda_, gamma, degree = 1e-5, 1e-2, 4

ws = train_model(x_tr_subsamples, y_tr_subsamples, lambda_, gamma, degree)

## Generate submission

In [None]:
def generate_predictions(txs_te, ws, mask_test, y_pred):
    """Generate the predictions and save ouput
    
    Args:
        txs_te: subsets of test dataset
        ws: weights of the different subsets
        
    """
    
    for j in range(len(txs_te)):
            y_pred[mask_test[j]] = [y[0] for y in predict_labels(build_poly(txs_te[j],degree), ws[j])]
            
    create_csv_submission(ids_te, y_pred, paths['submission'])

In [None]:
generate_predictions(x_te_subsamples, ws, mask_te, y_pred)