In [152]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from helpers import *
from implementations import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [153]:
import csv


def load_data():
    # Opening files
    with open("resources/train.csv", newline="") as csvfile:
        train_data = np.array(list(csv.reader(csvfile)))
    with open("resources/test.csv", newline="") as csvfile:
        test_data = np.array(list(csv.reader(csvfile)))
    return train_data, test_data

In [154]:
train_data, test_data = load_data()
print("Finished loading data")

Finished loading data


In [155]:
def data_preprocessing(data):
    # Labels and removing headers
    ids = data[1:, 0]
    labels = data[1:, 1]
    signal = np.array(data[1:, 2:], dtype=float)
    signal = np.c_[signal, ids]
    signal = np.array(signal, dtype=float)
    labels = np.array(
        list(map(lambda x: 0 if x == "s" else 1, labels))
    )

    # Separate data into 4 samples based on PRI_jet_num values (col 22)
    sets = [[] for i in range(4)]
    set_labels = [[] for i in range(4)]
    for i in range(4):
        sets[i] = signal[signal[:, 22] == i]
        set_labels[i] = labels[signal[:, 22] == i]
        l = len(sets[i])

        # Remove features which are entirely -999 or with an unique value in the column
        noisy_feats = []
        for col in range(signal.shape[1]):
            proportion = np.count_nonzero(sets[i][:, col] == -999) / l
            deviation = np.std(sets[i][:, col])
            if proportion == 1 or deviation == 0:
                noisy_feats += [col]
        sets[i] = np.delete(sets[i], noisy_feats, axis=1)

    # Set remaining -999 values to 0
    for i in range(4):
        m = np.median(sets[i][sets[i] != -999])
        sets[i][sets[i] == -999] = m

    id_sets = [x[:, -1] for x in sets]
    for i in range(len(sets)):
        sets[i] = np.delete(sets[i], -1, 1)

    # Standardizing data
    sets = [standardize(s) for s in sets]
    return sets, set_labels, id_sets


In [156]:
training_sets, training_labels, _ = data_preprocessing(train_data)
test_sets, test_labels, test_ids = data_preprocessing(test_data)

print("Finished preprocessing")

Finished preprocessing


In [159]:
seed = 4
max_degree = 5
k_fold_sets = 4

ws = []
degrees = []

# Train and cross validate
for i in range(4):
    print("For jet value : %d" % i)
    deg, _, _, w = cross_validation(ridge_regression, training_labels[i], training_sets[i], k_fold_sets,
                                 max_degree + 1, seed, np.linspace(0.0, 0.5, num=30)[1:])
    ws.append(w)
    degrees.append(deg)

For jet value : 0
For polynomial expansion up to degree 5, best degree : 5, best lambda : 0.01724, accuracy : 0.832272
For jet value : 1
For polynomial expansion up to degree 5, best degree : 5, best lambda : 0.01724, accuracy : 0.776720
For jet value : 2
For polynomial expansion up to degree 5, best degree : 5, best lambda : 0.01724, accuracy : 0.798356
For jet value : 3
For polynomial expansion up to degree 5, best degree : 5, best lambda : 0.01724, accuracy : 0.789839


In [81]:
lambdas = [0.008, 0.017, 0.008, 0.008]  # Obtained with cross validation
degrees = [8, 8, 9, 9]
ws = []

for i in range(4):
    X_expanded = polynomial_expansion(training_sets[i], degrees[i])
    w, _ = ridge_regression(training_labels[i], X_expanded, lambdas[i])
    accuracy = accuracy_score(training_labels[i], X_expanded, w)
    ws.append(w)
    print("For jet value : %d, produced accuracy of : %f" % (i, accuracy))


For jet value : 0, produced accuracy of : 0.840111
For jet value : 1, produced accuracy of : 0.791680
For jet value : 2, produced accuracy of : 0.826594
For jet value : 3, produced accuracy of : 0.821783
