In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *

In [3]:
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = "data/train.csv"  # download the training data and adjust this path if needed
# The loader returns three values; per the section heading they are the class labels (y),
# the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Data pre-processing

We split our training set into 4 different groups depending on the value of the `PRI_jet_num` parameter.

In [4]:
from pre_processing import *

In [5]:
# Partition the training data by PRI_jet_num: one (labels, features, ids) subset
# per value 0, 1, 2 and 3, collected in that order.
train_groups = []
for jet_num in range(4):
    train_groups.append(jet_num_split(jet_num, y, tX, ids))

# Bind each subset to the per-group names used by the rest of the notebook.
(y0, tX0, ids0), (y1, tX1, ids1), (y2, tX2, ids2), (y3, tX3, ids3) = train_groups

# Finding the best degrees for the polynomial feature expansion

In [6]:
from cross_validation import *
from implementation import *

We test polynomial feature expansions of every degree up to `max_degree`, evaluating each one with `k_fold`-fold cross-validation.

In [7]:
# Highest polynomial degree tried by the cross-validation (degrees 1..max_degree
# appear in the recorded outputs below).
max_degree = 3
# Number of folds handed to the cross-validation routine.
k_fold = 4

### Reduced training set for `PRI_jet_num=0`

Cross-validation for degree optimisation.

In [8]:
# Degree search for the PRI_jet_num=0 group. The call returns two values: the
# selected expansion degrees (indexed per feature further down in this notebook)
# and, presumably, the training matrix expanded with those degrees.
# The trailing `None` is forwarded as-is; its meaning is defined by
# degree_cross_validation and is not visible here -- TODO document.
# NOTE(review): in the recorded output every TE value equals TR / 4 exactly
# (i.e. TR / k_fold); check that train and test scores are averaged the same
# way inside degree_cross_validation.
expansion_degrees0, tX0_expanded = degree_cross_validation(y0, tX0, k_fold, max_degree, None)

TR 0.5439418893456567 TE 0.13598547233641417 DEG 1
TR 0.7002540516395086 TE 0.17506351290987715 DEG 2
TR 0.9999715104838616 TE 0.2499928776209654 DEG 3


Fit the least-squares model on the expanded training features.

In [9]:
# Fit the PRI_jet_num=0 model on the expanded training features.
# least_squares returns a pair: `w0` is reused later to label the test events,
# `loss0` is kept for inspection only.
w0, loss0 = least_squares(y0, tX0_expanded)

### Reduced training set for `PRI_jet_num=1`

Cross-validation for degree optimisation.

In [10]:
# Degree search for the PRI_jet_num=1 group. Uses the shared `k_fold` constant
# (rather than a literal fold count) so every group is validated with the same
# number of folds when that setting is changed.
# The call returns the selected expansion degrees and, presumably, the training
# matrix expanded with them; the trailing `None` is forwarded as-is -- TODO document.
# NOTE(review): in the recorded output every TE value equals TR / 4 exactly;
# check how degree_cross_validation averages the train and test scores.
expansion_degrees1, tX1_expanded = degree_cross_validation(y1, tX1, k_fold, max_degree, None)

TR 0.7224975464428748 TE 0.1806243866107187 DEG 1
TR 0.6818464229734459 TE 0.17046160574336147 DEG 2
TR 0.7972286811401599 TE 0.19930717028503997 DEG 3


Fit the least-squares model on the expanded training features.

In [11]:
# Fit the PRI_jet_num=1 model on the expanded training features.
# least_squares returns a pair: `w1` is reused later to label the test events,
# `loss1` is kept for inspection only.
w1, loss1 = least_squares(y1, tX1_expanded)

### Reduced training set for `PRI_jet_num=2`

Cross-validation for degree optimisation.

In [12]:
# Degree search for the PRI_jet_num=2 group. Uses the shared `k_fold` constant
# (rather than a literal fold count) so every group is validated with the same
# number of folds when that setting is changed.
# The call returns the selected expansion degrees and, presumably, the training
# matrix expanded with them; the trailing `None` is forwarded as-is -- TODO document.
# NOTE(review): in the recorded output every TE value equals TR / 4 exactly;
# check how degree_cross_validation averages the train and test scores.
expansion_degrees2, tX2_expanded = degree_cross_validation(y2, tX2, k_fold, max_degree, None)

TR 0.7000639175036556 TE 0.1750159793759139 DEG 1
TR 0.7849021872484996 TE 0.1962255468121249 DEG 2
TR 0.999936731781592 TE 0.249984182945398 DEG 3


Fit the least-squares model on the expanded training features.

In [13]:
# Fit the PRI_jet_num=2 model on the expanded training features.
# least_squares returns a pair: `w2` is reused later to label the test events,
# `loss2` is kept for inspection only.
w2, loss2 = least_squares(y2, tX2_expanded)

### Reduced training set for `PRI_jet_num=3`

Cross-validation for degree optimisation.

In [14]:
# Degree search for the PRI_jet_num=3 group. Uses the shared `k_fold` constant
# (rather than a literal fold count) so every group is validated with the same
# number of folds when that setting is changed.
# The call returns the selected expansion degrees and, presumably, the training
# matrix expanded with them; the trailing `None` is forwarded as-is -- TODO document.
# NOTE(review): in the recorded output every TE value equals TR / 4 (up to
# floating-point rounding); check how degree_cross_validation averages the
# train and test scores.
expansion_degrees3, tX3_expanded = degree_cross_validation(y3, tX3, k_fold, max_degree, None)

TR 0.7017729049125367 TE 0.17544322622813419 DEG 1
TR 0.7696390312907369 TE 0.1924097578226842 DEG 2
TR 0.9998968592065258 TE 0.24997421480163146 DEG 3


Fit the least-squares model on the expanded training features.

In [15]:
# Fit the PRI_jet_num=3 model on the expanded training features.
# least_squares returns a pair: `w3` is reused later to label the test events,
# `loss3` is kept for inspection only.
w3, loss3 = least_squares(y3, tX3_expanded)

# Generate predictions

In [16]:
# Location of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = "data/test.csv"  # download the test data and adjust this path if needed
# Same loader as for the training set.
# NOTE(review): `y_pred` holds whatever the loader reads from the test file's
# label column -- presumably placeholders, since real predictions are computed
# later in the notebook; verify against load_csv_data.
y_pred, tX_pred, ids_pred = load_csv_data(DATA_TEST_PATH)

### Pre-processing and feature expansion

In [17]:
# Partition the test data by PRI_jet_num, mirroring the split applied to the
# training set: one (labels, features, ids) subset per value 0, 1, 2 and 3.
test_groups = []
for jet_num in range(4):
    test_groups.append(jet_num_split(jet_num, y_pred, tX_pred, ids_pred))

# Bind each subset to the per-group names used by the following cells.
(
    (y0_pred, tX0_pred, ids0_pred),
    (y1_pred, tX1_pred, ids1_pred),
    (y2_pred, tX2_pred, ids2_pred),
    (y3_pred, tX3_pred, ids3_pred),
) = test_groups

In [18]:
def expand_test_features(tX_group, expansion_degrees):
    """Apply the cross-validated polynomial expansion to one test subset.

    Args:
        tX_group: 2-D feature matrix of one PRI_jet_num group of the test set.
        expansion_degrees: array holding one selected degree per feature, as
            returned by degree_cross_validation for the same group.

    Returns:
        The feature matrix with a column of ones appended, then expanded by
        calling build_poly_index once per feature.
    """
    # Append a constant column of ones as the last column before expanding.
    expanded = np.c_[tX_group, np.ones(tX_group.shape[0])]
    # Walk the features from last to first -- presumably so that columns added
    # for a feature do not shift the indices of features still to be processed
    # (confirm against build_poly_index).
    for feature_index in range(expansion_degrees.shape[0] - 1, -1, -1):
        expanded = build_poly_index(expanded, expansion_degrees[feature_index], feature_index)
    return expanded


# Each test subset is expanded with the degrees selected on the matching
# training subset, so it lines up with the corresponding weight vector.

# PRI_jet_num = 0
tX0_pred_expanded = expand_test_features(tX0_pred, expansion_degrees0)

# PRI_jet_num = 1
tX1_pred_expanded = expand_test_features(tX1_pred, expansion_degrees1)

# PRI_jet_num = 2
tX2_pred_expanded = expand_test_features(tX2_pred, expansion_degrees2)

# PRI_jet_num = 3
tX3_pred_expanded = expand_test_features(tX3_pred, expansion_degrees3)

### Predictions and submission file

In [19]:
# Name of the CSV file written for submission.
OUTPUT_PATH = 'results.csv'

# Pair every group's fitted weights with its expanded test features and label
# each group with its own model. The per-group label names are overwritten
# with the predictions.
models_and_features = (
    (w0, tX0_pred_expanded),
    (w1, tX1_pred_expanded),
    (w2, tX2_pred_expanded),
    (w3, tX3_pred_expanded),
)
y0_pred, y1_pred, y2_pred, y3_pred = [
    predict_labels(weights, features) for weights, features in models_and_features
]

# Stack predictions and ids in the same group order (0, 1, 2, 3) so that each
# prediction stays aligned with its event id. Rows therefore end up grouped by
# PRI_jet_num rather than in the original file order.
group_predictions = (y0_pred, y1_pred, y2_pred, y3_pred)
group_ids = (ids0_pred, ids1_pred, ids2_pred, ids3_pred)
y_pred_sep = np.concatenate(group_predictions)
ids_te_sep = np.concatenate(group_ids)

create_csv_submission(ids_te_sep, y_pred_sep, OUTPUT_PATH)