In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *

In [3]:
# Location of the training CSV, relative to the notebook; the file must be
# downloaded separately and placed (or the path adjusted) accordingly.
DATA_TRAIN_PATH = "data/train.csv" # download train data and supply path
# Load the training set: class labels, feature matrix and event ids.
y_tr, tX_tr, ids_tr = load_csv_data(DATA_TRAIN_PATH)

In [4]:
# Location of the test CSV, relative to the notebook; the file must be
# downloaded separately and placed (or the path adjusted) accordingly.
DATA_TEST_PATH = "data/test.csv" #download test data and supply path
# The `_fin` suffix marks the final set for which the submission is generated;
# it is distinct from the `_te` held-out part split off the training data below.
y_fin, tX_fin, ids_fin = load_csv_data(DATA_TEST_PATH)

# Data pre-processing

In [8]:
from data_analysis import *

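Split the loaded training data into a training part (80%) and a held-out test part (20%). The held-out part (`_te` variables) is distinct from the `test.csv` data loaded above (`_fin` variables), for which the submission is generated.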

In [9]:
# Fraction of the loaded training data kept as the training part; the
# remainder becomes the held-out part (`_te`).
ratio = 0.8
# NOTE(review): the training arrays are overwritten by their own training part,
# so re-running this cell on its own splits the already reduced data again.
# Re-run from the loading cell instead.
(
    tX_tr, tX_te,
    y_tr, y_te,
    ids_tr, ids_te,
) = split_data(tX_tr, y_tr, ids_tr, ratio)

Perform the following operations:
1. Split every dataset into 4 groups according to the value of the `PRI_jet_num` feature (column 22). After this split, some columns are undefined for every event of a group: delete those columns (example: if `PRI_jet_num=0`, `DER_deltaeta_jet_jet` (column 4) is always undefined).
2. Replace undefined `DER_mass_MMC` values in column 0 by the mean of the defined values in that same column.
3. Standardize every column (subtract the mean and divide by the standard deviation).
4. Delete the training points containing outlier values for some of the features.

In [10]:
# Inputs shared by all four calls: the training part, the held-out part and
# the final (submission) set, each given as labels, features and ids.
all_sets = (y_tr, tX_tr, ids_tr, y_te, tX_te, ids_te, y_fin, tX_fin, ids_fin)

# One call per PRI_jet_num value; each call returns the nine corresponding
# values for that jet group, named with the group number (0-3).
jet_num = 0
(y0_tr, tX0_tr, ids0_tr,
 y0_te, tX0_te, ids0_te,
 y0_fin, tX0_fin, ids0_fin) = data_analysis(jet_num, *all_sets)

jet_num = 1
(y1_tr, tX1_tr, ids1_tr,
 y1_te, tX1_te, ids1_te,
 y1_fin, tX1_fin, ids1_fin) = data_analysis(jet_num, *all_sets)

jet_num = 2
(y2_tr, tX2_tr, ids2_tr,
 y2_te, tX2_te, ids2_te,
 y2_fin, tX2_fin, ids2_fin) = data_analysis(jet_num, *all_sets)

jet_num = 3
(y3_tr, tX3_tr, ids3_tr,
 y3_te, tX3_te, ids3_te,
 y3_fin, tX3_fin, ids3_fin) = data_analysis(jet_num, *all_sets)


# Polynomial feature expansion

In [12]:
from implementation import *
from optimization import * 
from proj1_helpers import *
from data_analysis_logistic import *
from cross_validation_logistic import*

For every feature, determine the optimal degree for the polynomial feature expansion by cross-validation on the training dataset.

In [13]:
# Start every feature of every jet group at polynomial degree 1.
deg0 = np.ones(tX0_tr.shape[1], dtype=np.int64)
deg1 = np.ones(tX1_tr.shape[1], dtype=np.int64)
deg2 = np.ones(tX2_tr.shape[1], dtype=np.int64)
deg3 = np.ones(tX3_tr.shape[1], dtype=np.int64)

# Tune the degrees group by group, one feature at a time. Each degree array is
# updated in place, so every call receives the degrees already selected for
# the preceding features of the same group.
for y_grp, tX_grp, deg_grp in (
    (y0_tr, tX0_tr, deg0),
    (y1_tr, tX1_tr, deg1),
    (y2_tr, tX2_tr, deg2),
    (y3_tr, tX3_tr, deg3),
):
    for feat_ind in np.arange(len(deg_grp)):
        deg_grp[feat_ind] = cross_validation_degree(y_grp, tX_grp, feat_ind, deg_grp)

Expand all the feature matrices.

In [14]:
# Expand the training, held-out and final feature matrices of each jet group
# using that group's per-feature degrees.
# NOTE(review): the results replace the input matrices under the same names, so
# re-running this cell alone would presumably expand already expanded
# matrices - re-run from the preprocessing cell instead.
tX0_tr,tX0_te,tX0_fin = build_poly_data(tX0_tr,tX0_te,tX0_fin,deg0)
tX1_tr,tX1_te,tX1_fin = build_poly_data(tX1_tr,tX1_te,tX1_fin,deg1)
tX2_tr,tX2_te,tX2_fin = build_poly_data(tX2_tr,tX2_te,tX2_fin,deg2)
tX3_tr,tX3_te,tX3_fin = build_poly_data(tX3_tr,tX3_te,tX3_fin,deg3)

# Machine learning

In [15]:
# Value passed as the last argument of `ridge_regression` for all four jet
# groups (presumably the regularization strength - TODO confirm).
# NOTE(review): how 0.15 was chosen is not shown in this notebook.
lambda_ = 0.15

In [16]:
# Fit one ridge-regression model per jet group on that group's expanded
# training matrix; each call yields the group's weights and a loss value.
w0, loss0 = ridge_regression(y0_tr, tX0_tr, lambda_)
w1, loss1 = ridge_regression(y1_tr, tX1_tr, lambda_)
w2, loss2 = ridge_regression(y2_tr, tX2_tr, lambda_)
w3, loss3 = ridge_regression(y3_tr, tX3_tr, lambda_)

# Generate predictions

Predict the labels with the four models, one for each value of `PRI_jet_num`.

In [17]:
# Apply each jet group's weights to that group's final (`_fin`) feature matrix.
y0_pred = predict_labels(w0, tX0_fin)
y1_pred = predict_labels(w1, tX1_fin)
y2_pred = predict_labels(w2, tX2_fin)
y3_pred = predict_labels(w3, tX3_fin)

Create the submission file.

In [18]:
# Stack the per-group predictions and ids in the same group order (0, 1, 2, 3)
# so that entry i of `y_pred` belongs to entry i of `ids_pred`. The rows are
# therefore in jet-group order, not in the order of the original test file.
y_pred = np.concatenate([y0_pred, y1_pred, y2_pred, y3_pred])
ids_pred = np.concatenate([ids0_fin, ids1_fin, ids2_fin, ids3_fin])

# Fail loudly instead of writing a misaligned or incomplete submission file.
assert len(y_pred) == len(ids_pred), "predictions and ids differ in length"
assert len(ids_pred) == len(ids_fin), "the jet groups do not cover the whole test set"

OUTPUT_PATH = 'data/results_ridge.csv' #name of output file for submission
create_csv_submission(ids_pred, y_pred, OUTPUT_PATH)

In [19]:
# Sanity check: number of ids passed to the submission file.
print(ids_pred.shape)

(568238,)


In [20]:
# - Our score : 0.822
# - Categorical accuracy : 0.802
# - F1 score : 0.638