In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [6]:
import pandas as pd

# DATA_TRAIN_PATH is otherwise first assigned in a later cell, so on a fresh
# kernel (Restart & Run All) this cell failed with NameError. Define it here
# as well; keep the value in sync with the later assignment.
DATA_TRAIN_PATH = "../train.csv"

# Quick look at the raw training table; the last expression is rendered by the notebook.
train_DF = pd.read_csv(DATA_TRAIN_PATH)
train_DF.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000,s,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,...,-0.277,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,100001,b,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,...,-1.916,164.546,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226
2,100002,b,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,...,-2.186,260.414,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251
3,100003,b,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,...,0.06,86.062,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
4,100004,b,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,...,-0.871,53.131,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0


In [5]:
# NOTE(review): the star import hides which names come from helpers. The cells
# visible here call load_csv_data, predict_labels and create_csv_submission,
# which are presumably provided by it -- prefer importing them explicitly.
from helpers import *
DATA_TRAIN_PATH = "../train.csv" # TODO: download the training data and supply its path here
# Load the training set as (class labels, feature matrix, event ids), in the
# order returned by load_csv_data.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [7]:
# Read only the header row to obtain the column names. strip() removes the
# trailing newline, which would otherwise stay attached to the last column name
# (and make lookups of that name fail).
with open(DATA_TRAIN_PATH, "r") as f:
    title_list = f.readline().strip().split(",")

# Position of the categorical jet-count column within the CSV.
jet_num_col = title_list.index("PRI_jet_num")

# Load just that column as strings, skipping the header row.
category_tx = np.genfromtxt(
    DATA_TRAIN_PATH, dtype=str, delimiter=",", skip_header=1, usecols=[jet_num_col])
category_tx


array(['2', '1', '1', ..., '1', '0', '0'], dtype='<U1')

In [13]:
class Dataset:
    """Load one CSV split and preprocess its feature matrix.

    The constructor only reads the header row of ``{data_pth}/{data_type}.csv``.
    Call ``load_data`` to read the rows and run imputation, normalization and
    outlier filtering, after which ``data``, ``labels`` and ``ids`` hold the
    row-aligned results.
    """

    # Placeholder value that data_imputation treats as "missing".
    MISSING_VALUE = -999.0

    def __init__(self, data_pth, data_type):
        """Remember the CSV location and read its column names.

        Args:
            data_pth: directory containing the CSV file.
            data_type: file stem of the split, e.g. "train" -> "train.csv".
        """
        self.data_pth = f"{data_pth}/{data_type}.csv"
        self.data_type = data_type

        # Filled in by load_data(); empty until then.
        self.data = []
        self.labels = []
        self.ids = []

        self.col_names = self.read_col_names()
        self.num_cols = len(self.col_names)

    def load_data(self):
        """Load the data from the csv file and preprocess it.

        Steps, in order: impute missing values, standardize every column, then
        drop outlier rows. The normalization statistics are therefore computed
        before the outlier rows are removed.
        """
        # NOTE(review): outlier rows are dropped whatever data_type is; confirm
        # that this is intended when the split is not the training set.
        y, tX, ids = load_csv_data(self.data_pth)
        self.ids = ids
        self.labels = y
        self.data = tX

        self.data_imputation()
        self.data_normalization()
        self.filter_outliers()

    def read_col_names(self):
        """Read the column names from the header row of the csv file."""
        with open(self.data_pth, "r") as f:
            col_names = f.readline().strip().split(",")
        return col_names

    def data_imputation(self, method="median"):
        """Impute the missing values in the data, column by column, in place.

        An entry is missing when it equals ``MISSING_VALUE`` (-999.0) or is NaN.

        Args:
            method: "mean" or "median" fills with that statistic of the
                column's valid entries; any other value fills with 0.0.

        A column with no valid entry at all is filled with 0.0, because no
        statistic can be computed for it (previously it ended up all-NaN).
        """
        for col in range(self.data.shape[1]):
            # Basic slicing returns a view, so assignments write into self.data.
            col_data = self.data[:, col]
            missing = (col_data == self.MISSING_VALUE) | np.isnan(col_data)
            if not missing.any():
                continue

            valid = col_data[~missing]
            if valid.size == 0:
                fill_value = 0.0
            elif method == "mean":
                fill_value = np.mean(valid)
            elif method == "median":
                fill_value = np.median(valid)
            else:
                fill_value = 0.0
            col_data[missing] = fill_value

    def data_normalization(self):
        """Normalize the data, zero-mean and standardization.

        Constant columns (standard deviation 0) are only centered: dividing by
        zero would turn them into NaN, which in turn makes filter_outliers
        discard every row.
        """
        centered = self.data - np.mean(self.data, axis=0)
        std_data = np.std(centered, axis=0)
        self.data = centered / np.where(std_data == 0, 1.0, std_data)

    def category_feature(self):
        """Create new features based on the category feature."""
        # NOTE(review): unfinished -- the one-hot table and the column index are
        # computed but nothing is stored or returned yet.
        jet_num_one_hot = {
            "0": [0.0, 0.0, 0.0, 1.0],
            "1": [0.0, 0.0, 1.0, 0.0],
            "2": [0.0, 1.0, 0.0, 0.0],
            "3": [1.0, 0.0, 0.0, 0.0]
        }

        jet_num_col = self.col_names.index("PRI_jet_num")

    def filter_outliers(self, m=10):
        """
        Filter out rows lying m standard deviations or more from the column mean.

        Columns are processed one after another, so the mean and standard
        deviation of each column are computed on the rows that survived the
        previous columns. ``data``, ``labels`` and ``ids`` are filtered with the
        same mask and stay row-aligned.

        Args:
            m: width of the accepted band, in standard deviations.
        """
        for i in range(self.data.shape[1]):
            column = self.data[:, i]
            spread = np.std(column)
            if spread == 0:
                # Constant column: it has no outliers, and the strict comparison
                # below (0 < 0) would otherwise throw away every row.
                continue

            keep = np.abs(column - np.mean(column)) < m * spread
            self.data = self.data[keep]
            self.labels = self.labels[keep]
            self.ids = self.ids[keep]

            assert self.labels.shape[0] == self.data.shape[0]
            assert self.ids.shape[0] == self.data.shape[0]


In [14]:
# Point the dataset at train.csv under ../ ; the constructor only reads the header row.
train_dataset = Dataset("../", "train")
# Read the rows, then impute missing values, normalize and drop outlier rows.
train_dataset.load_data()

In [15]:
# Display the dataset object.
# NOTE(review): the saved output below shows a bound method, which does not
# match this expression -- re-run the cell to refresh it.
train_dataset

<bound method Dataset.category_feature of <__main__.Dataset object at 0x299bbf880>>

In [45]:
# Build the list of model column names: bias term, float features, then the
# one-hot jet-count columns.
# All names come from train_dataset.col_names (newline-stripped), rather than
# from a header list read in another cell.
all_col_names = train_dataset.col_names

# Columns that are not plain float features: event id, label, categorical jet count.
id_col = all_col_names.index("Id")
label_col = all_col_names.index("Prediction")
jet_num_col = all_col_names.index("PRI_jet_num")
special_col = [id_col, label_col, jet_num_col]

float_col_ids = [idx for idx in range(len(all_col_names)) if idx not in special_col]
float_col_names = [all_col_names[idx] for idx in float_col_ids]
category_col_names = ['jetnum3', 'jetnum2', 'jetnum1', 'jetnum0']

# Assembled once, after the feature lists are complete (it used to be rebuilt
# on every loop iteration and stayed undefined when no column qualified).
full_col_names = ['x_bias'] + float_col_names + category_col_names
len(full_col_names)

34

### Normalize Data

In [16]:
def normalize(x):
    """Standardize the dataset x column-wise.

    Each column is shifted to zero mean and scaled to unit standard deviation.
    Constant columns (standard deviation 0) are only centered, because dividing
    by zero would turn them into NaN.

    Args:
        x: numeric array; statistics are taken along axis 0.

    Returns:
        A new array of the same shape as x; x itself is not modified.
    """
    mean_x = np.mean(x, axis=0)
    std_x = np.std(x, axis=0)
    safe_std = np.where(std_x == 0, 1.0, std_x)
    return (x - mean_x) / safe_std

# Standardize the training feature matrix loaded earlier into tX.
# NOTE(review): tX is the unimputed matrix; the -999.0 placeholders visible in
# the head() output presumably still skew these statistics -- confirm.
tX_norm = normalize(tX)

## Do your crazy machine learning thing here :) ...

## Generate predictions and save output in csv format for submission:

In [4]:
# NOTE(review): the training file is read from "../train.csv" but the test file
# from "../data/test.csv" -- confirm that both locations are correct.
DATA_TEST_PATH = "../data/test.csv" # TODO: download the test data and supply its path here
# The first returned value (labels) is discarded; only the feature matrix and
# the event ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not assigned in any cell visible here; it has to
# come from the model-fitting step before this cell can run.
y_pred = predict_labels(weights, tX_test)
# Write the (event id, predicted label) pairs to OUTPUT_PATH for submission.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)