In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
import os
# Build the path to the training CSV: <parent directory>/data/train.csv.
data_base_path = os.path.join(os.pardir, 'data')
# NOTE: despite its name, data_folder holds a file name, not a directory.
data_folder = 'train.csv'
data_path = os.path.join(data_base_path, data_folder)
# load_csv_data is presumably supplied by the star import from proj1_helpers;
# its three results are bound to the labels, the feature matrix and the ids.
y, tX, ids = load_csv_data(data_path)

### Simple data description (to be revised)
- all variables are floating point, except PRI_jet_num which is integer
- variables prefixed with PRI (for PRImitives) are “raw” quantities about the bunch collision as measured by the detector.
- variables prefixed with DER (for DERived) are quantities computed from the primitive features, which were selected by the physicists of ATLAS.
- it can happen that for some entries some variables are meaningless or cannot be computed; in this case, their value is −999.0, which is outside the normal range of all variables.

In [3]:
# Report the dimensions of the labels, the feature matrix and the ids.
print('y shape = ', y.shape, '\ntX shape =', tX.shape, '\nids shape = ', ids.shape, sep='')

y shape = (250000,)
tX shape =(250000, 30)
ids shape = (250000,)


TODO: Data processing:
- Something that evaluates the missing values (NA)
- A function that drops or keeps a feature based on a threshold
- Replace the missing values (NA) with the mean

### Data pre-processing

We will first transform the categorical variable (PRI_jet_num) into four dummy variables

In [20]:
def separate_factor(x, nlevels=4, column_idx = 22):
    """Split a categorical column out of x and one-hot encode it.

    Parameters
    ----------
    x : np.ndarray, shape (n_samples, n_features)
        Feature matrix that contains the categorical column.
    nlevels : int
        Number of levels; the column is compared against 0 .. nlevels-1.
    column_idx : int
        Index of the categorical column in x.

    Returns
    -------
    x : np.ndarray, shape (n_samples, n_features - 1)
        Copy of the input without the categorical column.
    new_var : np.ndarray of float, shape (n_samples, nlevels)
        Indicator matrix: new_var[i, j] is 1 when x[i, column_idx] == j
        and 0 otherwise. A row whose value matches no level stays all-zero.
    """
    # Keep the column 2-D so that it broadcasts against all levels at once,
    # giving one indicator row per sample.
    levels = np.arange(nlevels)
    new_var = (x[:, [column_idx]] == levels).astype(float)

    x = np.delete(x, column_idx, axis = 1)

    return x, new_var

def missingness_filter(cX, cutoff = 0.5):
    """Drop the columns of cX whose share of missing entries exceeds cutoff.

    Entries equal to -999 count as missing and are turned into NaN first,
    so the columns that are kept carry NaN (not -999) in the result.

    Returns a tuple (filtered, to_remove): the matrix without the dropped
    columns, and the indices (in the input matrix) of the dropped columns.
    """
    with_nan = np.where(cX == -999, np.nan, cX)

    n_rows = with_nan.shape[0]
    nan_share = np.isnan(with_nan).sum(axis = 0) / n_rows

    to_remove = np.where(nan_share > cutoff)[0]
    filtered = np.delete(with_nan, to_remove, axis = 1)

    return filtered, to_remove


In [14]:
# Split off the categorical column, then drop sparsely populated features.
cX, fac_X = separate_factor(tX)
print(cX.shape, fac_X.shape)

# missingness_filter returns (filtered matrix, indices of the removed columns),
# so the pair has to be unpacked before asking for the matrix shape.
filt_cX, removed_cols = missingness_filter(cX, 0.6)
print(filt_cX.shape)

(250000, 29) (250000, 4)
(250000, 22)


In [5]:
def build_poly(x, degree):
    """Polynomial basis functions for input data x, for j=0 up to j=degree.

    The result starts with one column of ones (j=0), followed by the
    element-wise powers x**1, ..., x**degree placed side by side.
    """
    blocks = [np.ones((len(x), 1))]
    blocks.extend(np.power(x, deg) for deg in range(1, degree+1))
    return np.column_stack(blocks)

In [None]:
# Small 3x4 integer matrix holding 1..12, used as a toy input.
arr = np.arange(1, 13).reshape(3, 4)
print(arr, arr.shape)

In [None]:
# Sanity check: the degree-2 expansion of the toy matrix is shown as the cell output.
build_poly(arr,2)

In [6]:
def build_lin_combination(x):
    """Build the products of every pair of distinct columns of x.

    For each pair of column indices i < j, the element-wise product
    x[:, i] * x[:, j] becomes one output column. Columns are ordered by i
    first and then by j: (0, 1), (0, 2), ..., (1, 2), ...

    Parameters
    ----------
    x : np.ndarray, shape (n_samples, n_features)

    Returns
    -------
    np.ndarray, shape (n_samples, n_features * (n_features - 1) / 2)
        Floating-point matrix of pairwise products; it has zero columns
        when x has fewer than two columns.
    """
    n_rows, n_cols = x.shape
    n_pairs = n_cols * (n_cols - 1) // 2

    # Allocate the result once. Growing it with np.concatenate for every pair
    # re-copies the whole matrix each time, which is quadratic in the number
    # of pairs. The dtype is promoted to at least float64.
    comb = np.empty((n_rows, n_pairs), dtype=np.result_type(x.dtype, np.float64))

    start = 0
    for i in range(n_cols - 1):
        # Column i times every column to its right, written in one slice.
        width = n_cols - i - 1
        comb[:, start:start + width] = x[:, i:i + 1] * x[:, i + 1:]
        start += width

    return comb

In [None]:
# Pairwise column products of cX; only the resulting shape is printed.
# NOTE(review): cX as returned by separate_factor has been neither filtered nor
# imputed, so any -999 placeholders would enter the products — confirm intent.
lin_comb = build_lin_combination(cX)
print(lin_comb.shape)

In [None]:
# Toy 3x4 matrix with NaN entries, presumably for trying out the imputation helpers.
# Note that this rebinds the name `arr`.
arr = np.array([[1,2,np.nan,4], [5,np.nan,7,8], [np.nan,10,11,np.nan]])
print(arr, arr.shape)

In [7]:
def impute_mean(x):
    """Fill the NaN entries of every column of x with that column's mean.

    The mean is taken over the non-NaN entries only (np.nanmean). Filling is
    done with np.nan_to_num, which also replaces infinite entries by its
    default large finite values. Returns a new float array with the shape
    of x; x itself is left untouched.
    """
    n_features = x.shape[1]
    filled = np.zeros(x.shape)

    for col in range(n_features):
        column = x[:, col]
        filled[:, col] = np.nan_to_num(column, nan = np.nanmean(column))

    return filled

def impute_median(x):
    """Fill the NaN entries of every column of x with that column's median.

    The median ignores NaN entries (np.nanmedian). Filling is done with
    np.nan_to_num, which also replaces infinite entries by its default large
    finite values. Returns a new float array with the shape of x; x itself
    is left untouched.
    """
    result = np.zeros(x.shape)
    col = 0
    while col < x.shape[1]:
        values = x[:, col]
        fill_value = np.nanmedian(values)
        result[:, col] = np.nan_to_num(values, nan = fill_value)
        col += 1

    return result

def impute_gaussian(x, rng = None):
    """Fill the NaN entries of every column with draws from a normal distribution.

    For each column, the mean and standard deviation of the non-NaN entries
    parameterize a normal distribution, and every NaN in that column is
    replaced by an independent draw from it. One draw is made per missing
    entry, and columns without missing entries are copied as they are.

    Parameters
    ----------
    x : np.ndarray, shape (n_samples, n_features)
        Matrix with missing values encoded as NaN. It is not modified.
    rng : optional
        Object providing normal(loc, scale, size), e.g. the result of
        np.random.default_rng(seed), for reproducible imputations. When
        omitted, the global np.random state is used.

    Returns
    -------
    np.ndarray of float, same shape as x.
    """
    sampler = np.random if rng is None else rng

    # Work on a float copy so the caller's array is left untouched.
    out = np.array(x, dtype = float)

    for i in range(out.shape[1]):
        missing = np.isnan(out[:, i])
        n_missing = np.count_nonzero(missing)
        if n_missing == 0:
            continue

        # Statistics come from the caller's column, i.e. observed entries only.
        mean = np.nanmean(x[:, i])
        std = np.nanstd(x[:, i])

        out[missing, i] = sampler.normal(loc = mean, scale = std, size = n_missing)

    return out

In [28]:
def train_data_formatting(tX, degree = 2, cutoff = 0.6, imputation = impute_mean, lin_comb = False):
    """Turn the raw feature matrix tX into the feature matrix used for training.

    Steps, in order:
      1. separate_factor splits the categorical column off into indicator columns.
      2. missingness_filter drops the continuous columns whose share of
         missing entries exceeds cutoff.
      3. imputation (a callable taking and returning a matrix) fills the
         remaining missing entries.
      4. build_poly expands the continuous columns up to the given degree.
      5. The indicator columns are appended and, when lin_comb is true, so is
         the output of build_lin_combination on the imputed continuous columns.

    Returns a tuple (features, to_remove). to_remove holds the column indices
    reported by missingness_filter; they refer to the matrix after the
    categorical column has been taken out.
    """
    continuous, categorical = separate_factor(tX)
    continuous, to_remove = missingness_filter(continuous, cutoff)
    continuous = imputation(continuous)

    # Collect the column groups in output order and join them once.
    pieces = [build_poly(continuous, degree), categorical]
    if lin_comb:
        pieces.append(build_lin_combination(continuous))

    return np.concatenate(pieces, axis=1), to_remove

--------------

In [None]:
# Replace the meaningless -999 placeholders with the median of their column.
# They are first turned into NaN so that nanmedian ignores them; this rebinds
# tX to a new array.
tX = np.where(tX==-999., np.nan,tX)
med_X = np.nanmedian(tX, axis=0)

# inds is a (row indices, column indices) pair locating every NaN; each one is
# overwritten in place with the median of its own column.
inds = np.where(np.isnan(tX))
tX[inds] = np.take(med_X, inds[1])

In [None]:
# Debugging aid: prints the positions in the first row of tX whose value equals 2.
# NOTE(review): presumably used to locate the integer PRI_jet_num column — confirm or remove.
print(np.where(tX[0] == 2))

In [None]:
# Remove the categorical column (index 22) and standardize the remaining features.
ntX = np.delete(tX, 22, axis=1)
# axis=0 hands each column (one feature across all samples) to standardize.
# axis=1 would instead rescale every sample across its unrelated features.
# NOTE(review): assumes standardize maps a 1-D array to a 1-D array of the
# same length — confirm against its definition.
ntX = np.apply_along_axis(standardize, 0, ntX)

## Do your crazy machine learning thing here :) ...

## Generate predictions and save output in csv format for submission:

In [None]:
DATA_TEST_PATH = '' # TODO: download test data and supply path here
# The first value returned by load_csv_data is discarded; only the features
# and the ids of the test set are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not assigned anywhere in the visible notebook; it
# has to be produced by the model-fitting step before this cell can run.
# NOTE(review): tX_test is used exactly as loaded — confirm whether it needs the
# same formatting as the training features before prediction.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)