In [31]:
from helpers import load_csv_data
import os
import numpy as np
import matplotlib.pyplot as plt

In [76]:
# Load the dataset with sub-sampling disabled
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(
    './data/dataset/dataset',
    sub_sample = False,
)

In [344]:
def clean_data(x_train, y_train, lim_nans, lim_corr_features, lim_corr_target, return_columns = False):
    """Filter the columns and rows of a feature matrix in five successive steps.

    Steps, applied in this order:
      1. drop columns whose fraction of NaNs is lim_nans or higher,
      2. drop rows that still contain at least one NaN,
      3. drop columns with zero variance,
      4. drop every column whose absolute correlation with any column to its
         left is lim_corr_features or higher,
      5. drop columns whose absolute correlation with the target is
         lim_corr_target or lower.

    Args:
        x_train: 2-D array of features, one row per datapoint.
        y_train: 1-D array of targets, indexed by the same rows as x_train.
        lim_nans: NaN fraction (between 0 and 1) at which a column is dropped.
        lim_corr_features: absolute feature-feature correlation at which the
            later column of a pair is dropped.
        lim_corr_target: absolute feature-target correlation a column must
            exceed to be kept.
        return_columns: when True, also return the indices (into the columns
            of x_train) of the features that were kept, so that the same
            selection can be applied to another matrix such as a test set.

    Returns:
        (x_tr, y_tr), or (x_tr, y_tr, kept_cols) when return_columns is True.
    """
    # Index of the original column each surviving column comes from
    kept_cols = np.arange(x_train.shape[1])

    # 1. Remove columns with a NaN fraction of lim_nans or more
    nan_fraction = np.sum(np.isnan(x_train), axis = 0) / x_train.shape[0]
    col_mask = nan_fraction < lim_nans
    x_tr = x_train[:, col_mask]
    kept_cols = kept_cols[col_mask]

    # 2. Remove datapoints (rows) with one or multiple NaNs
    keep_idxs = ~np.isnan(x_tr).any(axis = 1)
    x_tr = x_tr[keep_idxs, :]
    y_tr = y_train[keep_idxs]

    # 3. Remove features with 0 variance (their correlation is undefined)
    col_mask = np.var(x_tr, axis = 0) != 0
    x_tr = x_tr[:, col_mask]
    kept_cols = kept_cols[col_mask]

    # 4. Only keep 1 feature among highly correlated features.
    # np.corrcoef returns a 0-d value when a single feature is left, so the
    # result is forced to 2-D before the upper triangle is taken.
    abs_corr = np.atleast_2d(np.abs(np.corrcoef(x_tr, rowvar = False)))
    corr_tri = np.triu(abs_corr, k = 1) # strict upper triangle: diagonal zeroed as well
    # Column j holds its correlations with columns 0..j-1 only
    col_mask = np.max(corr_tri, axis = 0) < lim_corr_features
    x_tr = x_tr[:, col_mask]
    kept_cols = kept_cols[col_mask]

    # 5. Remove features that have very low correlation with the target.
    # The target is stacked as column 0, so row 0 of the correlation matrix
    # (minus its first entry) holds the feature-target correlations.
    corr_mat = np.abs(np.corrcoef(np.c_[y_tr, x_tr], rowvar = False))
    col_mask = corr_mat[0, 1:] > lim_corr_target
    x_tr = x_tr[:, col_mask]
    kept_cols = kept_cols[col_mask]

    if return_columns:
        return x_tr, y_tr, kept_cols
    return x_tr, y_tr


In [None]:
# TODO: Map the target variable from {-1, 1} to {0, 1}
# TODO: Remove outliers (9 typically encodes unknown/missing information => remove datapoints containing this kind of value)
# TODO: Standardize features?
# TODO: Are the classes balanced? If not, rebalance them

In [388]:
# Run the full cleaning pipeline with the thresholds named explicitly
x_tr, y_tr = clean_data(
    x_train,
    y_train,
    lim_nans = 0.2,
    lim_corr_features = 0.95,
    lim_corr_target = 0.05,
)

In [390]:
# (number of datapoints, number of features) left after cleaning
np.shape(x_tr)

(223447, 52)

In [335]:
# Drop every column whose fraction of NaNs is lim_col or higher
lim_col = 0.2
percentages = np.isnan(x_train).sum(axis = 0) / len(x_train)
x_tr = x_train[:, np.flatnonzero(percentages < lim_col)]

In [336]:
# Keep only the datapoints (rows) that contain no NaN at all,
# and select the matching entries of the target vector
keep_idxs = ~np.isnan(x_tr).any(axis = 1)
x_tr = x_tr[keep_idxs]
y_tr = y_train[keep_idxs]

In [337]:
# Discard constant features, i.e. columns whose variance is exactly 0
var = x_tr.var(axis = 0)
x_tr = x_tr[:, np.nonzero(var)[0]]

In [338]:
# Among highly correlated features keep a single one: a column is dropped when
# its absolute correlation with any column to its left reaches lim_corr
lim_corr = 0.95
# Strict upper triangle of the correlation matrix (diagonal zeroed as well)
corr_tri = np.abs(np.triu(np.corrcoef(x_tr, rowvar = False), k = 1))
max_corr = corr_tri.max(axis = 0)
x_tr = x_tr[:, max_corr < lim_corr] # 118 features in the recorded run


In [339]:
# Drop features whose absolute correlation with the target is lim_corr or lower
lim_corr = 0.05
# The target is stacked as column 0, so row 0 of the matrix (without its first
# entry) holds the correlation of each feature with the target vector y
corr_mat = np.abs(np.corrcoef(np.column_stack((y_tr, x_tr)), rowvar = False))
x_tr = x_tr[:, corr_mat[0][1:] > lim_corr] # 52 features in the recorded run

In [355]:
# Largest value found in each remaining feature
x_tr.max(axis = 0)

array([  9.  ,  99.  ,   9.  ,   9.  ,   9.  ,   9.  ,   9.  ,   9.  ,
         9.  ,   2.  ,   9.  ,   9.  ,   2.  ,   9.  ,   9.  ,   9.  ,
        99.  ,   9.  ,   9.  ,   9.  ,   9.  ,   9.  ,   9.  ,   9.  ,
         9.  ,   9.  , 999.  , 555.  ,   9.  , 888.  ,   9.  ,   9.  ,
        23.  ,   9.  ,   9.  ,   9.  ,   9.  ,  14.  , 289.85,  97.65,
         4.  ,   2.  ,   9.  ,   9.  ,   9.  ,   9.  ,  99.  ,   9.  ,
         9.  ,   9.  ,   9.  ,   9.  ])

In [394]:
# Per-column maxima repeated once per row, so they line up element-wise with x_tr
max_values = np.tile(x_tr.max(axis = 0), (len(x_tr), 1))
# Number of rows in which at most 5 features are equal to their column maximum
(np.sum(x_tr == max_values, axis = 1) <= 5).sum()

208327