In [176]:
import importlib
import implementations
importlib.reload(implementations)
from helpers import load_csv_data
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the full (non-subsampled) dataset: feature matrices, training labels and sample ids
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(
    './data/dataset/dataset',
    sub_sample=False,
)

In [177]:
def clean_data(x_train, y_train, lim_nans, lim_corr_features, lim_corr_target):
    """Filter, standardise and augment a training set.

    The steps are applied in this order:
      1. drop the columns whose fraction of NaN entries is >= ``lim_nans``;
      2. drop the rows that still contain at least one NaN;
      3. drop the columns whose variance is exactly 0;
      4. drop every column whose absolute correlation with any *earlier*
         remaining column is >= ``lim_corr_features``;
      5. drop the columns whose absolute correlation with the target is
         <= ``lim_corr_target``;
      6. standardise every remaining column (zero mean, unit standard deviation);
      7. relabel the targets equal to -1 as 0;
      8. prepend a column of ones (offset term).

    Parameters
    ----------
    x_train : 2-D array, one row per datapoint and one column per feature.
    y_train : targets, indexed along its first axis like the rows of ``x_train``.
    lim_nans : maximum (exclusive) fraction of NaNs allowed in a column.
    lim_corr_features : maximum (exclusive) absolute feature/feature correlation.
    lim_corr_target : minimum (exclusive) absolute feature/target correlation.

    Returns
    -------
    x_tr : cleaned, standardised features with the offset column first.
    y_tr : targets of the kept rows (a copy; ``y_train`` is left untouched).
    kept_features : indices, in ``x_train``, of the surviving columns
        (the offset column is not part of this index list).
    """
    # Original column indices, filtered in lock-step with the feature matrix
    col_ids = np.arange(x_train.shape[1])

    # 1. Columns with too many NaNs
    nan_fraction = np.isnan(x_train).sum(axis=0) / x_train.shape[0]
    few_nans = nan_fraction < lim_nans
    features = x_train[:, few_nans]
    col_ids = col_ids[few_nans]

    # 2. Rows with at least one remaining NaN (boolean indexing copies the targets)
    complete_rows = ~np.isnan(features).any(axis=1)
    features = features[complete_rows]
    targets = y_train[complete_rows]

    # 3. Constant columns
    varying = features.var(axis=0) != 0
    features = features[:, varying]
    col_ids = col_ids[varying]

    # 4. Redundant columns: in the strictly upper-triangular |correlation| matrix,
    #    column j only holds the correlations with the columns i < j
    feature_corr = np.abs(np.corrcoef(features, rowvar=False))
    strongest_earlier = np.triu(feature_corr, k=1).max(axis=0)
    not_redundant = strongest_earlier < lim_corr_features
    features = features[:, not_redundant]
    col_ids = col_ids[not_redundant]

    # 5. Columns barely correlated with the target: with the target stacked as
    #    the first variable, row 0 of the matrix holds target/feature correlations
    stacked = np.column_stack((targets, features))
    target_corr = np.abs(np.corrcoef(stacked, rowvar=False))[0, 1:]
    informative = target_corr > lim_corr_target
    features = features[:, informative]
    col_ids = col_ids[informative]

    # 6. Standardise along axis 0
    centred = features - features.mean(axis=0)
    features = centred / centred.std(axis=0)

    # 7. Map the targets from {-1, 1} to {0, 1}
    targets[targets == -1] = 0

    # 8. Offset term
    features = np.column_stack((np.ones(features.shape[0]), features))

    return features, targets, col_ids


In [None]:
# Remove outliers (a value of 9 typically encodes unknown/missing information => remove datapoints containing such values)
# Standardize features?
# Are the classes balanced? If not, rebalance them

In [178]:
# Clean the training set; kept_features holds the surviving column indices of x_train
tx_tr, y_tr, kept_features = clean_data(
    x_train,
    y_train,
    lim_nans=0.2,
    lim_corr_features=0.95,
    lim_corr_target=0.05,
)

In [186]:
# Shape of the test matrix restricted to the columns kept by clean_data
np.shape(x_test[:, kept_features])

(109379, 52)

In [187]:
# Fit the regularised logistic regression, starting from an all-ones weight vector
w, loss = implementations.reg_logistic_regression(
    y_tr,
    tx_tr,
    lambda_=0.05,
    initial_w=np.ones(tx_tr.shape[1]),
    max_iters=510,
    gamma=0.5,
)

In [335]:
# Drop every column whose fraction of NaN entries reaches lim_col (a fraction in [0, 1])
lim_col = 0.2
percentages = np.isnan(x_train).sum(axis=0) / x_train.shape[0]
x_tr = x_train[:, percentages < lim_col]

In [336]:
# Keep only the rows without any NaN, and select the matching targets
keep_idxs = ~np.isnan(x_tr).any(axis=1)
x_tr = x_tr[keep_idxs]
y_tr = y_train[keep_idxs]

In [337]:
# Drop the constant columns, i.e. the features whose variance is exactly 0
var = x_tr.var(axis=0)
x_tr = x_tr[:, var != 0]

In [338]:
# Drop each feature whose absolute correlation with any earlier column reaches lim_corr
lim_corr = 0.95
# Strictly upper-triangular |correlation| matrix (diagonal zeroed as well):
# column j only holds the correlations with the columns i < j
corr_tri = np.triu(np.abs(np.corrcoef(x_tr, rowvar=False)), k=1)
max_corr = corr_tri.max(axis=0)
x_tr = x_tr[:, max_corr < lim_corr]  # 118 features


In [339]:
# Drop the features whose absolute correlation with the target does not exceed lim_corr
lim_corr = 0.05
# The target is stacked as the first variable, so row 0 of the matrix
# holds the target/feature correlations
corr_mat = np.abs(np.corrcoef(np.column_stack((y_tr, x_tr)), rowvar=False))
x_tr = x_tr[:, corr_mat[0, 1:] > lim_corr]  # 52 features

In [None]:
# Column-wise maximum of every remaining feature
x_tr.max(axis=0)

In [None]:
# Broadcast the column-wise maxima to the shape of x_tr
max_values = np.tile(x_tr.max(axis=0), (x_tr.shape[0], 1))
# Number of rows in which at most 5 features sit at their column maximum
((x_tr == max_values).sum(axis=1) <= 5).sum()