In [4]:
import importlib
import implementations
importlib.reload(implementations)
from helpers import *
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Load data (sub-sampling disabled, so the complete files are read)
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(
    './data/dataset/dataset',
    sub_sample = False,
)

In [16]:
def clean_train_data(x_train, y_train, lim_nans, max_corr_features, min_corr_target, up_factor, down_factor, seed = None):
    """ Clean the training data: drop uninformative features, standardise, and rebalance the classes.

    Steps, in order:
        1. drop features whose fraction of NaNs is not strictly below lim_nans
        2. drop datapoints (rows) that still contain a NaN
        3. drop features with zero variance
        4. standardise every feature (zero mean, unit standard deviation)
        5. drop every feature whose absolute correlation with an earlier feature reaches max_corr_features
        6. drop features whose absolute correlation with the target does not exceed min_corr_target
        7. downsample the majority class (-1), upsample the minority class (1) and shuffle
        8. map the labels {-1,1} to {0,1} and prepend a column of ones (offset term)

    Args: 
        x_train: original training dataset, shape (N, D)
        y_train: label for each datapoint of x_train, with values in {-1,1}
        lim_nans: maximum fraction of nan values allowed for each feature (exclusive bound)
        max_corr_features: maximum absolute value of correlation allowed between two features (exclusive bound)
        min_corr_target: minimum absolute value of correlation allowed between a feature and the target vector (exclusive bound)
        up_factor: upsampling factor for the minority class
        down_factor: downsampling factor for the majority class
        seed: optional seed making the resampling and shuffling reproducible. 
              With None (default) numpy's global random state is used, as before.

    Returns: 
        tx_tr: cleaned training dataset (first column is the offset term)
        y_tr: label in {0,1} for each datapoint of tx_tr
        kept_features: indices (columns of x_train) of the features that were kept in tx_tr

    Raises:
        ValueError: if no datapoint or no feature survives the NaN / variance filtering
    """

    # Random source: the module-level functions share numpy's global state, a RandomState is private and reproducible
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Keep track of features that will be kept throughout the data cleaning process
    kept_features = np.arange(x_train.shape[1])

    # Remove columns with too many NaNs
    percentages = np.sum(np.isnan(x_train), axis = 0) / x_train.shape[0]
    few_nans = percentages < lim_nans
    x_tr = x_train[:, few_nans]
    kept_features = kept_features[few_nans]

    # Remove datapoints (rows) with any remaining NaN value
    complete_rows = np.sum(np.isnan(x_tr), axis = 1) == 0
    x_tr = x_tr[complete_rows, :]
    y_tr = y_train[complete_rows]  # boolean indexing returns a copy, y_train is never modified

    if x_tr.shape[0] == 0:
        raise ValueError("No datapoint left after removing rows with NaN values; increase lim_nans selectivity or impute instead")

    # Remove features with 0 variance (they don't add any information)
    has_variance = np.var(x_tr, axis = 0) != 0
    x_tr = x_tr[:, has_variance]
    kept_features = kept_features[has_variance]

    if x_tr.shape[1] == 0:
        raise ValueError("No feature left after removing NaN-heavy and zero-variance features")

    # Standardise data along axis 0
    centered_data = x_tr - np.mean(x_tr, axis = 0)
    x_tr = centered_data / np.std(centered_data, axis = 0)

    # Only keep 1 feature among highly correlated features.
    # np.corrcoef returns a scalar when a single feature is left, hence np.atleast_2d before np.triu.
    # Upper triangle with zeroed diagonal: column j only holds correlations with the features i < j,
    # so feature j is dropped as soon as it is too correlated with any earlier feature.
    corr_tri = np.triu(np.atleast_2d(np.abs(np.corrcoef(x_tr, rowvar = False))), k = 1)
    not_redundant = np.max(corr_tri, axis = 0) < max_corr_features
    x_tr = x_tr[:, not_redundant]
    kept_features = kept_features[not_redundant]

    # Remove features that have very low correlation with target value
    # (first row of the correlation matrix = correlation between y_tr and each feature of x_tr)
    target_corr = np.abs(np.corrcoef(y_tr, x_tr, rowvar = False))[0, 1:]
    informative = target_corr > min_corr_target
    x_tr = x_tr[:, informative]
    kept_features = kept_features[informative]

    # Oversample minority class and undersample majority class
    maj_idx = np.where(y_tr == -1)[0]
    min_idx = np.where(y_tr == 1)[0]

    idx_under = rng.choice(maj_idx, size = int(maj_idx.shape[0] / down_factor), replace = False)
    idx_over = rng.choice(min_idx, size = int(min_idx.shape[0] * up_factor), replace = True)
    idx_shuffled = rng.permutation(np.concatenate([idx_under, idx_over]))

    x_tr = x_tr[idx_shuffled]
    y_tr = y_tr[idx_shuffled]

    # Make target variable take values in {0,1} instead of {-1,1}, that is map {-1,1} to {0,1}
    y_tr[y_tr == -1] = 0

    # Add offset term to x_tr
    tx_tr = np.c_[np.ones(x_tr.shape[0]), x_tr]

    return tx_tr, y_tr, kept_features


In [17]:
def prepare_test_data(x_test, kept_features):
    """ Prepare testing data: select features, impute missing values, standardise, add offset term.

    Args:
        x_test: original testing dataset, shape (N, D)
        kept_features: list of all features that were kept in the process of cleaning the training data
                       (column indices into x_test)

    Return:
        tx_te: testing dataset prepared for applying model on it, shape (N, len(kept_features) + 1);
               the first column is the offset term. All values are finite.
    """
    
    # Keep only the features that were kept for the training data
    x_te = x_test[:, kept_features]

    # Replace nan values with median value for the corresponding feature.
    # A feature that is entirely NaN has no median (np.nanmedian returns NaN): fall back to 0 for it.
    medians = np.nanmedian(x_te, axis = 0)
    medians = np.where(np.isnan(medians), 0.0, medians)
    # np.where builds a new array, so x_test is never modified; the trailing np.nan_to_num
    # keeps the original handling of +/-inf entries (mapped to very large finite numbers).
    x_te = np.nan_to_num(np.where(np.isnan(x_te), medians, x_te))

    # Standardise data along axis 0
    centered_data = x_te - np.mean(x_te, axis = 0)
    std = np.std(centered_data, axis = 0)
    # A feature that is constant in the test set has std 0; dividing would give NaN and
    # poison every prediction. Its centered values are already 0, so divide by 1 instead.
    std[std == 0] = 1.0
    x_te = centered_data / std
    
    # Add offset term 
    tx_te = np.c_[np.ones(x_te.shape[0]), x_te]

    return tx_te

In [18]:
# Clean data (thresholds and resampling factors spelled out by name)
tx_tr, y_tr, kept_features = clean_train_data(
    x_train,
    y_train,
    lim_nans = 0.2,
    max_corr_features = 0.95,
    min_corr_target = 0.05,
    up_factor = 4.7,
    down_factor = 2,
)

In [36]:
# Train model with training data (regularized logistic regression), starting from an all-ones weight vector
w, loss = implementations.reg_logistic_regression(
    y_tr,
    tx_tr,
    lambda_ = 0.05,
    initial_w = np.ones(tx_tr.shape[1]),
    max_iters = 510,
    gamma = 0.5,
)

In [10]:
# Apply model to test data
tx_te = prepare_test_data(x_test, kept_features)

# Strictly positive linear score -> label 1, anything else -> label -1
pred = np.where(tx_te @ w > 0, 1, -1)
create_csv_submission(test_ids, pred, 'test2')

(109379, 53)

In [None]:
# TO DO

# Remove outliers (a value such as 9 typically encodes unknown/missing information => remove datapoints containing this kind of value)
# Replace NaNs by median values instead of removing datapoints in the clean_train_data function
# Modify reg_logistic_regression so that it stops if the error becomes stable before max_iters is reached

In [None]:
# Scratch experiment, disabled by wrapping it in a string literal (nothing below is executed;
# if the cell is run, the string itself is echoed as the cell output).
# The wrapped code tiles the per-feature maximum of x_tr to the shape of x_tr and counts the
# datapoints in which at most 5 features are equal to their feature-wise maximum.
# NOTE(review): x_tr is only a local variable of clean_train_data in the cells shown, so this
# would need a notebook-level x_tr to run. Presumably related to the outlier TODO - confirm.
"""max_values = np.tile(np.max(x_tr, axis = 0), (x_tr.shape[0], 1))
#np.sum(x_tr[0,:] == np.max(x_tr, axis = 0)) == 0
np.sum(np.sum(x_tr == max_values, axis = 1) <= 5)"""