In [1]:
from helpers import *
import numpy as np
import pandas as pd
from implementations import *

import matplotlib.pyplot as plt


In [4]:
import os
# Display the current working directory; the relative dataset path used when
# loading the data is resolved against it.
os.getcwd()

'/home/imd/docs/univ/epfl/courses/ml/ML_project1/grading_tests'

In [None]:
# Dataset directory, given relative to the notebook's working directory.
data_path = '../data/dataset/dataset_to_release'
# load_csv_data is not defined in this notebook (presumably it comes from the
# `helpers` star-import — confirm). Its five return values are unpacked as:
# raw train features, raw test features, train labels, train ids, test ids.
x_train_preclean, x_test_preclean, y_train, train_ids, test_ids = load_csv_data(data_path)

# Getting an idea of the data

In [None]:
# Shapes of the raw (not yet cleaned) train and test feature matrices.
print("X train", x_train_preclean.shape)
print("X test", x_test_preclean.shape)

In [None]:
## Compute the fraction of non-missing (non-NaN) values in each column
def percentageFilled(data):
    """Return the share of entries in `data` that are not NaN.

    Despite the name, the result is a fraction between 0 and 1, not a value
    between 0 and 100. The NaN count is taken over the whole array and divided
    by ``len(data)``.
    """
    n_missing = np.sum(np.isnan(data))
    missing_fraction = n_missing / len(data)
    return 1 - missing_fraction

# Fraction (0-1) of non-NaN entries in each column of the raw training matrix.
percentage_filled = np.apply_along_axis(percentageFilled, 0, x_train_preclean)

# percentageFilled returns a fraction, so scale by 100 at plot time to make the
# x-axis agree with the "Percentage" labels. The stored array stays a fraction.
plt.hist(percentage_filled * 100, bins=20)
plt.title("Percentage of filled values per column")
plt.xlabel("Percentage")
plt.ylabel("# of columns");  # trailing ';' keeps the Text repr out of the cell output

# Data Cleaning

In [None]:
## Process data 
## 1. drop the columns with more than 80% missing values
def threshold_col_filter(data, threshold):
    """
    Flag the columns of `data` that contain enough non-missing values.

    The filled fraction of a column is the share of its entries that are not NaN.

    args:
        data: array with samples in rows and features in columns
        threshold: minimum filled fraction (between 0 and 1); the comparison is
            strict, so a column filled exactly `threshold` is rejected
    returns:
        boolean mask with one entry per column, True for the columns to keep
    """
    data = np.asanyarray(data)
    # A single vectorised reduction over the rows; no per-column Python callback.
    missing_fraction = np.isnan(data).sum(axis=0) / data.shape[0]
    return (1 - missing_fraction) > threshold


def non_constant_filter(data):
    """
    Flag the columns of `data` whose values actually vary.

    A column is rejected when its NaN-ignoring standard deviation is 0 (all of
    its non-missing values are identical) or NaN (what np.nanstd yields for a
    column without any non-missing value).

    returns:
        boolean mask with one entry per column, True for the columns to keep
    """
    # Compute the per-column deviation once and reuse it for both checks.
    col_std = np.nanstd(data, 0)
    return np.logical_not(np.logical_or(np.isnan(col_std), col_std == 0))

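# TODO: also drop features that are highly correlated with each other?


# TODO: filter features by their correlation with the labels.
## See the correlation section further down: it is done at a later stage, after these two steps.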


# Positions of the columns that pass both filters: filled fraction above 0.2
# and not constant.
keep_indicies = np.flatnonzero(
    threshold_col_filter(x_train_preclean, 0.2)
    & non_constant_filter(x_train_preclean)
)


In [None]:
def standardize(x):
    """Standardize the original data set.

    Each column (axis 0) is centred on its NaN-ignoring mean and scaled by its
    NaN-ignoring standard deviation. Missing entries come out as 0, i.e. they
    are placed at the column mean.

    returns:
        (standardized data, mean, std) where mean and std are the raw
        statistics, so they can be reused to transform other data
    """
    mean = np.nanmean(x, axis=0)
    std = np.nanstd(x, axis=0)
    # Where std is 0 (constant column) or NaN (no data), divide by 1 instead:
    # the output is still 0 there, but without a 0/0 division.
    safe_std = np.where(std > 0, std, 1.0)
    return np.nan_to_num((x - mean) / safe_std), mean, std

In [None]:
def transform_train(feature):
    """One-hot encode a single feature and learn its category mapping.

    Categories are numbered in order of first appearance. Missing values (NaN)
    are not categories: they are left out of the mapping and encoded as an
    all-zero row, the same way values absent from the mapping are handled.

    returns:
        u: array of shape (len(feature), number of categories)
        m: dict mapping each category value to its one-hot column index
    """
    m = dict()
    for x in feature:
        # NaN is the only value that differs from itself. It can never be found
        # again by a dict lookup, so registering it would only add a one-hot
        # column that stays all-zero.
        if x != x:
            continue
        if x not in m:
            m[x] = len(m)
    n_uniq = len(m)
    # Rows 0..n_uniq-1 are the one-hot codes; row n_uniq is the all-zero fallback.
    f = np.vstack((np.eye(n_uniq), np.zeros(n_uniq)))
    ind = np.array([m.get(key, n_uniq) for key in feature], dtype=int)
    return f[ind], m


def transform_test(feature, m):
    """One-hot encode `feature` with a category mapping learned beforehand.

    `m` maps category value -> one-hot column index. Values that are not keys
    of `m` are encoded as an all-zero row.
    """
    width = len(m)
    # Lookup table: identity rows for the known categories, followed by one
    # all-zero row used as the fallback.
    table = np.zeros((width + 1, width))
    table[:width] = np.eye(width)
    rows = np.array([m.get(value, width) for value in feature])
    return table[rows]

In [None]:
def process_train(data, cat_threshold = 10):
    """Clean and encode the training matrix, recording how to redo it on new data.

    Columns are dropped when their filled fraction is not above 0.2 or when they
    are constant. Each remaining column is either
      * categorical (fewer than `cat_threshold` distinct values): one-hot encoded, or
      * numerical: standardized, with missing values set to 0 and any entry more
        than 3 standard deviations from the mean also set to 0.

    returns:
        res: processed feature matrix (one row per sample)
        keep mask: boolean per input column, True if the column was kept
        categorical_filter: boolean per input column, True if treated as categorical
        num_transform: {column index: (mean, std)} for the numerical columns
        cat_transform: {column index: category mapping} for the categorical columns
    """
    n, m = data.shape
    # Named `keep` locally so the builtin `filter` is not shadowed.
    keep = np.logical_and(threshold_col_filter(data, 0.2), non_constant_filter(data))
    # NOTE(review): NaN != NaN, so set(x) appears to count every missing entry as
    # its own distinct value; a column with many NaNs would then never be
    # classed as categorical. Confirm whether that is intended.
    categorical_filter = np.apply_along_axis(lambda x: len(set(x)) < cat_threshold, 0, data)
    cat_transform = dict()
    num_transform = dict()
    # Collect the per-column pieces and concatenate once at the end, instead of
    # re-copying the growing matrix for every column. The empty (n, 0) seed
    # keeps the result well-defined when no column is kept.
    pieces = [np.empty((n, 0))]
    for i in range(m):
        if not keep[i]:
            continue
        if categorical_filter[i]:
            encoded, mp = transform_train(data[:, i])
            cat_transform[i] = mp
            pieces.append(encoded)
        else:
            x_num_std, mean, std = standardize(data[:, i])
            # Outliers beyond 3 standard deviations are moved to the mean (0).
            x_num_std[abs(x_num_std) > 3] = 0
            num_transform[i] = (mean, std)
            pieces.append(x_num_std.reshape((n, 1)))
    return np.hstack(pieces), keep, categorical_filter, num_transform, cat_transform


def process_test(data, filter, categorical_filter, num_transform, cat_transform):
    """Apply the transformation learned by process_train to another matrix.

    args:
        data: matrix with the same column layout as the training matrix
        filter: boolean per column, True for columns to keep
        categorical_filter: boolean per column, True for one-hot encoded columns
        num_transform: {column index: (mean, std)} for the numerical columns
        cat_transform: {column index: category mapping} for the categorical columns
    returns:
        processed feature matrix (one row per sample)
    """
    n, m = data.shape
    # Collect the per-column pieces and concatenate once at the end, instead of
    # re-copying the growing matrix for every column. The empty (n, 0) seed
    # keeps the result well-defined when no column is kept.
    pieces = [np.empty((n, 0))]
    for i in range(m):
        if not filter[i]:
            continue
        if categorical_filter[i]:
            pieces.append(transform_test(data[:, i], cat_transform[i]))
        else:
            mean, std = num_transform[i] # std shouldn't be 0
            # NOTE(review): process_train additionally zeroes entries with
            # |z| > 3; that step is not applied here. Confirm the asymmetry is
            # intended.
            pieces.append(np.nan_to_num((data[:, i] - mean) / std).reshape((n, 1)))
    return np.hstack(pieces)

In [None]:
# Fit the cleaning/encoding on the training matrix and keep the fitted pieces
# (column masks, per-column mean/std, category mappings).
x_train, feature_filter, categorical_filter, num_transform, cat_transform = process_train(x_train_preclean)

# Re-apply exactly those fitted pieces to the test matrix.
x_test = process_test(x_test_preclean, feature_filter, categorical_filter, num_transform, cat_transform)

In [None]:
# Shape of the processed training matrix (after column filtering and encoding).
print(f"# x_train shape: {x_train.shape}")

### correlation 

In [None]:
# def feature_correlation(x1, x2):
#     return np.abs(np.corrcoef(x1, x2, rowvar=False))
# 
# cr  =  feature_correlation(x_train_std, y_train)[-1, :-1]
# plt.hist(cr, bins=20)

In [None]:
# ## Screen out features based on correlation
# good_corre_indicies = np.argwhere(cr > 0.05).flatten()
# print("good_corre_indicies", good_corre_indicies)
# x_train_corre = x_train_std[:, good_corre_indicies]

# Logistic regression *without* regularization

In [None]:
# All-zero starting weights, one per column of the processed training matrix.
# np.longdouble is the portable spelling of extended precision: np.float128 is
# an alias for it that only exists on some platforms.
initial_w = np.zeros(x_train.shape[1], dtype=np.longdouble)
# Passed to logistic_regression as max_iters and gamma (presumably the number
# of iterations and the step size — confirm in implementations.py).
max_iters = 100
gamma = 0.5

In [None]:
## logistic_regression is not defined in this notebook; according to the note
## originally left here it comes from implementations.py.
## Its two return values are unpacked as the fitted weights and the loss.
w, loss = logistic_regression(y_train, x_train, initial_w, max_iters, gamma)
print("loss is ", loss)


### Predicting on the training set (sanity check before predicting x_test)

In [None]:
def prediction_labels(weights, data):
    """Generates class predictions given weights, and a test data matrix.

    The linear scores ``data . weights`` are passed through `sigmoid` and then
    thresholded: values >= 0.5 become 1, values < 0.5 become 0.
    """
    y_pred = sigmoid(np.dot(data, weights))
    # Both masks are taken before any entry is overwritten.
    is_positive = y_pred >= 0.5
    is_negative = y_pred < 0.5
    y_pred[is_positive] = 1
    y_pred[is_negative] = 0
    return y_pred

# Predict on the training matrix itself, i.e. the data the weights were fitted on.
y_pred = prediction_labels(w, x_train)


In [None]:
def accuracy(y_pred, y_train):
    """Share of positions where the predicted label equals the true label."""
    matches = (y_pred == y_train)
    n_correct = matches.sum()
    return n_correct / len(y_train)
def precision(y_pred, y_train):
    """Precision TP / (TP + FP) of the positive class (label 1).

    False positives are counted only where the true label equals 0.
    NOTE(review): the submission step in this notebook recodes predictions of 0
    as -1, so confirm that y_train really uses 0/1 here.

    Returns 0.0 when nothing is predicted positive, instead of dividing 0 by 0.
    """
    predicted_positive = (y_pred == 1)
    TP = np.sum((y_train == 1) & predicted_positive)
    FP = np.sum((y_train == 0) & predicted_positive)
    if TP + FP == 0:
        return 0.0
    return TP / (TP + FP)
def recall(y_pred, y_train):
    """Recall of the positive class: share of true 1-labels that are predicted as 1.

    Returns 0.0 when y_train contains no positive label, instead of dividing 0 by 0.
    """
    n_positive = np.sum(y_train == 1)
    if n_positive == 0:
        return 0.0
    true_positive = np.sum((y_train == 1) & (y_pred == 1))
    return true_positive / n_positive
def f_score(y_pred, y_train):
    """F1 score: harmonic mean of precision and recall.

    Precision and recall are each evaluated once. Returns 0.0 when both are 0,
    instead of dividing 0 by 0.
    """
    p = precision(y_pred, y_train)
    r = recall(y_pred, y_train)
    if p + r == 0:
        return 0.0
    return 2 * p * r / (p + r)

# y_pred was computed on x_train, so these are training-set metrics: they are
# measured on the same data the model was fitted on.
print("accuracy", accuracy(y_pred, y_train))
print("precision", precision(y_pred, y_train))
print("recall", recall(y_pred, y_train))
print("f_score", f_score(y_pred, y_train))

## Generate predictions for the test set

In [None]:
## Predict labels for the test matrix (this overwrites the training-set y_pred),
## then recode class 0 as -1 before the submission file is written.
y_pred = prediction_labels(w, x_test)
y_pred[y_pred == 0] = -1

In [None]:
# Write the test ids with their predicted labels to 'submission.csv'.
# create_csv_submission is not defined in this notebook (presumably it comes
# from the `helpers` star-import — confirm).
create_csv_submission(test_ids, y_pred, 'submission.csv')

### Logistic regression with regularization