In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Wildcard import; presumably this is where load_csv_data comes from — verify against proj1_helpers.
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download train data and supply path here
# Unpack the loader's three return values: class labels, feature matrix and event ids.
y_raw, X_raw, ids = load_csv_data(DATA_TRAIN_PATH)

# Exploratory data analysis

## Quantify missingness

In [3]:
# Sentinel value that marks a missing measurement in the raw data
null = -999
# Dataset dimensions: N datapoints (rows) and D features (columns)
N, D = X_raw.shape[0], X_raw.shape[1]

# Boolean mask of the missing entries, taken on the raw matrix (no feature has been dropped yet)
is_null = (X_raw == null)

# Number of rows that contain at least one missing entry
points_missing = np.any(is_null, axis=1).sum()
# For each feature, the fraction of rows in which that feature is missing
null_frac = (1/N) * is_null.sum(axis=0)

print(N, "datapoints, of which", points_missing, "have at least one missing feature")
print("Fraction of null datapoints for each feature:\n", null_frac)

250000 datapoints, of which 181886 have at least one missing feature
Fraction of null datapoints for each feature:
 [0.152456 0.       0.       0.       0.709828 0.709828 0.709828 0.
 0.       0.       0.       0.       0.709828 0.       0.       0.
 0.       0.       0.       0.       0.       0.       0.       0.399652
 0.399652 0.399652 0.709828 0.709828 0.709828 0.      ]


## Clean dataset

In [4]:
from toolbox.manipulate_data import *

# null_frac only takes 4 distinct values, so these 4 cut-offs cover every distinct outcome
drop_thresholds = [0.1, 0.33, 0.7, 1]
# Cut-off actually used for this run (index 2, i.e. 0.7)
drop_thresh = drop_thresholds[2]

# Build the cleaned feature matrix from the raw one
X = clean_data(X_raw, null, drop_thresh)

# Sanity check: the sentinel value should no longer appear anywhere in X
has_nulls = np.any(X == null)
print("X.shape:", X.shape, ", has nulls:", has_nulls)

X.shape: (250000, 23) , has nulls: False


# Pre-process data

## Standardise data

In [5]:
# Standardise dataset
# NOTE: X is rebound to the standardised output, so this cell is not idempotent:
# re-running it without first re-running the cleaning cell passes already-standardised
# data back into standardise and overwrites mean_X / std_X.
X, mean_X, std_X = standardise(X)
# Both statistics are per-feature vectors (the recorded output shows shape (23,) for each).
print(mean_X.shape, std_X.shape)

(23,) (23,)


## Map y to {0, 1}

In [6]:
# Map y from bilateral {-1, +1} domain to unilateral {0, 1} domain
# Show the labels before the mapping ...
print(y_raw)
# y_raw is left untouched; the mapped labels are stored under a new name.
y = bin_bil_to_unil(y_raw)
# ... and after it, for a visual check (recorded output: 1 stays 1, -1 becomes 0).
print(y)

[ 1. -1. -1. ...  1. -1. -1.]
[1. 0. 0. ... 1. 0. 0.]


## Expand features

In [15]:
# Expand features
# Expansion degree handed to expand_features; the same value is reused for the challenge set.
degree = 10
# tX is the expanded design matrix used by the cross-validation and training cells.
tX = expand_features(X, degree)
# Recorded output for this cell: (250000, 231).
print(tX.shape)

(250000, 231)


# Model selection

### Evaluate different training functions and hyperparameters with cross validation

In [16]:
from toolbox.training import *
from toolbox.testing import *

# Regularisation strengths to sweep: 25 log-spaced values from 1e-10 to 1e2
lambdas = np.logspace(-10, 2, 25)
# Settings that go into the hyper-parameter tuple of the SGD-based trainer
max_iters = 100
batch_size = 200
gamma = 1e-6
# Settings passed directly to cross_validation
class_thresh = 0.5
k_fold = 5
# Starting weights: one zero per column of the expanded feature matrix
initial_w = np.zeros(tX.shape[1])

# One entry per candidate method:
# (banner to print, training function, builder of the hyper-parameter tuple for a given lambda)
experiments = [
    ("Regularised Least-Squares with NE", train_reg_ls_NE,
     lambda lam: (lam, )),
    ("\nRegularised Logistic with SGD", train_reg_log_SGD,
     lambda lam: (lam, initial_w, max_iters, batch_size, gamma)),
]

# Shared layout of the per-lambda report line
report_line = "log10_lambda = {log10_lam:.3f}: avg_test_loss = {avg_test_loss:.3f}, avg_train_loss = {avg_train_loss:.3f}"

# Run the full lambda sweep for each method in turn, reporting the averaged losses per lambda
for title, train_fn, build_params in experiments:
    print(title)
    for lambda_ in lambdas:
        hyper_params = build_params(lambda_)
        avg_test_loss, avg_train_loss = cross_validation(
            y, tX, train_fn, hyper_params, class_thresh, k_fold, "classifier")

        print(report_line.format(
            log10_lam=np.log10(lambda_),
            avg_test_loss=avg_test_loss,
            avg_train_loss=avg_train_loss))

Regularised Least-Squares with NE
log10_lambda = -10.000: avg_test_loss = 0.196, avg_train_loss = 0.163
log10_lambda = -9.500: avg_test_loss = 0.190, avg_train_loss = 0.079
log10_lambda = -9.000: avg_test_loss = 0.189, avg_train_loss = 0.074
log10_lambda = -8.500: avg_test_loss = 0.190, avg_train_loss = 0.074
log10_lambda = -8.000: avg_test_loss = 0.189, avg_train_loss = 0.071
log10_lambda = -7.500: avg_test_loss = 0.189, avg_train_loss = 0.071
log10_lambda = -7.000: avg_test_loss = 0.189, avg_train_loss = 0.071
log10_lambda = -6.500: avg_test_loss = 0.189, avg_train_loss = 0.071
log10_lambda = -6.000: avg_test_loss = 0.189, avg_train_loss = 0.071
log10_lambda = -5.500: avg_test_loss = 0.190, avg_train_loss = 0.071
log10_lambda = -5.000: avg_test_loss = 0.215, avg_train_loss = 0.118
log10_lambda = -4.500: avg_test_loss = 0.189, avg_train_loss = 0.071
log10_lambda = -4.000: avg_test_loss = 0.189, avg_train_loss = 0.071
log10_lambda = -3.500: avg_test_loss = 0.190, avg_train_loss = 0.071

KeyboardInterrupt: 

### Train using best method and hyperparameters

In [27]:
# Fit the final model on the full expanded training set.
# NOTE(review): this calls train_unreg_ls_NE, while the cross-validation cell only evaluates
# train_reg_ls_NE and train_reg_log_SGD, and no lambda from that sweep is used here —
# confirm this is really the intended "best method and hyperparameters".
# The returned `classifier` is what the prediction cell later applies to the challenge features.
w, train_loss, regressor, classifier = train_unreg_ls_NE(y, tX, class_thresh)

# Loss on the data the model was fitted on (not a held-out estimate).
print(train_loss)

0.08812559481524469


# Generate predictions

## Load challenge data

In [30]:
# Location of the challenge (test) CSV, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here
# Same loader as for the training file; its first return value is discarded here.
_, X_ch_raw, ids_ch = load_csv_data(DATA_TEST_PATH)

## Clean, standardise, and expand challenge dataset

In [31]:
# Clean dataset (same sentinel value and drop threshold as for the training set)
X_ch = clean_data(X_ch_raw, null, drop_thresh)

# The cleaned challenge set must have exactly the columns the model was trained on;
# fail loudly here rather than predicting from misaligned features.
assert X_ch.shape[1] == mean_X.shape[0], (
    "challenge set has a different number of features than the training set after cleaning")

# Standardise dataset
# The model was fitted on features scaled with the TRAINING mean/std, so the challenge set
# has to be scaled with those same statistics. Re-estimating them on the challenge set
# (as calling standardise(X_ch) would) puts the two sets on slightly different scales.
# Assumes standardise computes (X - mean) / std per feature — TODO confirm against its implementation.
safe_std_X = np.where(std_X == 0, 1.0, std_X)  # avoid dividing by zero for a constant feature
X_ch = (X_ch - mean_X) / safe_std_X
# Names kept so that any later code referring to them still works;
# they now hold the statistics that were actually applied to the challenge set.
mean_X_ch, std_X_ch = mean_X, std_X

# Expand features (same degree as for the training set)
tX_ch = expand_features(X_ch, degree)

## Predict

In [32]:
# Get output in {0, 1}
# `classifier` is the callable returned by the training cell.
y_unil = classifier(tX_ch)

# Map output to {-1, +1}
y_bil = bin_unil_to_bil(y_unil)

# Print both encodings side by side as a quick visual check of the mapping.
print(y_unil)
print(y_bil)

[0 0 0 ... 1 1 0]
[-1 -1 -1 ...  1  1 -1]


## Write output

In [34]:
# Write formatted output file
# Relative path, so the file is created in the notebook's working directory.
OUTPUT_PATH = 'submission.csv' # TODO: fill in desired name of output file for submission
# Writes the challenge event ids together with their {-1, +1} predictions.
create_csv_submission(ids_ch, y_bil, OUTPUT_PATH)

## Directly use run.py

In [17]:
# Run the standalone run.py script from within the notebook (recorded output: "Output file created").
%run run.py

Output file created
