In [1]:
from helper import *
from cleaning import *
from metrics import *
from implementations import *
from definitions import ROOT_DIR

import os


#### Loading data

In [2]:
# Locate the dataset folder under the project root and load it.
# NOTE(review): the meaning of the trailing positional `True` is decided by
# load_csv_data, which is not defined in this notebook section — confirm and
# consider passing it by keyword.
# The five returned values are presumably train/test features, train labels and
# the train/test row ids (going by their names) — TODO confirm.
dataset_path = os.path.join(ROOT_DIR, 'dataset_to_release')
x_tr, x_te, y_tr, tr_ids, te_ids = load_csv_data(dataset_path, True)

#### Cleaning

In [3]:
# Clean the training features.
# The positional arguments (0.8, False, 99, 0.9) are interpreted by
# clean_continuous, which is not defined in this notebook section — TODO name
# them (keywords or constants) so their meaning is visible here.
# NOTE(review): this runs on all of x_tr before the split in the next cell, and
# the recorded output reports a PCA step; anything fitted here has therefore
# also seen the rows that are later held out — confirm this is intended.
clean_train = clean_continuous(x_tr, 0.8, False, 99, 0.9)

Finished running PCA: keeping 15 components to explain >99% variance


<Figure size 640x480 with 0 Axes>

In [4]:
# Split the cleaned training features and their labels into two subsets.
# NOTE(review): 0.8 is presumably the fraction kept for fitting — confirm
# against split_data. The `*_te` names hold the held-out part of the training
# data (they are derived from clean_train / y_tr), not the x_te loaded earlier.
clean_x_tr, clean_x_te, y_tr_set, y_te_set = split_data(clean_train, y_tr, 0.8)

#### Run and evaluate

In [5]:
# Hyperparameters per model, stored as [lambda_, gamma] (index 0 is read as
# lambda_ and index 1 as gamma by the training loop).
# TODO: every entry still holds the same placeholder pair; replace with the
# tuned values for each model. The recorded run of this notebook shows an RMSE
# of ~1e84 / ~1e151 for (stochastic) gradient descent with these values.
best_params = {
    'gradient descent': [0.01, 0.7],
    'stochastic gradient descent': [0.01, 0.7],
    'least squares': [0.01, 0.7],
    'ridge regression': [0.01, 0.7],
    'logistic regression': [0.01, 0.7],
    'reg logistic regression': [0.01, 0.7],
}

# Models are evaluated in the insertion order of `best_params`.
models = list(best_params)

# Raw scores strictly below this value are mapped to class -1, the rest to +1.
mapping_threshold = 0

In [6]:
# Train every model in `models` from the same all-zero starting weights and
# print RMSE plus classification metrics on both splits.
max_iters = 500  # iteration budget handed to train() for every model
initial_w = np.zeros(clean_x_tr.shape[1])

for model in models:
    # Entries of best_params are read as [lambda_, gamma].
    lambda_, gamma = best_params[model][:2]
    print(f'working on {model}')

    # Pass a copy so that an optimiser updating its weights in place cannot
    # change the starting point of the models trained after it.
    w, loss = train(model, y_tr_set, clean_x_tr, initial_w.copy(), max_iters, gamma, lambda_)

    # Raw linear scores, computed once and reused for predictions and errors.
    train_scores = np.dot(clean_x_tr, w)
    test_scores = np.dot(clean_x_te, w)

    # Scores strictly below the threshold become -1, everything else +1.
    train_predictions = np.where(train_scores < mapping_threshold, -1, 1)
    test_predictions = np.where(test_scores < mapping_threshold, -1, 1)

    # NOTE(review): the factor 2 presumably undoes a 1/2 inside
    # hp.compute_mse — confirm against its definition.
    train_rmse = np.sqrt(2 * hp.compute_mse(y_tr_set - train_scores))
    test_rmse = np.sqrt(2 * hp.compute_mse(y_te_set - test_scores))

    print('')
    print(f'---------- {model.upper()} ----------')
    print(f'train rmse = {train_rmse} - test rmse = {test_rmse}')

    train_metrics = calculate_metrics(y_tr_set, train_predictions)
    test_metrics = calculate_metrics(y_te_set, test_predictions)
    print('=========== TRAIN metrics ===========')
    prettyprint(train_metrics)
    print('=========== TEST metrics ===========')
    prettyprint(test_metrics)
    print('')

working on gradient descent

---------- GRADIENT DESCENT ----------
train rmse = 6.8373768375749256e+84 - test rmse = 6.511396406278432e+84
Accuracy: 0.5527619047619048 - F1 score 0.1003831417624521
Specificity: 0.5752543076603696 - Sensitivity: 0.302540415704388
Precision: 0.060174552135966924
Accuracy: 0.5628332063975628 - F1 score 0.13813813813813813
Specificity: 0.5848101265822785 - Sensitivity: 0.359375
Precision: 0.08550185873605948

working on stochastic gradient descent

---------- STOCHASTIC GRADIENT DESCENT ----------
train rmse = 4.366162243260519e+151 - test rmse = 4.331190991892111e+151
Accuracy: 0.5045714285714286 - F1 score 0.11920081273281408
Specificity: 0.5133900768112933 - Sensitivity: 0.4064665127020785
Precision: 0.06984126984126984
Accuracy: 0.5163747143945163 - F1 score 0.14535666218034993
Specificity: 0.5265822784810127 - Sensitivity: 0.421875
Precision: 0.08780487804878048

working on least squares

---------- LEAST SQUARES ----------
train rmse = 0.99787428089