In [3]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime

import sys

sys.path.append('../')

from implementations import *

%load_ext autoreload
%autoreload 2

# **Load and clean the training data**

We load the training data.

In [4]:
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("../data")

print(f"The data has {x_train.shape[0]} samples and {x_train.shape[1]} features !")

The data has 328135 samples and 321 features !


We then clean the data by : 
- removing the nan values by the mean of the rest of the feature
- removing the features where the variance is zero since they are constants for all samples
- remove the 8 first features as the appear weird in the task of predicting a heart attack

In [None]:
xt_feat = x_train
xt_feat = preprocessing(xt_feat)
xt_feat = gen_clean(xt_feat, [], np.arange(321))
print(f"The data has now {xt_feat.shape[1]} features !")

# PCA algorithm implementation

In [None]:
pca_indices, idx = pca(xt_feat)
print(f"We can keep the {idx} first most influent features given by pca_indices")

In [None]:
x_train_pca = xt_feat[:, pca_indices]
x_train_pca = xt_feat[:, :idx]

print(f"The data has now {x_train_pca.shape[1]} features")

# Logistic regression using stochastic gradient descent (SGD)

We train our model using logistic regression using SGD with mean-square error.

First, we separate our data in a training set(70%) and testing set(30%).

In [None]:
tx_tr, tx_te, y_tr, y_te = cross(x_train_pca, y_train, 0.48)

print(f"tx_tr shape : {tx_tr.shape} ; tx_te shape : {tx_te.shape}")
print(f"       y_tr : {y_tr.shape}     ;        y_te : {y_te.shape}")

Now we build our models for linear regression using SGD

In [10]:
y, tx = build_model_data(tx_tr, y_tr)
y_test, tx_test = build_model_data(tx_te, y_te)
print(f"The data has now {tx.shape[1]} features !")

The data has now 191 features !


## Training 

Here we train our model using SGD with MSE

In [None]:
initial_w = np.zeros(tx.shape[1])
degree = range(1,4)
gammas = np.logspace(-5, 0, 15)
max_iters = 2000

start_time = datetime.datetime.now()
best_degree, best_gamma, best_loss = logistic_regression_demo(tx, y, gammas, degree, max_iters)
end_time = datetime.datetime.now()

print(
    "The best rmse of %.3f is obtained for a degree of %.f and a lambda of %.5f."
    % (best_loss, best_degree, best_gamma)
)

print(f"Execution time {(end_time - start_time).total_seconds()} seconds")

In [11]:
best_gamma = 1e-05
initial_w = np.zeros(tx.shape[1])
max_iters = 2000

w, loss = logistic_regression(y, tx, initial_w, max_iters, best_gamma)

Execution time 106.096183 seconds


### Computation of metrics

We first compute some metrics on the training data (60% of the total data)

In [None]:
best_thresh = best_threshold(y, tx, w)

In [87]:
pred_data = np.dot(tx, w)

pred_data[pred_data > best_thresh] = 1
pred_data[pred_data <= best_thresh] = -1

correctly_classified_data = np.sum(pred_data == y)

tp = np.sum((pred_data == 1) & (y == 1))
fp = np.sum((pred_data == 1) & (y == -1))

tn = np.sum((pred_data == -1) & (y == -1))
fn = np.sum((pred_data == -1) & (y == 1))

accuracy_data = (tp + tn)/(tp + fp + tn + fn)

print(f"Accuracy: {accuracy_data*100}%")
print(f"Precision: {tp/(tp + fp)*100}%")
print(f"Recall : {tp/(tp + fn)*100}%")
print(f"F1-score : {tp/(tp + 0.5*(fn + fp))*100}%")

Accuracy: 84.80736997155628%
Precision: 29.611122305057425%
Recall : 53.20782041998552%
F1-score : 38.04789644012945%


Now we compute some metrics for our test data (40% of the total data)

In [88]:
pred_test = np.dot(tx_test, w)

pred_test[pred_test > best_thresh] = 1
pred_test[pred_test <= best_thresh] = -1

correctly_classified_test = np.sum(pred_test == y_test)

tp = np.sum((pred_test == 1) & (y_test == 1))
fp = np.sum((pred_test == 1) & (y_test == -1))

tn = np.sum((pred_test == -1) & (y_test == -1))
fn = np.sum((pred_test == -1) & (y_test == 1))

accuracy_test = (tp + tn)/(tp + fp + tn + fn)

print(f"Accuracy: {accuracy_test*100}%")
print(f"Precision: {tp/(tp + fp)*100}%")
print(f"Recall : {tp/(tp + fn)*100}%")
print(f"F1-score : {tp/(tp + 0.5*(fn + fp))*100}%")

Accuracy: 84.63585163305612%
Precision: 29.38323196895638%
Recall : 51.92878338278932%
F1-score : 37.530381737597104%


# Prediction on test data

In [None]:
start_time = datetime.datetime.now()

tx_test = x_test
tx_test = preprocessing(x_test)
tx_test = gen_clean(tx_test, [], np.arange(321))

tx_test = tx_test[:, pca_indices]
tx_test = tx_test[:, :idx]
tx_test = np.c_[np.ones(tx_test.shape[0]), tx_test]

end_time = datetime.datetime.now()
print(f"Execution time {(end_time - start_time).total_seconds()} seconds")
print(f"The data has {tx_test.shape[0]} samples and {tx_test.shape[1]} features !")

In [None]:
pred_te = np.dot(tx_test, w)

pred_te[pred_te > best_thresh] = 1
pred_te[pred_te < best_thresh] = -1

indices_one = np.where(pred_te == 1)

In [94]:
create_csv_submission(test_ids, pred_te, "../data/log_reg.csv")