In [2]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime

import sys

sys.path.append('../')

from implementations import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# **Load and clean the training data**

We load the training data.

In [3]:
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("../data")

print(f"The data has {x_train.shape[0]} samples and {x_train.shape[1]} features !")

The data has 328135 samples and 321 features !


We then clean the data by : 
- selecting the best 43 features with human decision and correlation analysis
- removing the nan values by the median of the rest of the feature for continuous values
- removing the nan values by the -1 for categorical features
- removing the features where the variance is zero since they are constants for all samples

In [4]:
cont_data = [13, 15, 16, 17, 24 ,25 ,26 ,27 ,28 ,29 ,33 ,37,49 ,59 ,60 ,62 ,63 ,75 ,77 ,78 ,
             79 ,80 ,81 ,82 ,83 ,84 ,85 ,86 ,89 ,90 ,92 ,93 ,94, 98 ,110 ,111 ,112 ,113 ,114 ,143
            ,147 ,148 ,149 ,150 ,168 ,195 ,197 , 206 ,207 ,208 ,209 ,210 ,211 ,212 ,213 ,219, 220, 221
            ,222, 226, 228, 229, 248 ,250 ,251 ,252 ,253 ,262 ,264 ,266,267,268,269,270,271,276,277,285,286,287,288,291,292, 293,294, 295, 296, 297, 299, 300, 301, 302, 303, 304]

# Generate a list of all indices from 1 to 320
all_indices = list(range(0, 321))

# Use a list comprehension to filter out indices not in your array
cat_data = [idx for idx in all_indices if idx not in cont_data]

In [5]:
xt_feat = x_train
xt_feat = preprocessing(xt_feat)
xt_feat = gen_clean(xt_feat, cat_data, cont_data)
print(f"The data has now {xt_feat.shape[1]} features !")

The data has now 321 features !


In [6]:
xt_z = xt_feat.copy()
for i in np.array([0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 19, 20, 21, 22, 23, 88, 91, 101, 105]):
    xt_z[:, i] = 0

In [None]:
xt_feat_sep = cat_sep(xt_z, cat_data)
print(f"The data has now {xt_feat_sep.shape[1]} features !")

In [8]:
separated_categories = np.delete(xt_feat_sep, cat_data, axis=1)

# PCA algorithm implementation

In [9]:
pca_indices, idx = pca(separated_categories)
print(f"We can keep the {idx} first most influent features given by pca_indices")

We can keep the 138 first most influent features given by pca_indices


In [11]:
x_train_pca = separated_categories[:, pca_indices]
x_train_pca = separated_categories[:, :idx]

print(f"The data has now {x_train_pca.shape[1]} features")

The data has now 138 features


# Ridge regression

We find the Ridge regression solutions using the normal equations.

First, we separate our data in a training set (70%) and testing set (30%).

In [12]:
tx_tr, tx_te, y_tr, y_te = cross(x_train_pca, y_train, 0.8)

print(f"tx_tr shape : {tx_tr.shape} ; tx_te shape : {tx_te.shape}")
print(f"       y_tr : {y_tr.shape}     ;        y_te : {y_te.shape}")

tx_tr shape : (262508, 138) ; tx_te shape : (65627, 138)
       y_tr : (262508,)     ;        y_te : (65627,)


Now we build our models for linear regression

In [13]:
y, tx = build_model_data(tx_tr, y_tr)
y_test, tx_test = build_model_data(tx_te, y_te)
print(f"The data has now {tx.shape[1]} features !")

The data has now 139 features !


## Linear regression using ridge regression

Here we train our model using ridge regression with normal equations

In [14]:
start_time = datetime.datetime.now()

degrees = np.arange(2, 8)
lambdas = np.logspace(-4, 0, 30)

best_degree, best_lambda, best_rmse = best_degree_selection(y, tx, degrees, 4, lambdas)

end_time = datetime.datetime.now()

print(
    "The best rmse of %.3f is obtained for a degree of %.f and a lambda of %.5f."
    % (best_rmse, best_degree, best_lambda)
)

print(f"Execution time {(end_time - start_time).total_seconds()} seconds")

Degree 2 done !
Degree 3 done !
Degree 4 done !
Degree 5 done !
Degree 6 done !
Degree 7 done !
The best rmse of 0.817 is obtained for a degree of 2 and a lambda of 0.07880.
Execution time 8178.622334 seconds


In [21]:
best_degree = 2
best_lambda = 0.07880

tx_tr = build_poly(tx, best_degree)
tx_te_poly = build_poly(tx_test, best_degree)

w, loss = ridge_regression(y, tx_tr, best_lambda)

In [None]:
best_degree = 2
best_lambda = 0.01610

tx_tr = build_poly(tx, best_degree)
tx_te_poly = build_poly(tx_test, best_degree)

w, loss = ridge_regression(y, tx_tr, best_lambda)

In [22]:
print(tx_tr.shape)
print(tx_te_poly.shape)

(262508, 279)
(65627, 279)


# Computation of metrics

We first compute some metrics on the training data (60% of the total data)



In [18]:
best_thresh = best_threshold(y, tx_tr, w)

In [23]:
pred_data = np.dot(tx_tr, w)

pred_data[pred_data > best_thresh] = 1
pred_data[pred_data < best_thresh] = -1

correctly_classified_data = np.sum(pred_data == y)

tp = np.sum((pred_data == 1) & (y == 1))
fp = np.sum((pred_data == 1) & (y == -1))

tn = np.sum((pred_data == -1) & (y == -1))
fn = np.sum((pred_data == -1) & (y == 1))

accuracy_data = (tp + tn)/(tp + fp + tn + fn)

print(f"Accuracy: {accuracy_data*100}%")
print(f"Precision: {tp/(tp + fp)*100}%")
print(f"Recall : {tp/(tp + fn)*100}%")
print(f"F1-score : {tp/(tp + 0.5*(fn + fp))*100}%")

Accuracy: 86.16118365916468%
Precision: 32.78197447251889%
Recall : 54.46165830015579%
F1-score : 40.92816026537449%


Now we compute some metrics for our test data (40% of the total data)

In [20]:
pred_test = np.dot(tx_te_poly, w)

pred_test[pred_test > best_thresh] = 1
pred_test[pred_test < best_thresh] = -1

correctly_classified_test = np.sum(pred_test == y_test)

tp = np.sum((pred_test == 1) & (y_test == 1))
fp = np.sum((pred_test == 1) & (y_test == -1))

tn = np.sum((pred_test == -1) & (y_test == -1))
fn = np.sum((pred_test == -1) & (y_test == 1))

accuracy_test = (tp + tn)/(tp + fp + tn + fn)

print(f"Accuracy: {accuracy_test*100}%")
print(f"Precision: {tp/(tp + fp)*100}%")
print(f"Recall : {tp/(tp + fn)*100}%")
print(f"F1-score : {tp/(tp + 0.5*(fn + fp))*100}%")

Accuracy: 86.16727871150593%
Precision: 33.39882121807465%
Recall : 55.05369013124254%
F1-score : 41.57549234135667%


# **Prediction on test data**

In [22]:
start_time = datetime.datetime.now()

tx_test = x_test
tx_test = preprocessing(x_test)
tx_test = gen_clean(tx_test, [], np.arange(321))

tx_test = tx_test[:, pca_indices]
tx_test = tx_test[:, :idx]
tx_test = np.c_[np.ones(tx_test.shape[0]), tx_test]

tx_test = build_poly(tx_test, best_degree)

end_time = datetime.datetime.now()
print(f"Execution time {(end_time - start_time).total_seconds()} seconds")
print(f"The data has {tx_test.shape[0]} samples and {tx_test.shape[1]} features !")

Execution time 3.531589 seconds
The data has 109379 samples and 259 features !


In [23]:
pred_te = np.dot(tx_test, w)

pred_te[pred_te > best_thresh] = 1
pred_te[pred_te < best_thresh] = -1

indices_one = np.where(pred_te == 1)

In [24]:
create_csv_submission(test_ids, pred_te, "../data/ridge_reg.csv")