In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
from implementations import *
from losses import *
from EDA import *
from plots import *
from cross_validation import *
%load_ext autoreload
%autoreload 2

# Graphical exploration

#### Load the training data into feature matrix, class labels, and event ids

***

In [None]:
# Load the training data set
DATA_TRAIN_PATH = "../data/train.csv"
# load_csv_data returns three values, unpacked here as the class labels,
# the feature matrix and the event ids.
data_y, data_set, ids = load_csv_data(DATA_TRAIN_PATH)

# Classification of the output: the labels are split into four groups (y_0..y_3).
# NOTE(review): presumably the grouping is derived from data_set -- confirm in y_classification.
y_0, y_1, y_2, y_3 = y_classification(data_y, data_set)

# EDA for each class: one feature set per group, built here with exploratory=True
# for the graphical exploration that follows.
class_0, class_1, class_2, class_3 = EDA_class(data_set, exploratory=True)

## Correlation matrix
***

In [None]:
# Correlation plot of the features of class 0
correlation_plot(class_0)

**Remark:** There is no particular correlation to notice between the different features of class 0.

## Histograms

***

In [None]:
# Check if some feature seems to be useless in this classification:
# plot the histograms of features 0 and 2 of class 0 against its labels y_0.
histogram_plot(y_0, class_0, [0, 2])

**Remark:**  We can notice that feature 0 seems to be very decisive in the classification, but this is not the case for feature 2. We can therefore remove the latter without losing information for our classification.
We apply this idea to all the features and obtain the following list of features that we can remove: [2, 5, 8, 11, 13, 14]. To see the histograms of the other features, run the following piece of code in a cell, replacing the list with the indices of the features to inspect: 'histogram_plot(y_0, class_0, [0, 2])'.

# Cross validation

Now that the graphical analysis is done, we can use cross-validation to select our best model (with its corresponding method).
***

In [None]:
# Classification of the output, this time with sub_sample=True.
# NOTE(review): this overwrites y_0..y_3 computed in the exploration cell, so the
# cells must be run in order; see y_classification for the exact effect of sub_sample.
y_0, y_1, y_2, y_3 = y_classification(data_y, data_set, sub_sample=True)
# Apply the exploratory graphical analysis
# (exploratory=False here, unlike the exploration cell; class_0..class_3 are overwritten too).
class_0, class_1, class_2, class_3 = EDA_class(data_set, exploratory=False)

In [None]:
# Initialization of some parameters shared by the cross-validation cells below
max_degree = 6  # value handed to Parameters.set_degree in every cell
mse = MSE()  # loss object handed to Parameters.set_loss_fct
neg_log = Neg_log()  # second loss object (presumably negative log-likelihood -- confirm in losses)
kfold = 4  # value handed to Parameters.set_k_fold in every cell

#### Least squares method

***

In [None]:
# Parameters initialization
# (no loss function and no hyper-parameter list are set for this method)
param_least = Parameters()
param_least.set_degree(max_degree)
param_least.set_method(least_squares_)
param_least.set_k_fold(kfold)
param_least.set_viz(False)  # visualization flag switched off
param_least.set_use_backward_selection(True)
param_least.set_use_interactions(True)

# Cross validation on class 0; the returned Parameters object carries the results
param_least = cross_validation_poly(y_0, class_0, param_least)


# Report the results stored on the returned object
print("Best train error: ", param_least.best_train_error)
print("Best test error: ", param_least.best_error)
print('Std: ', param_least.std)


#### Gradient descent method

***

In [None]:
# Parameters initialization
param_GD = Parameters()
param_GD.set_degree(max_degree)
param_GD.set_loss_fct(mse)
param_GD.set_k_fold(kfold)
param_GD.set_method(least_squares_GD_)
# NOTE(review): presumably the list of hyper-parameters searched by the cross validation -- confirm
param_GD.set_to_test(['gamma'])
param_GD.set_viz(False)  # visualization flag switched off
param_GD.set_use_backward_selection(True)
param_GD.set_use_interactions(True)

# Cross validation on class 0; the returned Parameters object carries the results
param_GD = cross_validation_poly(y_0, class_0, param_GD)

# Report the results stored on the returned object
print("Best train error: ", param_GD.best_train_error)
print("Best test error: ", param_GD.best_error)
print('Std: ', param_GD.std)

#### Stochastic gradient descent

***

In [None]:
# Parameters initialization
param_SGD = Parameters()
param_SGD.set_degree(max_degree)
param_SGD.set_k_fold(kfold)
param_SGD.set_loss_fct(mse)
param_SGD.set_method(least_squares_SGD_)
# NOTE(review): presumably the list of hyper-parameters searched by the cross validation -- confirm
param_SGD.set_to_test(['gamma'])
param_SGD.set_viz(False)  # visualization flag switched off
param_SGD.set_use_backward_selection(True)
param_SGD.set_use_interactions(True)

# Cross validation on class 0; the returned Parameters object carries the results
param_SGD = cross_validation_poly(y_0, class_0, param_SGD)

# Report the results stored on the returned object
print("Best train error: ", param_SGD.best_train_error)
print("Best test error: ", param_SGD.best_error)
print('Std: ', param_SGD.std)

#### Ridge regression

***

In [None]:
# Parameters initialization
param_ridge = Parameters()
param_ridge.set_degree(max_degree)
param_ridge.set_loss_fct(mse)
param_ridge.set_k_fold(kfold)
param_ridge.set_method(ridge_regression_)
# NOTE(review): presumably the list of hyper-parameters searched by the cross validation -- confirm
param_ridge.set_to_test(['lambda'])
param_ridge.set_viz(False)  # visualization flag switched off
param_ridge.set_use_backward_selection(True)
param_ridge.set_use_interactions(True)

# Cross validation on class 0; the returned Parameters object carries the results
param_ridge = cross_validation_poly(y_0, class_0, param_ridge)

# Report the results stored on the returned object
print("Best train error: ", param_ridge.best_train_error)
print("Best test error: ", param_ridge.best_error)
print('Std: ', param_ridge.std)

In [None]:
# Logistic regression
# Parameters initialization
param_log = Parameters()
param_log.set_degree(max_degree)
# Logistic regression is fitted with the negative log-likelihood loss, not MSE:
# use the neg_log object created with the other shared parameters.
param_log.set_loss_fct(neg_log)
param_log.set_k_fold(kfold)
param_log.set_method(logistic_regression_)
# NOTE(review): presumably the list of hyper-parameters searched by the cross validation -- confirm
param_log.set_to_test(['gamma'])
param_log.set_viz(False)  # visualization flag switched off
param_log.set_use_backward_selection(True)
param_log.set_use_interactions(True)

# Cross validation on class 0; the returned Parameters object carries the results
param_log = cross_validation_poly(y_0, class_0, param_log)

# Report the results stored on the returned object
print("Best train error: ", param_log.best_train_error)
print("Best test error: ", param_log.best_error)
print('Std: ', param_log.std)

In [None]:
def classic_cv(y_, class_, parameters, idx):
    """Pick the best value of one hyper-parameter by k-fold cross validation.

    Each candidate from ``parameters.range(idx)`` is set on ``parameters`` and
    scored with ``method_evaluation`` on every one of the ``parameters.k_fold``
    folds; the per-fold train and test errors are then averaged. The candidate
    with the lowest mean test error is recorded through ``set_best_param`` and
    ``set_param``, and that error through ``set_best_error``.

    Args:
        y_: labels, forwarded to ``build_k_indices`` and ``method_evaluation``.
        class_: feature data, forwarded to ``method_evaluation``.
        parameters: configuration object; it is mutated in place.
        idx: index of the hyper-parameter being tuned.

    Returns:
        The same ``parameters`` object, updated with the best value and error.
    """
    # Fold assignment is built once, from the first seed, and shared by all candidates
    fold_indices = build_k_indices(y_, parameters.k_fold, parameters.seeds[0])

    # Mean error of every candidate, in the order of parameters.range(idx)
    mean_train_errors = []
    mean_test_errors = []

    for candidate in parameters.range(idx):
        parameters.set_param(idx, candidate)

        fold_train_errors = []
        fold_test_errors = []
        for fold in range(parameters.k_fold):
            train_err, test_err = method_evaluation(
                y_, class_, parameters, fold_indices, fold)
            fold_train_errors.append(train_err)
            fold_test_errors.append(test_err)

        mean_train_errors.append(np.mean(fold_train_errors))
        mean_test_errors.append(np.mean(fold_test_errors))

    # Keep the candidate with the lowest mean test error as current and best value
    winner = parameters.range(idx)[np.argmin(mean_test_errors)]
    parameters.set_best_param(idx, winner)
    parameters.set_param(idx, winner)

    lowest_test_error = np.min(mean_test_errors)
    parameters.set_best_error(lowest_test_error)

    # Textual summary and plot are both gated by the viz flag
    if parameters.viz:
        error_text = str(lowest_test_error)
        name_text = str(parameters.names[idx-1])
        value_text = str(parameters.best_param(idx))
        print('Test error: ' + error_text + '\nBest ' + name_text + ': ' + value_text)
        cross_validation_visualization(
            parameters.range(idx), mean_train_errors, mean_test_errors, parameters)

    return parameters