In [5]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# **Load the training data**

We load the training data.

In [6]:
import datetime
from implementations import *

# Read the feature matrix and the labels from their CSV files using the
# project's `load_data` helper (star-imported from `implementations`).
data_load = load_data("x_train.csv")
pred = load_data("y_train.csv")

# Report the size of the raw feature matrix (rows = samples, columns = features).
n_samples, n_features = data_load.shape[0], data_load.shape[1]
print(f"The data has {n_samples} samples and {n_features} features !")

The data has 328135 samples and 321 features !


We clean the data by replacing the NaN values of each feature with the mean of that feature's remaining values.

In [7]:
# Clean every feature column independently with `standardize_clean` and keep
# the second value it returns (stored as that column's std) so constant
# features can be filtered out afterwards.
#
# `data` is fully overwritten column by column, so it is allocated
# uninitialised instead of being pre-filled with ones.
data = np.empty(data_load.shape)

# Collect the per-column values in a list and convert once at the end:
# calling np.append inside the loop would reallocate and copy the whole
# array on every iteration.
column_stds = []
for col in range(data.shape[1]):
    cleaned_col, col_std = standardize_clean(data_load[:, col])
    data[:, col] = cleaned_col
    column_stds.append(col_std)

# ravel() reproduces np.append's flattening, giving one float per feature.
stds = np.asarray(column_stds, dtype=float).ravel()
print(stds.shape)

(321,)


We further clean the data by removing the features whose variance is zero, since they are constant across all samples.

In [8]:
# Keep only the features whose recorded std is non-zero.
# np.where returns a 1-tuple holding the array of matching column positions;
# `indices` keeps that tuple form, and its single array is used directly to
# select the columns (so no singleton axis has to be squeezed away).
indices = np.where(stds != 0)
kept_columns = indices[0]
data_var = data[:, kept_columns]
print(data_var.shape)

(328135, 315)


We also remove the first 9 features (columns 0 to 8 of the remaining data), as they appear weird for the task of predicting a heart attack.

In [9]:
# Drop the leading columns of the variance-filtered matrix: slicing from
# index 9 discards columns 0..8, i.e. the first 9 remaining features.
n_leading_dropped = 9
data_cleaned = data_var[:, n_leading_dropped:]
print(data_cleaned.shape)

(328135, 306)


In [10]:
# Split the samples by position: the first 60% of the rows form the
# cross/training part, the remaining rows are held out as the test part.
# The rows are taken in file order; no shuffling happens here.
n_rows = data_cleaned.shape[0]
train_size = np.floor(n_rows * 0.6).astype(int)

data_cross, data_test = data_cleaned[:train_size], data_cleaned[train_size:]
pred_cross, pred_test = pred[:train_size], pred[train_size:]

print(f"Cross shape : {data_cross.shape} ; Test shape : {data_test.shape}")

Cross shape : (196881, 306) ; Test shape : (131254, 306)


Now we build our models for linear regression

In [12]:
# Turn each (features, labels) pair into the (y, tx) form consumed by the
# regression routines, once for the training part and once for the test part.
# NOTE(review): the recorded output shows one more column in `tx` than in
# `data_cross` (307 vs 306) — presumably a bias column added by
# `build_model_data`; confirm in `implementations`.
y, tx = build_model_data(data_cross, pred_cross)
y_test, tx_test = build_model_data(data_test, pred_test)

n_model_features = tx.shape[1]
print(f"The data has now {n_model_features} features !")

The data has now 307 features !


## Linear regression using ridge regression

Here we train our model using ridge regression with normal equations

In [13]:
# Grid search over the ridge penalty: for every lambda in [0, 3) with step
# 0.01, fit ridge regression on (y, tx), threshold the linear output at 0.5
# and record the resulting accuracy on the same training data.
#
# Outputs consumed by the following cell:
#   accuracy_RR   - list with one accuracy per value in lambda_values
#   maxAccuracy   - best accuracy seen
#   best_lambda   - lambda that produced maxAccuracy
#   start_time / end_time - wall-clock bounds of the search
#
# NOTE(review): `ridge_regression` is expected to come from the star-import of
# `implementations`; the recorded run failed with a NameError, so check that
# the module actually defines it.
# NOTE(review): this is training accuracy; choosing lambda on held-out data
# (e.g. tx_test / y_test or cross-validation) would be a sounder criterion.
# NOTE(review): the comparison assumes labels are encoded as 0/1 — confirm.
accuracy_RR = []
maxAccuracy = 0
best_lambda = None  # bound up-front so the reporting cell never hits an unbound name
lambda_values = np.arange(0, 3, 0.01)

start_time = datetime.datetime.now()

for lambda_ in lambda_values:
    w = ridge_regression(y, tx, lambda_)

    # Predictions are made on `tx`, so they must be scored against `y` (the
    # labels of those same rows), not against the full-length `pred` vector.
    actual_pred = (np.dot(tx, w) > 0.5).astype(float)
    correctly_classified = np.sum(actual_pred == y)
    accuracy = correctly_classified / len(y)

    # The first iteration always records a candidate; afterwards only a
    # strictly better accuracy replaces it (ties keep the smaller lambda).
    if best_lambda is None or maxAccuracy < accuracy:
        maxAccuracy = accuracy
        best_lambda = lambda_

    accuracy_RR.append(accuracy)

end_time = datetime.datetime.now()


NameError: name 'ridge_regression' is not defined

In [None]:
# Plot the accuracy recorded for every lambda of the grid search, then report
# how long the search took and which lambda gave the best accuracy.
fig, ax = plt.subplots()
ax.plot(lambda_values, accuracy_RR)
ax.set_xlabel('Lambda')
ax.set_ylabel('Accuracy')
ax.set_title('Ridge Regression: Accuracy vs Lambda')

elapsed_seconds = (end_time - start_time).total_seconds()
print(f"Execution time {elapsed_seconds} seconds")
print("The best accuracy in Ridge Regression is obtained for lambda =", format(best_lambda, '.2f'), "at", maxAccuracy)
