In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
from functions import *
from implementations import *
%load_ext autoreload
%autoreload 2

**1. LOAD THE DATA**

We load the data using the helper function `load_csv_data`. The returned value contains 3 arrays: one for the predictions, one for the features, and one for the ids.

In [2]:
# Read both CSV files through the project helper.
# NOTE(review): only the training set is loaded with sub_sample=True while the
# test set is loaded with sub_sample=False - confirm that this is intended for
# the final prediction run and not a leftover from experimentation.
train_path, test_path = 'Data/train.csv', 'Data/test.csv'
train_set = load_csv_data(train_path, sub_sample=True)
test_set = load_csv_data(test_path, sub_sample=False)

OSError: Data/train.csv not found.

**2. SET UP THE DATA**

We separate the 3 arrays as mentioned above.

In [13]:
# Each loaded set is indexed positionally: [0] -> y, [1] -> x, [2] -> ids.
y_train, x_train, ids_train = train_set[0], train_set[1], train_set[2]
y_test, x_test, ids_test = test_set[0], test_set[1], test_set[2]

**3. CLEAN THE DATA**

Looking at the data set, we saw that, depending on the value of `PRI_jet_num`, some features are undefined for all observations. Hence we decided to split the dataset into 3 subsets according to `PRI_jet_num` (0, 1, and 2 or 3). This allows us to delete those features without losing any important information.

In [14]:
""" The following separates the initial test set x_test in three subsets
    according the feature PRI_jet_num which takes its value in the 
    set {0,1,2,3} """

# Stack y, x and ids side by side so that a single row mask keeps the labels,
# the features and the ids of every observation aligned.
complete_test = np.column_stack((y_test, x_test, ids_test))

# Column 23 of the stacked matrix holds PRI_jet_num (y occupies column 0).
# Rows with a value of 2 or more are grouped into one subset.
jet_num_test = complete_test[:, 23]
subset_test_0 = complete_test[jet_num_test == 0]
subset_test_1 = complete_test[jet_num_test == 1]
subset_test_23 = complete_test[jet_num_test >= 2]

# Unstack each subset: first column -> y, last column -> id,
# everything in between -> feature matrix.
y_test_0, x_test_0, id_test_0 = (
    subset_test_0[:, 0], subset_test_0[:, 1:-1], subset_test_0[:, -1]
)
y_test_1, x_test_1, id_test_1 = (
    subset_test_1[:, 0], subset_test_1[:, 1:-1], subset_test_1[:, -1]
)
y_test_2, x_test_2, id_test_2 = (
    subset_test_23[:, 0], subset_test_23[:, 1:-1], subset_test_23[:, -1]
)

In [15]:
""" The following separates the initial training set x in three subsets
    according the feature PRI_jet_num which takes its value in the 
    set {0,1,2,3} """


# Stack y, x and ids side by side so that a single row mask keeps the labels,
# the features and the ids of every observation aligned.
complete_train = np.column_stack((y_train, x_train, ids_train))

# Column 23 of the stacked matrix holds PRI_jet_num (y occupies column 0).
# Rows with a value of 2 or more are grouped into one subset.
jet_num_train = complete_train[:, 23]
subset_train_0 = complete_train[jet_num_train == 0]
subset_train_1 = complete_train[jet_num_train == 1]
subset_train_23 = complete_train[jet_num_train >= 2]

# Unstack each subset: first column -> y, last column -> id,
# everything in between -> feature matrix.
y_train_0, x_train_0, id_train_0 = (
    subset_train_0[:, 0], subset_train_0[:, 1:-1], subset_train_0[:, -1]
)
y_train_1, x_train_1, id_train_1 = (
    subset_train_1[:, 0], subset_train_1[:, 1:-1], subset_train_1[:, -1]
)
y_train_2, x_train_2, id_train_2 = (
    subset_train_23[:, 0], subset_train_23[:, 1:-1], subset_train_23[:, -1]
)

In [16]:
""" The following clean the 3 subsets for the train and test data.
    It checks for the undefined values (-999) and for the zeros value"""

# Drop uninformative feature columns, one jet subset at a time. The column
# indices are always computed on the train subset and then removed from both
# the train subset and the matching test subset, so the two stay aligned.
# Two passes per subset:
#   1. columns returned by get_na_columns(..., 0.95, -999)
#      (share of undefined values above 95%);
#   2. columns returned by get_na_columns(..., 0.99, 0.0)
#      (share of zero values above 99%), evaluated on the output of pass 1,
#      so these indices refer to the already reduced matrix.
# NOTE(review): x_train_1 still contains the PRI_jet_num column, which is
# constant (equal to 1) inside this subset and is targeted by neither the -999
# nor the 0.0 pass - confirm that the later standardize step copes with a
# zero-variance column.

# Jet subset 0
na_indices_0 = get_na_columns(x_train_0, 0.95, -999)
x_train_0_clean = np.delete(x_train_0, na_indices_0, axis=1)
x_test_0_clean = np.delete(x_test_0, na_indices_0, axis=1)
zero_indices_0 = get_na_columns(x_train_0_clean, 0.99, 0.0)
x_train_0_clean = np.delete(x_train_0_clean, zero_indices_0, axis=1)
x_test_0_clean = np.delete(x_test_0_clean, zero_indices_0, axis=1)

# Jet subset 1
na_indices_1 = get_na_columns(x_train_1, 0.95, -999)
x_train_1_clean = np.delete(x_train_1, na_indices_1, axis=1)
x_test_1_clean = np.delete(x_test_1, na_indices_1, axis=1)
zero_indices_1 = get_na_columns(x_train_1_clean, 0.99, 0.0)
x_train_1_clean = np.delete(x_train_1_clean, zero_indices_1, axis=1)
x_test_1_clean = np.delete(x_test_1_clean, zero_indices_1, axis=1)

# Jet subset 2 (PRI_jet_num >= 2)
na_indices_2 = get_na_columns(x_train_2, 0.95, -999)
x_train_2_clean = np.delete(x_train_2, na_indices_2, axis=1)
x_test_2_clean = np.delete(x_test_2, na_indices_2, axis=1)
zero_indices_2 = get_na_columns(x_train_2_clean, 0.99, 0.0)
x_train_2_clean = np.delete(x_train_2_clean, zero_indices_2, axis=1)
x_test_2_clean = np.delete(x_test_2_clean, zero_indices_2, axis=1)

In [17]:
# With a threshold of 0, collect for each cleaned train subset the columns
# that still contain at least one undefined (-999) value.
na_indices_0_rem, na_indices_1_rem, na_indices_2_rem = (
    get_na_columns(cleaned, 0, -999)
    for cleaned in (x_train_0_clean, x_train_1_clean, x_train_2_clean)
)

In [18]:
# Find a model to predict the undefined values using least squares
x_train_0_clean_pred, w_train_0 = predict_na_columns(x_train_0_clean, na_indices_0_rem)
x_train_1_clean_pred, w_train_1 = predict_na_columns(x_train_1_clean, na_indices_1_rem)
x_train_2_clean_pred, w_train_2 = predict_na_columns(x_train_2_clean, na_indices_2_rem)

In [19]:
# Predict the undefined value in the test set using the model obtained with the train set
x_test_0_clean_pred = set_predict_na_columns(x_test_0_clean, w_train_0[0], na_indices_0_rem)
x_test_1_clean_pred = set_predict_na_columns(x_test_1_clean, w_train_1[0], na_indices_1_rem)
x_test_2_clean_pred = set_predict_na_columns(x_test_2_clean, w_train_2[0], na_indices_2_rem)

In [20]:
# Standardize all train and test subsets. and add the intercept at the beginning
x_train_0_std, x_test_0_std = standardize(x_train_0_clean_pred, x_test_0_clean_pred)
x_train_0_std_int = np.column_stack((np.ones(x_train_0_std.shape[0]), x_train_0_std))
x_test_0_std_int = np.column_stack((np.ones(x_test_0_std.shape[0]), x_test_0_std))

x_train_1_std, x_test_1_std = standardize(x_train_1_clean_pred, x_test_1_clean_pred)
x_train_1_std_int = np.column_stack((np.ones(x_train_1_std.shape[0]), x_train_1_std))
x_test_1_std_int = np.column_stack((np.ones(x_test_1_std.shape[0]), x_test_1_std))

x_train_2_std, x_test_2_std = standardize(x_train_2_clean_pred, x_test_2_clean_pred)
x_train_2_std_int = np.column_stack((np.ones(x_train_2_std.shape[0]), x_train_2_std))
x_test_2_std_int = np.column_stack((np.ones(x_test_2_std.shape[0]), x_test_2_std))

**3. GET THE MODEL**

We fit the model using the ridge regression function with parameter $\lambda = 10^{-9}$ and a polynomial expansion of degree 3. Those parameters were obtained by previous computation.

In [27]:
# Polynomial expansion of degree 3 for the 3 train subsets
x_train_0_2 = build_poly(x_train_0_std_int, degree = 3)
x_train_1_2 = build_poly(x_train_1_std_int, degree = 3)
x_train_2_2 = build_poly(x_train_2_std_int, degree = 3)

In [28]:
# Get the model and the loss using ridge regression
w_0, loss_0 = ridge_regression(y_train_0, x_train_0_2, 10**(-9))
w_1, loss_1 = ridge_regression(y_train_1, x_train_1_2, 10**(-9))
w_2, loss_2 = ridge_regression(y_train_2, x_train_2_2, 10**(-9))

**4. PREDICT Y**

Finally we predict the y's.

In [29]:
# Polynomial expansion of degree 3 for the 3 test subsets
x_test_0_2 = build_poly(x_test_0_std_int, degree = 3)
x_test_1_2 = build_poly(x_test_1_std_int, degree = 3)
x_test_2_2 = build_poly(x_test_2_std_int, degree = 3)

In [30]:
# Get the prediction of y, bring back the values into the interval [0, 1] 
# using sigmoid function and transform the zero values into -1
y_0 = zero_to_neg(np.around(sigmoid(x_test_0_2 @ w_0)))
y_1 = zero_to_neg(np.around(sigmoid(x_test_1_2 @ w_1)))
y_2 = zero_to_neg(np.around(sigmoid(x_test_2_2 @ w_2)))

In [31]:
# Build back the predictions vector, concatenated with the ids
s_0 = np.column_stack((id_test_0, y_0))
s_1 = np.column_stack((id_test_1, y_1))
s_2 = np.column_stack((id_test_2, y_2))
s = np.vstack((np.vstack((s_0, s_1)), s_2))

In [32]:
# Sort the predicton by their id
y_pred = s[s[:,0].argsort()].astype(int)

**5. SAVE FILE**

In [38]:
# Write the submission file: column 0 of y_pred holds the ids, column 1 the
# predicted labels.
submission_ids, submission_labels = y_pred[:, 0], y_pred[:, 1]
create_csv_submission(submission_ids, submission_labels, 'final_prediction.csv')