# REGULARIZED LOGISTIC REGRESSION

### imports

In [24]:
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
from implementations import *
from IPython.core.display import display, HTML

%matplotlib inline
%load_ext autoreload
%autoreload 2
display(HTML("<style>.container { width:95% !important; }</style>"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### constants definition

In [25]:
#dataset layout: number of feature columns and the sentinel marking an undefined measurement
NBR_FEATURES = 30
UNDEFINED_VALUE = -999.

#jet-number grouping: column of tX that holds the jet number, and the values it is split on
PRI_JET_NUM_IDX = 22
PRI_JET_NUM_VALUES = range(4)
NUMBER_GROUPS = len(PRI_JET_NUM_VALUES)

### training data : loading and feature engineering

In [26]:
#loading the training data
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

#separating the data within the groups (with respect to the jet_number);
#each entry of jet_groups_indices is a boolean row mask over tX
jet_groups_indices = [tX[:, PRI_JET_NUM_IDX] == pri_jet_num_value for pri_jet_num_value in PRI_JET_NUM_VALUES]
Y_arr = [y[group_indices] for group_indices in jet_groups_indices]
TX_arr = [tX[group_indices] for group_indices in jet_groups_indices]

#collecting the indices of the undefined features for each group
#(a feature is undefined for a group when every row holds UNDEFINED_VALUE)
undefined_features = [
    [feature_idx for feature_idx in range(NBR_FEATURES)
     if np.all(TX_arr[group_idx][:, feature_idx] == UNDEFINED_VALUE)]
    for group_idx in range(NUMBER_GROUPS)
]

#computing the std of the features for each group
STDS = [np.std(TX_arr[group_idx], axis=0) for group_idx in range(NUMBER_GROUPS)]

#collecting the indices of the features with no variance (i.e. constant features) within each group;
#fully undefined features also have zero std, so they are excluded here to keep the two lists disjoint
cst_features = [
    [feature_idx for feature_idx, std in enumerate(STDS[group_idx])
     if std == 0. and feature_idx not in undefined_features[group_idx]]
    for group_idx in range(NUMBER_GROUPS)
]

#deleting the features either undefined or with no variance (i.e. constant features) within each group
features_to_keep = [
    [feature_idx for feature_idx in range(NBR_FEATURES)
     if feature_idx not in undefined_features[group_idx] and feature_idx not in cst_features[group_idx]]
    for group_idx in range(NUMBER_GROUPS)
]
TX_arr = [TX_arr[group_idx][:, features_to_keep[group_idx]] for group_idx in range(NUMBER_GROUPS)]

#computing the median of each feature (ignoring undefined entries) and substituting it for the undefined values;
#the medians are kept in train_medians so the test data can be imputed with the same values
train_medians = []
for group_idx in range(NUMBER_GROUPS):
    group_tx = TX_arr[group_idx]
    medians = np.apply_along_axis(lambda v: np.median(v[v != UNDEFINED_VALUE]), 0, group_tx)
    train_medians.append(medians)
    #medians has one entry per column and broadcasts across the rows
    TX_arr[group_idx] = np.where(group_tx == UNDEFINED_VALUE, medians, group_tx)

#standardizing the data
def standardize(x):
    """Center each column of ``x`` on zero and scale it to unit standard deviation.

    Statistics are computed along axis 0, i.e. per column for a 2-D array.

    Args:
        x: numeric numpy array.

    Returns:
        A new array of the same shape as ``x``. Columns whose standard
        deviation is zero are returned as all zeros instead of NaN/inf.
    """
    centered_data = x - np.mean(x, axis=0)
    column_std = np.std(centered_data, axis=0)
    #a constant column has zero std; dividing by 1 keeps its centered zeros instead of producing NaN
    safe_std = np.where(column_std == 0, 1.0, column_std)
    return centered_data / safe_std

#each group is standardized independently, using its own column statistics
TX_arr = [standardize(group_tx) for group_tx in TX_arr]

### test data : loading and feature engineering

In [27]:
#loading the test data
DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

#splitting the test rows into the jet-number groups (one boolean row mask per group)
jet_groups_indices_test = [tX_test[:, PRI_JET_NUM_IDX] == jet_num for jet_num in PRI_JET_NUM_VALUES]
TX_test_arr = [tX_test[group_mask] for group_mask in jet_groups_indices_test]

#keeping only the columns that were kept for the corresponding training group
TX_test_arr = [group_tx[:, kept_columns] for group_tx, kept_columns in zip(TX_test_arr, features_to_keep)]

#replacing the undefined values by the training median of the corresponding feature
for group_idx, group_medians in enumerate(train_medians):
    group_tx = TX_test_arr[group_idx]
    for col_num in range(group_tx.shape[1]):
        undefined_rows = group_tx[:, col_num] == UNDEFINED_VALUE
        group_tx[undefined_rows, col_num] = group_medians[col_num]

#standardizing the data
#NOTE(review): each test group is scaled with its own mean/std here, not with the training
#statistics - confirm this is intended
TX_test_arr = [standardize(group_tx) for group_tx in TX_test_arr]

### best parameters selection : cross-validation

In [29]:
%%time

seed = 15
#wider search grid (commented out; the reduced grid below is the one actually run)
#degrees= [1, 2, 6]
#k_fold = 4
#max_iters = 2_000
#lambdas = np.logspace(-7, 2, 5)
#gammas = np.logspace(-6, -2, 5)

degrees= [2]
k_fold = 4
max_iters = 2_000
lambdas =[1e-3]
gammas = [1e-2]

#one (degree, gamma, lambda_) tuple per group, in the order the training cell reads them
PARAM_arr = []

for group_idx in range(NUMBER_GROUPS):
    #work on copies under group-specific names so the loaded y / tX and Y_arr / TX_arr are left untouched
    y_group = np.array(Y_arr[group_idx])
    #labels are remapped from -1 to 0 before being handed to the logistic-regression cross-validation
    y_group[y_group == -1.0] = 0.0
    tx_group = np.array(TX_arr[group_idx])
    degree, gamma, lambda_ = cross_validation_demo_reg_logistic(y_group, tx_group, max_iters, seed, degrees, k_fold, lambdas, gammas)
    PARAM_arr.append((degree, gamma, lambda_))
    print(f" ---> for group {group_idx}, the obtained best degree is {degree} and lambda is {lambda_}")

    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[952925.44361433]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[1068939.12836857]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[972891.60358377]]
    Current iteration=0, loss=[[51940.29082808]]
    Current iteration=1000, loss=[[975340.65282257]]
 ---> for group 0, the obtained best degree is 2 and lambda is 0.001
    Current iteration=0, loss=[[40312.05372701]]
    Current iteration=1000, loss=[[813021.77125258]]
    Current iteration=0, loss=[[40312.05372701]]
    Current iteration=1000, loss=[[779704.38813297]]
    Current iteration=0, loss=[[40312.05372701]]
    Current iteration=1000, loss=[[792154.05662968]]
    Current iteration=0, loss=[[40312.05372701]]
    Current iteration=1000, loss=[[779277.89327428]]
 ---> for group 1, the obtained best degree is 2 and lambda is 0.001
    Current iteration=0, loss=[[26188.4867759

### training and generating the predictions

In [21]:
#hard-coded hyper-parameters, one tuple per jet group, laid out as (degree, gamma, lambda_)
PARAM_arr = [
    (1, 0.001, 0.01),
    (1, 0.01, 0.001),
    (2, 0.01, 0.01),
    (1, 0.01, 0.01),
]

In [22]:
#training model and generating the predictions for each group
max_iters = 5_000
y_pred = np.empty(tX_test.shape[0])
for group_idx in range(NUMBER_GROUPS):
    #training
    degree, gamma, lambda_ = PARAM_arr[group_idx]
    tx_train = build_poly(TX_arr[group_idx], degree)
    #remap the -1 labels to 0 on a fresh array so Y_arr is not modified by this cell
    y_train = np.where(Y_arr[group_idx] == -1.0, 0.0, Y_arr[group_idx])
    initial_w = np.zeros((tx_train.shape[1], 1))
    weight, loss = reg_logistic_regression(y_train, tx_train, lambda_, initial_w, max_iters, gamma)

    #prediction: the test group is expanded with the same degree and its rows are written
    #back at their original positions through the group's boolean mask
    tx_test = build_poly(TX_test_arr[group_idx], degree)
    y_pred[jet_groups_indices_test[group_idx]] = predict_labels(weight, tx_test).flatten()

#creating csv file
OUTPUT_PATH = '../data/sample-submission_regularized_with_standardization.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

    Current iteration=0, loss=[[69254.41425129]]


  return 1.0 / (1 + np.exp(-t))


    Current iteration=1000, loss=[[549630.13489449]]
    Current iteration=2000, loss=[[549630.13375641]]
    Current iteration=3000, loss=[[549630.13375386]]
    Current iteration=4000, loss=[[549630.13375298]]
    Current iteration=0, loss=[[53749.40496934]]
    Current iteration=1000, loss=[[2493429.54409721]]
    Current iteration=2000, loss=[[2493429.54407204]]
    Current iteration=3000, loss=[[2493429.54406212]]
    Current iteration=4000, loss=[[2493429.54405241]]
    Current iteration=0, loss=[[34920.06180943]]
    Current iteration=1000, loss=[[1898403.40528716]]
    Current iteration=2000, loss=[[1898462.85541471]]
    Current iteration=3000, loss=[[1914180.40885871]]
    Current iteration=4000, loss=[[1922592.13204248]]
    Current iteration=0, loss=[[15362.91410993]]
    Current iteration=1000, loss=[[658573.11190147]]
    Current iteration=2000, loss=[[658573.10509506]]
    Current iteration=3000, loss=[[658573.10509496]]
    Current iteration=4000, loss=[[658573.10509496