# Imports

In [None]:
import csv
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
%load_ext autoreload
%autoreload 2

from helpers import *
from implementations import *

# Test Functions

In [None]:
from tests import ALL_TESTS, y_testing, tx_testing, initial_w_testing

# Run every check exported by the `tests` module. Each check is first called with
# (y, tx); if that raises TypeError it is retried with an initial weight vector as
# third argument (presumably because some checks require one).
for check in ALL_TESTS:
    try:
        check(y_testing(), tx_testing())
    except TypeError:
        # NOTE(review): a TypeError raised *inside* a check also ends up here and
        # triggers the three-argument retry.
        check(y_testing(), tx_testing(), initial_w_testing())

# Loading Higgs Model data

In [None]:
from pathlib import Path
# Folder expected to hold the CSV files: "data" under the current working directory.
DATA = Path().resolve() / "data"
print("Looking for the data in", DATA)
# load_csv_data comes from a star import above; its result is unpacked here as
# (y, tx, ids) for each file.
y_test,  tx_test,  ids_test  = load_csv_data(DATA / "test.csv")
y_train, tx_train, ids_train = load_csv_data(DATA / "train.csv")

In [None]:
# Rows of the training matrix are samples, columns are features.
N, D = tx_train.shape

print(f'Number of samples: {N}')
print(f'Number of features: {D}')

# Visualizing the data

In [None]:
# normalize each feature
def normalize_features(x):
    """Standardize every column of ``x`` to zero mean and unit standard deviation.

    Args:
        x: array whose statistics are taken along axis 0 (one column per feature).

    Returns:
        Array of the same shape as ``x``. A column with zero standard deviation
        (a constant feature) is mapped to all zeros instead of NaN/inf.
    """
    centered = x - np.mean(x, axis=0)
    std = np.std(x, axis=0)
    # A constant column has std == 0; dividing by 1 there leaves the (already zero)
    # centered values untouched and avoids a 0/0 division.
    safe_std = np.where(std == 0, 1.0, std)
    return centered / safe_std

# Standardized copy of the raw training features, used by the plots below.
tx_normalised = normalize_features(tx_train)

In [None]:
def boxplot_every_feature(tx):
    """Show the columns of ``tx`` as boxplots on one wide (20x10) figure.

    Args:
        tx: data handed straight to matplotlib's ``boxplot``.
    """
    _, axis = plt.subplots(figsize=(20, 10))
    axis.boxplot(tx)
    axis.set_xlabel('Feature')
    axis.set_ylabel('Deviation from the mean')
    axis.set_title("Boxplot of each feature")
    plt.show()

# Boxplots of the standardized training features.
boxplot_every_feature(tx_normalised)

In [None]:
def plot_each_feature(x, y):
    """Draw one histogram panel per feature, with both classes overlaid.

    The panels are laid out on a square grid inside a single figure; samples
    labelled -1 and 1 get separate, semi-transparent 50-bin histograms.

    Args:
        x: 2D array, one column per feature.
        y: labels aligned with the rows of ``x`` (values -1 and 1 are plotted).
    """
    n_features = x.shape[1]
    # Smallest square grid with room for every feature.
    grid = int(np.ceil(np.sqrt(n_features)))
    plt.figure(figsize=(20, 15))
    for idx, column in enumerate(x.T):
        plt.subplot(grid, grid, idx + 1)
        for cls in (-1, 1):
            plt.hist(column[y == cls], bins=50, alpha=0.5, label=str(cls))
        plt.legend(loc='upper right')
        plt.title(f'Feature {idx}')
    plt.show()
# Per-feature histograms of the standardized training data, split by label.
plot_each_feature(tx_normalised, y_train)

In [None]:
# "Weirdness" of a sample: sum over all features of the absolute standardized value.
weirdness = np.sum(np.abs(tx_normalised), axis=1)
# histogram of weirdness
plt.figure(figsize=(20, 10))
plt.hist(weirdness, bins=100)
plt.xlabel('sum of deviations from the mean')
plt.ylabel('number of samples')
plt.title('Histogram of weirdness')
plt.show()

WEIRDNESS_THRESHOLD = 60
print(f'Number of samples with weirdness > {WEIRDNESS_THRESHOLD}: {np.sum(weirdness > WEIRDNESS_THRESHOLD)}')

# Keep everything that was not reported above, including samples sitting exactly on
# the threshold. `<=` (rather than negating the `>` test) still drops NaN scores.
# The mask is built once so features and labels are filtered identically.
keep = weirdness <= WEIRDNESS_THRESHOLD
tx_cleaner = tx_normalised[keep]
y_cleaner = y_train[keep]

In [None]:
# Standardize again: tx_cleaner is a row subset of tx_normalised, so its columns
# are no longer exactly zero-mean / unit-std.
tx_cleaner = normalize_features(tx_cleaner)

boxplot_every_feature(tx_cleaner)

### We now look into the correlations between the features

In [None]:
# Correlation coefficient for every pair of features; the data is transposed so that
# each feature (column of tx_cleaner) becomes one row of the input.
correlation_matrix = np.corrcoef(tx_cleaner.T)

# Show the matrix as a heatmap together with its colour scale
fig, ax = plt.subplots()
heatmap = ax.imshow(correlation_matrix, cmap='hot', interpolation='nearest')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Feature correlation matrix')
plt.show()

In [None]:
# Visually inspect the correlation between strongly correlated features
# (i.e. features whose absolute correlation coefficient exceeds COR_THRESHOLD)
from math import ceil

COR_THRESHOLD = 0.95  # or 0.99
strong_correlations = np.where(np.abs(correlation_matrix) > COR_THRESHOLD)

# The matrix is symmetric and its diagonal compares a feature with itself, so only
# entries above the diagonal are kept: every correlated pair appears exactly once.
correlated_pairs = [(int(i), int(j)) for i, j in zip(*strong_correlations) if i < j]
nb_correlations = len(correlated_pairs)

plt.figure(figsize=(20, 30))
# Scatter a random subset of 1000 rows (drawn with replacement) instead of all samples.
indices = np.random.randint(0, tx_cleaner.shape[0], 1000)
nb = 0

groups = []
for i, j in correlated_pairs:
    # Merge {i, j} with every existing group it touches. This keeps the groups
    # disjoint even when the pair links two groups that were separate so far.
    merged = {i, j}
    untouched = []
    for group in groups:
        if i in group or j in group:
            merged |= group
        else:
            untouched.append(group)
    groups = untouched + [merged]

    print(f'Correlation between feature {i} and feature {j}: {correlation_matrix[i, j]}')
    # Four panels per row; the row count follows from the number of plotted pairs.
    plt.subplot(ceil(nb_correlations / 4), 4, nb + 1)
    plt.scatter(tx_cleaner[indices, i], tx_cleaner[indices, j], c=y_cleaner[indices])
    plt.xlabel(f'Feature {i}')
    plt.ylabel(f'Feature {j}')
    plt.title(f'Correlation between {i} and {j}: {correlation_matrix[i, j]}')
    nb += 1
plt.show()

print(f'Number of groups of strongly correlated features: {len(groups)}')
print(f'Number of strongly correlated features: {sum(len(group) for group in groups)}')
print(f'Groups of strongly correlated features: {groups}')

In [None]:
# We remove the features that are strongly correlated together,
# as they don't hold any additional information
# From each group the lowest-numbered feature is kept and the others are dropped.
# Sorting makes that choice independent of set iteration order, and collecting the
# indices in a set first means a feature listed in two groups is deleted only once.
to_delete = sorted({feature for group in groups for feature in sorted(group)[1:]})
print(f'Deleting features {to_delete}')
tx_clean = np.delete(tx_cleaner, to_delete, axis=1)

# Print the new correlation matrix
correlation_matrix = np.corrcoef(tx_clean.T)
plt.figure()
plt.imshow(correlation_matrix, cmap='hot', interpolation='nearest')
plt.colorbar()
plt.title('Feature correlation matrix')
plt.show()


# Binary classification

In [None]:
def split_data(x, y, ratio, seed=1):
    """Shuffle the samples and cut them into a training part and a test part.

    Args:
        x: features, indexed along axis 0 by sample.
        y: labels, one per sample (``len(y)`` fixes the number of samples).
        ratio: fraction of the samples that goes to the training part.
        seed: value passed to ``np.random.seed`` (this reseeds NumPy's global generator).

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    np.random.seed(seed)

    n_samples = len(y)
    shuffled = np.random.permutation(n_samples)
    cut = int(ratio * n_samples)
    head, tail = shuffled[:cut], shuffled[cut:]
    # Yields x[head], y[head], x[tail], y[tail] in that order.
    return tuple(data[part] for part in (head, tail) for data in (x, y))

# 80/20 split of the cleaned data. NOTE: this rebinds y_train and y_test, which
# until now held the labels loaded from train.csv / test.csv.
x_train, y_train, x_test, y_test = split_data(tx_clean, y_cleaner, 0.8)

In [None]:
# Start from random normal weights, one per feature. The sampler lives in
# `np.random`; `np.randn` does not exist and raises AttributeError.
# NOTE(review): confirm that mean_squarred_error_sgd needs no further arguments
# (e.g. an iteration count or a step size).
mean_squarred_error_sgd(y_train, x_train, np.random.randn(x_train.shape[1]))

## Logistic Regression

In [None]:
# Turn the held-out labels into a column of shape (N_test, 1).
N_test = y_test.shape[0]

y_test = y_test.reshape((N_test, 1))

In [None]:
# Train on the cleaned split (x_train / y_train) rather than on the raw tx_train:
# y_train was rebound by split_data and no longer has one entry per row of tx_train.
n_train, n_features = x_train.shape

y_train = np.reshape(y_train, (n_train, 1))
max_iter = 150
gamma = 0.1
initial_weights = np.zeros((n_features, 1))

weights, loss = logistic_regression(y_train, x_train, initial_weights, max_iter, gamma)

# Evaluate on the held-out part of the same split so features and labels line up.
print(f'Test loss: {calculate_loss(y_test, x_test, weights)}')

In [None]:
## TODO: figure out how to visualize this result

## Other ideas: hyperparameter search for gamma, or a gamma schedule that decreases over time

In [None]:
# Summary of the unregularized run: the settings used and the loss returned by logistic_regression.
print(f'[Logistic Regression] In max_iter={max_iter}, with hyperparameter gamma={gamma} we obtain a loss={loss}')

## Regularized Logistic Regression

## K-fold cross validation

In [None]:
def build_k_indices(y, k_fold, seed):
    """Randomly assign sample indices to ``k_fold`` equally sized folds.

    Args:
        y:      array whose first dimension gives the number of samples N
        k_fold: number of folds
        seed:   value passed to np.random.seed (reseeds NumPy's global generator)

    Returns:
        A 2D array of shape=(k_fold, int(N / k_fold)); row k lists the sample
        indices of fold k. When N is not a multiple of k_fold, the trailing
        indices of the permutation are left out of every fold.

    >>> build_k_indices(np.array([1., 2., 3., 4.]), 2, 1)
    array([[3, 2],
           [0, 1]])
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    # Keep only as many indices as fill complete folds, then cut them into rows.
    return shuffled[:k_fold * fold_size].reshape(k_fold, fold_size)

def cross_validation(y, x, k_indices, k, lambda_, gamma=0.1, max_iters=150):
    """Fit regularized logistic regression with fold ``k`` held out and score both parts.

    Args:
        y:          labels, indexed along axis 0 with a boolean mask of length N
        x:          shape=(N,D)
        k_indices:  2D array returned by build_k_indices()
        k:          scalar, index of the held-out fold (N.B.: not to be confused with
                    k_fold, which is the number of folds)
        lambda_:    scalar, regularization parameter passed to reg_logistic_regression()
        gamma:      scalar, step size passed to reg_logistic_regression()
        max_iters:  scalar, iteration count passed to reg_logistic_regression()

    Returns:
        (loss_tr, loss_te): sqrt(2 * compute_mse(...)) evaluated on the training
        samples and on the held-out fold, using the fitted weights.
    """
    N, D = x.shape

    # ***************************************************
    # get k'th subgroup in test, others in train
    # ***************************************************
    # Every sample not listed in fold k is used for training, including samples
    # that belong to no fold at all.
    te_mask = np.zeros(N, dtype=bool)
    te_mask[k_indices[k]] = True

    y_te, y_tr = y[te_mask], y[~te_mask]
    x_te, x_tr = x[te_mask], x[~te_mask]

    # ***************************************************
    # Regularized logistic regression, started from all-zero weights
    # ***************************************************
    initial_weights = np.zeros((D, 1))
    weights, loss = reg_logistic_regression(y_tr, x_tr, lambda_, initial_weights, max_iters, gamma)

    # ***************************************************
    # calculate the loss for train and test data
    # ***************************************************
    loss_tr = np.sqrt(2 * compute_mse(y_tr, x_tr, weights))
    loss_te = np.sqrt(2 * compute_mse(y_te, x_te, weights))

    return loss_tr, loss_te

def cross_validation_demo(y, x, k_fold, lambdas):
    """Pick the regularisation parameter with the lowest cross-validated test RMSE.

    Args:
        y: labels, forwarded to build_k_indices() and cross_validation()
        x: features, forwarded to cross_validation()
        k_fold: integer, the number of folds
        lambdas: shape = (p, ) where p is the number of values of lambda to test
    Returns:
        best_lambda : scalar, value of the best lambda
        best_rmse : scalar, the associated root mean squared error for the best lambda
    """
    # The folds are drawn once, with a fixed seed, and shared by all lambdas.
    k_indices = build_k_indices(y, k_fold, 12)

    # Per-lambda averages over the folds, for the training and the held-out part.
    rmse_tr, rmse_te = [], []
    for lambda_ in lambdas:
        fold_losses = [
            cross_validation(y, x, k_indices, k, lambda_) for k in np.arange(k_fold)
        ]
        rmse_tr.append(sum(tr for tr, _ in fold_losses) / k_fold)
        rmse_te.append(sum(te for _, te in fold_losses) / k_fold)

    # The winner is the lambda whose averaged held-out RMSE is smallest.
    best_idx = np.argmin(rmse_te)
    best_lambda, best_rmse = lambdas[best_idx], rmse_te[best_idx]

    print("The choice of lambda which leads to the best test rmse is %.5f with a test rmse of %.3f" % (best_lambda, best_rmse))
    return best_lambda, best_rmse

##Fn call
# Use x_train (the split of the cleaned data): its rows line up with y_train,
# which is no longer the case for the raw tx_train.
best_lambda, best_rmse = cross_validation_demo(y_train, x_train, 4, np.logspace(-4, 0, 30))

In [None]:
def best_param_selection(y, x, gammas, k_fold, lambdas, seed=1):
    """Grid-search the regularisation parameter lambda and the gradient descent step gamma.

    Every (lambda, gamma) pair is scored by its test RMSE averaged over the folds.

    NOTE(review): cross_validation() is called with the step size as sixth
    positional argument; make sure its signature accepts it.

    Args:
        y: labels, forwarded to build_k_indices() and cross_validation()
        x: features, forwarded to cross_validation()
        gammas: shape = (d,), where d is the number of values of gamma to test
        k_fold: integer, the number of folds
        lambdas: shape = (p, ) where p is the number of values of lambda to test
        seed: random seed used to draw the folds
    Returns:
        best_gamma  : scalar, value of the best gamma
        best_lambda : scalar, value of the best lambda
        best_rmse : value of the rmse for the couple (best_gamma, best_lambda)
    """
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)

    # ***************************************************
    # cross validation over lambdas and gammas
    # ***************************************************
    # rmse_matrix_te[l][g] is the averaged test RMSE for (lambdas[l], gammas[g]).
    rmse_matrix_te = []
    for lambda_ in lambdas:
        rmse_te = []
        for gamma in gammas:
            fold_te = [
                cross_validation(y, x, k_indices, k, lambda_, gamma)[1]
                for k in np.arange(k_fold)
            ]
            rmse_te.append(sum(fold_te) / k_fold)
        rmse_matrix_te.append(rmse_te)

    # argmin indexes the flattened, row-major matrix; a row holds len(gammas) entries,
    # so dividing by that length recovers (lambda index, gamma index).
    l_idx, g_idx = divmod(int(np.argmin(rmse_matrix_te)), len(gammas))

    best_lambda = lambdas[l_idx]
    best_gamma = gammas[g_idx]
    best_rmse = rmse_matrix_te[l_idx][g_idx]

    return best_gamma, best_lambda, best_rmse

In [None]:
lambda_ = 0.05
# One weight per column of x_train. The model is fitted on the cleaned split because
# y_train holds the labels of that split, not of the raw tx_train.
initial_weight = np.zeros((x_train.shape[1], 1))
gamma = 0.1
max_iter = 150

weights, loss = reg_logistic_regression(y_train, x_train, lambda_, initial_weight, max_iter, gamma)

# Loss on the held-out part of the same split (x_test / y_test).
print(calculate_loss(y_test, x_test, weights))

In [None]:
# Report the actual held-out loss instead of a hard-coded placeholder value.
test_loss = calculate_loss(y_test, x_test, weights)
print(f'[Reg. Logistic Regression] In max_iter={max_iter}, with hyperparameters lambda={lambda_} and gamma={gamma} we obtain a loss={loss} and a test loss={test_loss}')