In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from implementations import *
from functions import *
from helper import *

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *
# Locations of the CSV files, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv' # path of the training CSV
DATA_TEST_PATH = '../data/test.csv'
# Load the training set; per the heading above the three results are the class
# labels (y), the feature matrix (tX) and the event ids (ids).
# load_csv_data comes from one of the star imports (presumably proj1_helpers).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
# Load the test set. The event ids are bound to `ids_test` so that the training
# `ids` loaded in the previous cell are not overwritten.
y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

## Do your crazy machine learning thing here :) ...

## 1. Test only on the rows where column 22 equals 0

In [None]:
# Index of the column the events are grouped by (the same column that
# get_jet_index() inspects).
i_PRI = 22

# Build the row mask once and apply it to both the labels and the features.
_zero_mask = tX[:, i_PRI] == 0
y_0, tx_0 = y[_zero_mask], tX[_zero_mask]

In [None]:
# Remove columns 4-6, 12 and 22-29 from the tx_0 subset.
# NOTE(review): presumably these are the features that carry no information when
# column 22 is 0 — confirm. The later per-group cell keeps column 29 for this
# same group, so the two lists disagree.
tx_0_filtered = np.delete(tx_0, [4,5,6,12,22,23,24,25,26,27,28,29], axis=1)

In [None]:
# Display the second row (index 1) of the filtered matrix as a quick sanity check.
tx_0_filtered[1,:]

In [None]:
def standardize(x, mean=None, std=None):
    """Standardize the columns of ``x`` (subtract the mean, divide by the std).

    NaN entries are ignored when the statistics are estimated (``np.nanmean`` /
    ``np.nanstd``) and remain NaN in the output.

    Args:
        x: array of shape (n_samples, n_features).
        mean: per-column values to subtract; estimated from ``x`` when None.
        std: per-column values to divide by; estimated from the centred data
            when None.

    Returns:
        Tuple ``(x_standardized, mean, std)``. ``mean`` and ``std`` are the
        statistics that were subtracted / estimated, so they can be passed back
        in to transform another data set the same way.
    """
    if mean is None:
        mean = np.nanmean(x, axis=0)
    x = x - mean

    if std is None:
        std = np.nanstd(x, axis=0)
    # A constant column has std == 0; dividing by it would yield NaN/inf.
    # Divide such columns by 1 instead so they simply stay centred at 0.
    safe_std = np.where(np.asarray(std) == 0, 1.0, std)
    x = x / safe_std
    return x, mean, std


def get_jet_index(x):
    """Return the row indices of the three groups defined by column 22.

    The groups are, in order: rows where column 22 equals 0, rows where it
    equals 1, and rows where it is 2 or more.

    Returns:
        A list of three 1-D integer index arrays.
    """
    jet_values = x[:, 22]
    group_masks = (jet_values == 0, jet_values == 1, jet_values >= 2)
    return [np.flatnonzero(mask) for mask in group_masks]

def delta_angle_norm(a, b):
    """Calculate the difference ``a - b`` between two angles in radians,
    normalized to the half-open interval ]-pi, pi].

    A single wrap of 2*pi is applied, which is sufficient when both inputs lie
    in [-pi, pi]. NaN entries stay NaN.

    Args:
        a, b: scalars or array-likes of angles (broadcast against each other).

    Returns:
        A new ndarray with the wrapped differences; the inputs are not modified.
    """
    delta = np.asarray(a) - np.asarray(b)
    # `<=` so that exactly -pi is mapped to +pi, as the documented interval requires.
    delta = np.where(delta <= -np.pi, delta + 2 * np.pi, delta)
    delta = np.where(delta > np.pi, delta - 2 * np.pi, delta)
    return delta


def add_phi(x):
    """Append four angle-difference columns, each relative to column 15.

    The new columns are, in order, the normalized differences
    PRI_lep_phi (18), PRI_met_phi (20), PRI_jet_leading_phi (25) and
    PRI_jet_subleading_phi (28) minus PRI_tau_phi (15).

    Returns:
        A new array with the four columns appended on the right of ``x``.
    """
    tau_phi = x[:, 15]
    other_phi_columns = (18, 20, 25, 28)
    new_columns = [
        delta_angle_norm(x[:, col], tau_phi).reshape(-1, 1)
        for col in other_phi_columns
    ]
    return np.concatenate([x] + new_columns, axis=1)


def apply_log1p(x):
    """Apply ``log(1 + value)`` to the long-tailed feature columns.

    The transform is applied to a copy, so the caller's array is left
    untouched; non-floating input is converted to float so the logarithms
    are not truncated.

    Args:
        x: array with at least 30 columns.

    Returns:
        A new float array in which the columns listed in ``long_tail`` have
        been replaced by their ``np.log1p`` values.
    """
    long_tail = [0, 1, 2, 3, 5, 8, 9, 10, 13, 16, 19, 21, 23, 26, 29]
    x = np.array(x, copy=True)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)
    x[:, long_tail] = np.log1p(x[:, long_tail])
    return x


def drop_useless(x):
    """Remove columns that carry no usable information.

    Three kinds of columns are removed:
      * the raw phi angles (columns 15, 18, 20, 25, 28),
      * columns whose NaN-ignoring standard deviation is 0 (a single value),
      * columns that contain only NaN.

    Returns:
        A new array without those columns.
    """
    raw_phi = {15, 18, 20, 25, 28}
    constant = set(np.flatnonzero(np.nanstd(x, axis=0) == 0).tolist())
    all_nan = set(np.flatnonzero(np.isnan(x).all(axis=0)).tolist())

    doomed = sorted(raw_phi | constant | all_nan)
    return np.delete(x, doomed, axis=1)


def fill_missing(x):
    """Replace the missing-value marker -999 with NaN.

    The replacement is done on a copy so the caller's array is not modified;
    non-floating input is converted to float first because NaN cannot be
    stored in an integer array.

    Args:
        x: array-like of feature values.

    Returns:
        A new float array equal to ``x`` except that every -999 is NaN.
    """
    x = np.array(x, copy=True)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)
    # use nan as missing value
    x[x == -999] = np.nan
    return x


def fill_nan(x):
    """Return a copy of ``x`` in which every NaN is replaced by 0.

    ``np.nan_to_num`` is used with its defaults, so infinities are also
    replaced by very large finite numbers.
    """
    # An alternative that was tried here is to fill each column's NaNs with
    # that column's most frequent value; zero-filling is what is in use.
    return np.nan_to_num(x)


def _useless_column_indices(x):
    """Return the sorted indices of raw-phi, constant and all-NaN columns of ``x``."""
    raw_angle = [15, 18, 20, 25, 28]
    same_cols = np.flatnonzero(np.nanstd(x, axis=0) == 0).tolist()
    nan_cols = np.flatnonzero(np.all(np.isnan(x), axis=0)).tolist()
    return sorted(set(raw_angle + same_cols + nan_cols))


def preprocessing(x_train, x_test):
    """Preprocess a train/test pair of feature matrices consistently.

    Steps: mark -999 as NaN, append the phi-difference features, log-transform
    the long-tailed columns, drop uninformative columns, standardize, and
    finally replace the remaining NaNs with 0.

    Everything that is *learned* from data (which columns to drop, the mean and
    std used for standardization) is taken from the training matrix only and
    then applied to both matrices, so both outputs always have the same
    columns in the same order.

    Args:
        x_train: raw training feature matrix.
        x_test: raw test feature matrix with the same column layout.

    Returns:
        Tuple ``(x_train_processed, x_test_processed)``. The input arrays are
        not modified.
    """
    # Work on float copies so the caller's arrays are never altered, whatever
    # the individual steps do internally.
    x_train = fill_missing(np.array(x_train, dtype=float))
    x_test = fill_missing(np.array(x_test, dtype=float))

    # add new phi features
    x_train = add_phi(x_train)
    x_test = add_phi(x_test)

    # apply log normalization
    x_train = apply_log1p(x_train)
    x_test = apply_log1p(x_test)

    # Drop useless columns. The set is decided on the training data alone;
    # deciding it separately for each matrix could remove different columns
    # and misalign the features.
    to_drop = _useless_column_indices(x_train)
    x_train = np.delete(x_train, to_drop, axis=1)
    x_test = np.delete(x_test, to_drop, axis=1)

    # standardization (test data reuses the training statistics)
    x_train, mean, std = standardize(x_train)
    x_test, _, _ = standardize(x_test, mean, std)

    # fill nan
    x_train = fill_nan(x_train)
    x_test = fill_nan(x_test)

    return x_train, x_test

In [None]:
# Smoke test: run the pipeline with the training matrix as both inputs.
# NOTE(review): `aaa` is not used by any later cell shown here.
tX_p, aaa = preprocessing(tX, tX)

In [None]:
# Preprocess the training and test matrices together; preprocessing() scales the
# test matrix with the mean/std estimated on the training matrix.
# NOTE(review): this rebinds tX_test to its preprocessed version, so running the
# cell a second time would preprocess already-preprocessed data — reload the
# test set before re-running.
tX_p, tX_test = preprocessing(tX, tX_test)

In [None]:
# Show the shape produced by feature_expansion on the filtered subset with second
# argument 3 (presumably the polynomial degree — cross_validation passes its
# `degree` in that position; confirm against the helper).
feature_expansion(tx_0_filtered, 3).shape

In [None]:
def build_k_indices(y, k_fold, seed):
    """Split a seeded random permutation of the row indices into k folds.

    Args:
        y: array whose first dimension gives the number of rows.
        k_fold: number of folds.
        seed: value passed to ``np.random.seed`` (this reseeds NumPy's global
            generator).

    Returns:
        Integer array of shape (k_fold, rows // k_fold); row ``k`` holds the
        indices of fold ``k``. Rows left over when the count is not divisible
        by ``k_fold`` belong to no fold.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)
    # Keep only the part that fills k_fold equal folds, one fold per row.
    return shuffled[:k_fold * fold_size].reshape(k_fold, fold_size)

In [None]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Fit ridge regression with fold ``k`` held out and evaluate it.

    Args:
        y: labels, one per row of ``x``.
        x: feature matrix.
        k_indices: per-fold row indices, as returned by ``build_k_indices``.
        k: index of the fold to hold out for testing.
        lambda_: value forwarded to ``ridge_regression``.
        degree: value forwarded to ``feature_expansion``.

    Returns:
        Tuple ``(loss_tr, loss_te, classified)``: ``sqrt(2 * compute_loss)`` on
        the training rows and on the held-out rows, and the fraction of
        held-out rows for which ``predict_labels`` equals the label.
    """
    # Boolean mask of the held-out rows; every other row (including rows that
    # belong to no fold) is used for training.
    held_out = np.zeros(len(y), dtype=bool)
    held_out[k_indices[k]] = True

    x_train, y_train = x[~held_out], y[~held_out]
    x_test, y_test = x[held_out], y[held_out]

    x_train_expanded = feature_expansion(x_train, degree)
    x_test_expanded = feature_expansion(x_test, degree)

    w = ridge_regression(y_train, x_train_expanded, lambda_)

    # np.sqrt rather than math.sqrt: the notebook never imports `math`.
    loss_tr = np.sqrt(2 * compute_loss(y_train, x_train_expanded, w))
    loss_te = np.sqrt(2 * compute_loss(y_test, x_test_expanded, w))

    classified = np.mean(predict_labels(w, x_test_expanded) == y_test)

    return loss_tr, loss_te, classified

In [None]:
def cross_validation_advanced_demo(x, y, degrees=None, lambdas=None, k_fold=2, seed=1):
    """Grid-search degree and lambda with k-fold cross-validation.

    For every (degree, lambda) pair the k folds are evaluated with
    ``cross_validation`` and the fold results are averaged. The pair with the
    lowest mean test loss is printed, and the loss grids are handed to
    ``cross_validation_advanced_visualization``.

    Args:
        x: feature matrix.
        y: labels.
        degrees: iterable of degrees to try; defaults to ``[3]``.
        lambdas: iterable of lambda values to try; defaults to ``[0]``.
        k_fold: number of folds (default 2).
        seed: seed for the fold split (default 1).

    Returns:
        Tuple ``(lambdas, degrees, tr_total, te_total)`` where ``tr_total`` and
        ``te_total`` are lists (one entry per degree) of lists (one entry per
        lambda) of mean train / test losses.
    """
    # Runtime messages are kept in Italian as in the rest of the notebook output:
    # "Dimensione x" = shape of x, "Grado" = degree, "% giusti" = share correct.
    print("Dimensione x: ", x.shape)

    degrees = [3] if degrees is None else list(degrees)
    lambdas = [0] if lambdas is None else list(lambdas)

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)

    # mean losses per degree (outer list) and per lambda (inner list)
    tr_total = []
    te_total = []

    # best configuration seen so far
    min_te = float('inf')
    min_degree = 0
    min_lambda = 0

    for degree in degrees:
        rmse_tr = []
        rmse_te = []

        for lambda_ in lambdas:
            loss_tr = []
            loss_te = []
            loss_classification = []

            for k in range(k_fold):
                tr, te, classified = cross_validation(y, x, k_indices, k, lambda_, degree)
                loss_tr.append(tr)
                loss_te.append(te)
                loss_classification.append(classified)

            mean_te = np.mean(loss_te)
            rmse_tr.append(np.mean(loss_tr))
            rmse_te.append(mean_te)

            print("Grado", degree, ", lambda", lambda_, ", % giusti: ", np.mean(loss_classification))

            if mean_te < min_te:
                min_te = mean_te
                min_degree = degree
                min_lambda = lambda_

        tr_total.append(rmse_tr)
        te_total.append(rmse_te)

    cross_validation_advanced_visualization(lambdas, degrees, tr_total, te_total)

    print("min_degree", min_degree)
    print("min_lambda", min_lambda)

    return lambdas, degrees, tr_total, te_total



In [None]:
# `y_p` (labels as an (n, 1) column) is otherwise only defined in a later cell;
# define it here so this cell also runs on a fresh kernel.
y_p = np.reshape(y, (len(y), 1))
cross_validation_advanced_demo(tX_p, y_p)


In [None]:
# Run the cross-validation demo on the preprocessed features with the labels `y`
# and keep the grids and the per-degree / per-lambda mean losses it returns.
lambdas, degrees, tr_total, te_total = cross_validation_advanced_demo(tX_p, y)

# Logistic regression

In [None]:

# Labels reshaped into a single column, one row per sample.
y_p = y.reshape(-1, 1)

Let's try with logistic regression. 

In [None]:
from feature_analysis import *

In [None]:
# Column the three event groups are split on (the one get_jet_index() inspects).
i_PRI = 22
_jet_values = tX[:, i_PRI]

# Row masks for the three groups: value 0, value 1, value above 1.
_mask_jet0 = _jet_values == 0
_mask_jet1 = _jet_values == 1
_mask_jet2 = _jet_values > 1

y_jet0, tx_jet0 = y[_mask_jet0], tX[_mask_jet0]
y_jet1, tx_jet1 = y[_mask_jet1], tX[_mask_jet1]
y_jet2, tx_jet2 = y[_mask_jet2], tX[_mask_jet2]

# Per group: remove the listed columns, then turn -999 into NaN and NaN into 0.
tx_0_filtered = fill_nan(fill_missing(
    np.delete(tx_jet0, [4,5,6,12,22,23,24,25,26,27,28], axis=1)))
tx_1_filtered = fill_nan(fill_missing(
    np.delete(tx_jet1, [4,5,6,12,22,26,27,28], axis=1)))
tx_2_filtered = fill_nan(fill_missing(
    np.delete(tx_jet2, [22], axis=1)))

# Expand each group's features; the second argument is the group number.
tx_train_0 = featureExpand(tx_0_filtered, 0)
tx_train_1 = featureExpand(tx_1_filtered, 1)
tx_train_2 = featureExpand(tx_2_filtered, 2)

In [None]:
from logistic_regression import *

In [None]:
def logistic_regression_penalized_gradient_descent_demo(y, x, max_iter=500, gamma=0.01,
                                                        lambda_=0.1, threshold=1):
    """Fit penalized logistic regression by gradient descent and return the weights.

    A column of ones is prepended to ``x`` and the weights start at zero. Each
    iteration calls ``learning_by_penalized_gradient``; training stops after
    ``max_iter`` iterations or as soon as two consecutive losses differ by less
    than ``threshold``.

    Args:
        y: labels as an (n, 1) column.
        x: feature matrix with n rows (without bias column).
        max_iter: maximum number of iterations.
        gamma: step size forwarded to ``learning_by_penalized_gradient``.
        lambda_: penalty strength forwarded to ``learning_by_penalized_gradient``.
        threshold: absolute loss change below which training stops.

    Returns:
        Weight column vector of shape (n_features + 1, 1); the first entry
        belongs to the bias column.
    """
    losses = []

    # build tx: bias column followed by the features
    tx = np.c_[np.ones((y.shape[0], 1)), x]
    w = np.zeros((tx.shape[1], 1))

    # start the logistic regression
    for n_iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_penalized_gradient(y, tx, w, gamma, lambda_)
        # log info; the accuracy is only needed for the log line, so it is
        # computed only on the iterations that print it
        # (assumes predict_labels has no side effects — confirm).
        if n_iter % 10 == 0:
            classified = sum(predict_labels(w, tx)==y)/len(y)
            print("Current iteration={i}, loss={l}, classified={c}".format(i=n_iter, l=loss, c=classified))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    return w

In [None]:
# First group's labels as a single column, the layout the logistic demo expects.
y_jet0r = y_jet0.reshape(-1, 1)

In [None]:
# Fit the penalized logistic regression on the first group's filtered features.
# NOTE(review): this trains on tx_0_filtered, not on the expanded tx_train_0
# built earlier — confirm that is intended.
w = logistic_regression_penalized_gradient_descent_demo(y_jet0r, tx_0_filtered)

In [None]:
# Prepend a bias column to the preprocessed features and compute the fraction of
# predictions that equal the labels.
# NOTE(review): `w` was fitted on tx_0_filtered, so its length may not match the
# number of columns of `tx` — verify before trusting this number.
tx = np.c_[np.ones((y.shape[0], 1)), tX_p]
classified = sum(predict_labels(w, tx)==y_p)/len(y_p)

In [None]:
# NOTE(review): `tx_test` is not defined in any cell shown here, and the
# predictions are compared against the training labels `y_p` — confirm what this
# cell was meant to measure.
classified = sum(predict_labels(w, tx_test)==y_p)/len(y_p)

## Generate predictions and save output in csv format for submission:

In [None]:
# Path of the test CSV (same location as configured near the top of the notebook);
# an empty path cannot be loaded.
DATA_TEST_PATH = '../data/test.csv'
# Only the features and the event ids are needed for the submission.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not defined in any cell shown here (the vector fitted
# above is named `w`) — bind it to the final model before running this cell.
y_pred = predict_labels(weights, tX_test)
# Write the event ids and predicted labels to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)