In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from data_processing import prepare_train_data
from expansions import polynomial_expansion
from logistic_regression import cross_validate_logistic_regression
from least_squares import cross_validate_least_squares
from plots import plot_degree_errors_plt


import disk_helpers

from proj1_helpers import *

JET_COLUMN = 22 # the column which we use to divide the dataset in sub-datasets

# Train dataset loading

In [None]:
# Locations of the train and test CSV files, relative to this notebook.
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download the train data and point this path at it
DATA_TEST_PATH = '../data/test.csv'
# load_csv_data is not imported by name in this notebook; presumably it comes
# from the `from proj1_helpers import *` line -- TODO confirm.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
# Reload the train CSV and build the per-jet training data.
# NOTE(review): the same file is already loaded by the previous cell; reloading
# here keeps this cell re-runnable if prepare_train_data modifies its inputs in
# place -- confirm before removing either load.
y, tX, _ = load_csv_data(DATA_TRAIN_PATH)
# y_jets / x_jets are indexed by jet subset (0, 1, 2) in the cells below;
# replacing_values, means and stds are kept as notebook-level variables.
y_jets, x_jets, replacing_values, means, stds = prepare_train_data(y, tX)

# Let's find the best model

`find_best_logistic_regression_model` and `find_best_least_squares_model` let us find the best model given `y` and `tx`.   
The arrays `lambdas` and `degrees` inside each function definition set the range of hyper-parameter values that is explored; change them there to widen or narrow the search.   

A `k_fold`-fold cross-validation is executed for every (degree, lambda) combination. The results are saved to disk and a graph is plotted at the end of the computation. 

In [None]:
def find_best_logistic_regression_model(y, tx, jet_string, max_iter=100, gamma=0.05, k_fold=3, lambdas=None, degrees=None) :
    """Grid-searches the polynomial degree and lambda for logistic regression.

    For every degree the features are expanded with `polynomial_expansion`,
    and every lambda is scored with `cross_validate_logistic_regression`.
    The score grid is saved through `disk_helpers.save_data`, plotted with
    `plot_degree_errors_plt` (as the least-squares search does) and the
    best-scoring combination is returned.

    Args:
        y: labels of the subset to train on.
        tx: features of the subset to train on.
        jet_string: label of the subset, forwarded to `disk_helpers.save_data`.
        max_iter: forwarded unchanged to the cross-validation helper.
        gamma: forwarded unchanged to the cross-validation helper.
        k_fold: number of folds, forwarded to the cross-validation helper.
        lambdas: optional 1-D sequence of lambdas to try;
            defaults to `np.logspace(-10, 4, 15)`.
        degrees: optional 1-D sequence of polynomial degrees to try;
            defaults to `np.arange(1, 7)`.

    Returns:
        `(degree, lambda, score)` of the grid cell with the highest score
        (the first one in row-major order when several cells tie).

    Raises:
        ValueError: if `lambdas` or `degrees` is empty.
    """
    lambdas = np.logspace(-10, 4, 15) if lambdas is None else np.asarray(lambdas)
    degrees = np.arange(1, 7) if degrees is None else np.asarray(degrees)
    if len(lambdas) == 0 or len(degrees) == 0 :
        raise ValueError("`lambdas` and `degrees` must each contain at least one value")

    results = np.zeros((len(degrees), len(lambdas)))

    for i_degree, degree in enumerate(degrees) :
        # The expansion only depends on the degree, so build it once per row.
        expanded_tx = polynomial_expansion(tx, degree)
        for i_lambda, lambda_ in enumerate(lambdas) :
            results[i_degree, i_lambda] = cross_validate_logistic_regression(y, expanded_tx, max_iter, gamma, lambda_, k_fold)
            print("degree={d},\t lambda={l:e},\taccuracy={a}".format(d=degree, l=lambda_, a=results[i_degree, i_lambda]))

    disk_helpers.save_data('logistic-regression', jet_string, degrees, lambdas, results)
    plot_degree_errors_plt(degrees, lambdas, results)

    # argmax works on the flattened grid; unravel_index maps it back to (row, column).
    i, j = np.unravel_index(np.argmax(results, axis=None), results.shape)

    return degrees[i], lambdas[j], results[i, j]

In [None]:
# Run the logistic-regression search on every jet subset and keep all three
# (degree, lambda, accuracy) results, instead of displaying only the last one.
best_logistic_models = {
    jet_string: find_best_logistic_regression_model(y_jets[jet_index], x_jets[jet_index], jet_string)
    for jet_index, jet_string in enumerate(('jet0', 'jet1', 'jet2'))
}
best_logistic_models

In [None]:

def find_best_least_squares_model(y, tx, jet_string, k_fold=4, lambdas=None, degrees=None) :
    """Grid-searches the polynomial degree and lambda for least squares.

    For every degree the features are expanded with `polynomial_expansion`
    (with `mixed_columns=True`), and every lambda is scored with
    `cross_validate_least_squares`. Combinations that raise
    `np.linalg.LinAlgError` are logged and scored 0. The score grid is saved
    through `disk_helpers.save_data` and plotted with `plot_degree_errors_plt`.

    Args:
        y: labels of the subset to train on.
        tx: features of the subset to train on.
        jet_string: label of the subset, forwarded to `disk_helpers.save_data`.
        k_fold: number of folds, forwarded to the cross-validation helper.
        lambdas: optional 1-D sequence of lambdas to try;
            defaults to `np.logspace(-20, 3, 24)`.
        degrees: optional 1-D sequence of polynomial degrees to try;
            defaults to `np.arange(1, 10)`.

    Returns:
        `(degree, lambda, score)` of the grid cell with the highest score
        (the first one in row-major order when several cells tie).

    Raises:
        ValueError: if `lambdas` or `degrees` is empty.
    """
    lambdas = np.logspace(-20, 3, 24) if lambdas is None else np.asarray(lambdas)
    degrees = np.arange(1, 10) if degrees is None else np.asarray(degrees)
    if len(lambdas) == 0 or len(degrees) == 0 :
        raise ValueError("`lambdas` and `degrees` must each contain at least one value")

    results = np.zeros((len(degrees), len(lambdas)))

    for i_degree, degree in enumerate(degrees) :
        # The expansion only depends on the degree, so build it once per row.
        expanded_tx = polynomial_expansion(tx, degree, mixed_columns=True)
        for i_lambda, lambda_ in enumerate(lambdas) :
            try :
                results[i_degree, i_lambda] = cross_validate_least_squares(y, expanded_tx, lambda_, k_fold)
                print("degree={d},\t lambda={l:e},\t accuracy={a}".format(d=degree, l=lambda_, a=results[i_degree, i_lambda]))
            except np.linalg.LinAlgError :
                # Keep the search going: a failed solve simply scores 0 for this cell.
                results[i_degree, i_lambda] = 0
                print("degree={d},\t lambda={l:e},\t accuracy={a}".format(d=degree, l=lambda_, a="0 - singular matrix"))

    # argmax works on the flattened grid; unravel_index maps it back to (row, column).
    i, j = np.unravel_index(np.argmax(results, axis=None), results.shape)
    disk_helpers.save_data('least-squares', jet_string, degrees, lambdas, results)
    plot_degree_errors_plt(degrees, lambdas, results)

    return degrees[i], lambdas[j], results[i, j]

In [None]:
# Run the least-squares search on every jet subset and keep all three
# (degree, lambda, accuracy) results, instead of displaying only the last one.
best_least_squares_models = {
    jet_string: find_best_least_squares_model(y_jets[jet_index], x_jets[jet_index], jet_string)
    for jet_index, jet_string in enumerate(('jet0', 'jet1', 'jet2'))
}
best_least_squares_models

To search for the best model on every subset of the training dataset (split by the `jet` column), run `find_model_for_higgs_dataset`.

In [None]:
def find_best_model(y, tx, jet_string="all") : 
    """Finds the best model for the given `y` and `tx`.

    Runs both hyper-parameter searches and keeps whichever reports the
    higher accuracy; least squares wins when the two are equal.

    Args:
        y: labels of the subset to train on.
        tx: features of the subset to train on.
        jet_string: label of the subset. Both search functions require it
            (they forward it to `disk_helpers.save_data`); it defaults to
            "all" so existing two-argument calls keep working.

    Returns:
        `(model_name, degree, lambda, accuracy)` where `model_name` is
        "logistic regression" or "least squares".
    """
    degree_logistic, lambda_logistic, acc_logistic = find_best_logistic_regression_model(y, tx, jet_string)
    print("Logistic regression: (degree: {d}, lambda: {l}, accuracy: {a})".format(d=degree_logistic, l=lambda_logistic, a=acc_logistic))
    degree_ls, lambda_ls, acc_ls = find_best_least_squares_model(y, tx, jet_string)
    print("Least squares: (degree: {d}, lambda: {l}, accuracy: {a})".format(d=degree_ls, l=lambda_ls, a=acc_ls))

    if acc_logistic > acc_ls : 
        return "logistic regression", degree_logistic, lambda_logistic, acc_logistic
    else : 
        return "least squares", degree_ls, lambda_ls, acc_ls

In [None]:
def find_model_for_higgs_dataset() : 
    """Runs `find_best_model` on each of the three jet subsets and prints the outcome.

    Reads the notebook-level `y_jets` and `x_jets`; one summary line is
    printed per subset and nothing is returned.
    """
    report = "Jet{n}: model={m}, degree={d}, lambda={l}, accuracy={a}"
    for jet_number in range(3) :
        name, degree, lambda_, accuracy = find_best_model(y_jets[jet_number], x_jets[jet_number])
        print(report.format(n=jet_number, m=name, d=degree, l=lambda_, a=accuracy))


In [None]:
# Runs both hyper-parameter searches on each of the three jet subsets and prints
# the winning model per subset (the function itself returns nothing).
find_model_for_higgs_dataset()

# Generate predictions

In this section we generate a prediction on the test dataset given a model.   
A model is a dictionary, like the ones shown below, that specifies the model type and the hyper-parameters to use for each subset of the dataset. 

In [None]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download the test data and point this path at it
# The first returned value is discarded here; only the features and the ids
# are kept for the prediction and submission cells below.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Candidate configuration, one entry per jet subset. For each subset:
#   "model"    - which training routine compute_weights() dispatches to
#   "degree"   - polynomial expansion degree
#   "lambda"   - regularisation strength handed to the training routine
#   "mixed"    - forwarded as `mixed_columns` to polynomial_expansion
#   "accuracy" - per-subset accuracy figure read by estimation()
models_1 = {
    "jet0" : {
        "model" : "least squares",
        "degree" : 5,
        "lambda" : 1e-3,
        "mixed" : False,
        "accuracy" : 0  # NOTE(review): presumably a placeholder; estimation() would count this subset as 0 accuracy
    },
    "jet1" : {
        "model" : "least squares",
        "degree" : 7,
        "lambda" : 1e-6,
        "mixed" : False,
        "accuracy" : 0.7976 # tx not mixed
    },
    "jet2" : {
        "model" : "least squares",
        "degree" : 6,
        "lambda" : 1e-4,
        "mixed" : False,
        "accuracy" : 0  # NOTE(review): presumably a placeholder; estimation() would count this subset as 0 accuracy
    }
}

# Candidate configuration, one entry per jet subset; all three use least
# squares without mixed columns. "degree", "lambda" and "mixed" are read by
# compute_weights()/predict(); "accuracy" is only read by estimation().
models_new = {
    "jet0" : {
        "model" : "least squares",
        "degree" : 3,
        "lambda" : 0.1,
        "mixed" : False,
        "accuracy" : 0.8355219697727956
    },
    "jet1" : {
        "model" : "least squares",
        "degree" : 7,
        "lambda" : 1e-5,
        "mixed" : False,
        "accuracy" : 0.8043203507866906  
    },
    "jet2" : {
        "model" : "least squares",
        "degree" : 5,
        "lambda" : 0.001,
        "mixed" : False,
        "accuracy" : 0.8258478081058727 
    }
}


# Candidate configuration, one entry per jet subset. jet0 and jet1 request the
# mixed-column expansion, jet2 does not. "accuracy" is only read by estimation().
# NOTE(review): the name suggests missing values were replaced by 0 for this
# run -- cannot be confirmed from this notebook.
models_0_for_Nans = {
 "jet0" : {
        "model" : "least squares",
        "degree" : 3,
        "lambda" : 1e-3,
        "mixed" : True,
        "accuracy" : 0.8473246753246754
    },
    "jet1" : {
        "model" : "least squares",
        "degree" : 4,
        "lambda" : 10000,
        "mixed" : True,
        "accuracy" : 0.79718  
    },
    "jet2" : {
        "model" : "least squares",
        "degree" : 2,
        "lambda" : 0.01,
        "mixed" : False,
        "accuracy" : 0.8307976908110867 
    }
}


# Candidate configuration ("oggi" is Italian for "today"): degree 7 least
# squares without mixed columns on every subset.
# NOTE(review): the three "accuracy" values are identical to those in
# models_0_for_Nans although the hyper-parameters differ -- they look copied
# rather than measured; confirm before relying on estimation() for this dict.
models_oggi = {
 "jet0" : {
        "model" : "least squares",
        "degree" : 7,
        "lambda" : 1e-5,
        "mixed" : False,
        "accuracy" : 0.8473246753246754
    },
    "jet1" : {
        "model" : "least squares",
        "degree" : 7,
        "lambda" : 1e-4,
        "mixed" : False,
        "accuracy" : 0.79718  
    },
    "jet2" : {
        "model" : "least squares",
        "degree" : 7,
        "lambda" : 1e-4,
        "mixed" : False,
        "accuracy" : 0.8307976908110867 
    }
}

# Configuration used by the prediction and estimation cells further down:
# least squares with the mixed-column expansion on every jet subset.
# "degree", "lambda" and "mixed" are read by compute_weights()/predict();
# "accuracy" is only read by estimation().
models_statistics_31oct_15 =  {
    "jet0" : {
        "model" : "least squares",
        "degree" : 4,
        "lambda" : 1e-10,
        "mixed" : True,
        "accuracy" : 0.8470
    },
    "jet1" : {
        "model" : "least squares",
        "degree" : 6,
        "lambda" : 1e-3,
        "mixed" :  True,
        "accuracy" : 0.8069  
    },
    "jet2" : {
        "model" : "least squares",
        "degree" : 4,
        "lambda" : 1e-4,
        "mixed" : True,
        "accuracy" : 0.8342 
    }
}

In [None]:

def compute_weights(models) : 
    """Computes the weights for the given models.

    Trains one model per jet subset on the notebook-level training data
    (`y_jets[i]`, `x_jets[i]`), using the configuration stored under the
    keys "jet0", "jet1" and "jet2" of `models`.

    Args:
        models: dict with the keys "jet0", "jet1", "jet2"; each value is a
            dict providing "model" ("least squares" or "logistic regression"),
            "degree", "lambda" and "mixed".

    Returns:
        `(w_0, w_1, w_2)`: the weights trained for jet 0, 1 and 2.

    Raises:
        KeyError: if one of the three jet keys is missing from `models`.
        ValueError: if a "model" value is not one of the two supported names.
    """
    weights = []
    # Index by explicit key so that subset i is always trained with the "jet<i>"
    # configuration, whatever the insertion order of `models` is.
    for i, jet in enumerate(("jet0", "jet1", "jet2")) : 
        model = models[jet]
        print("jet", i, ", degree: ", model["degree"], "lambda: ", model["lambda"], "expansion: ", model["mixed"])
        x_expanded = polynomial_expansion(x_jets[i], model["degree"], mixed_columns=model["mixed"])
        print("build model for jet", i)

        # NOTE(review): ridge_regression and
        # logistic_regression_penalized_gradient_descent are not imported by name
        # in this notebook; presumably they come from a star import -- confirm.
        if model["model"] == "least squares" : 
            # The second returned value is not needed here.
            w, _ = ridge_regression(y_jets[i], x_expanded, model["lambda"])
        elif model["model"] == "logistic regression" : 
            w = logistic_regression_penalized_gradient_descent(y_jets[i], x_expanded, 0.01, model["lambda"], 30)
        else : 
            raise ValueError("Model not recognised: {!r}".format(model["model"]))
        weights.append(w)
        print("weights computed for", jet)

    return weights[0], weights[1], weights[2]
def predict(models, x_test) : 
    """Makes the prediction given the models chosen and the test dataset.

    The test set is split into the three jet subsets, one model per subset is
    trained through `compute_weights`, each subset is expanded with its own
    degree/"mixed" setting and the per-subset predictions are written back
    into one column in the row order of `x_test`.

    Args:
        models: dict with the keys "jet0", "jet1", "jet2", each providing at
            least "degree" and "mixed" (plus whatever `compute_weights` reads).
        x_test: 2-D array of raw test features; column `JET_COLUMN` holds the
            jet number used to split the rows.

    Returns:
        Array of shape `(len(x_test), 1)` with the predicted label of every row.
    """
    jet_keys = ("jet0", "jet1", "jet2")

    print("prepare dataset...")
    # NOTE(review): the training cell binds the third output of
    # prepare_train_data as `replacing_values`; no `medians` variable exists in
    # this notebook. Confirm this is the value prepare_test_data expects as its
    # fourth argument. prepare_test_data itself is not imported by name here --
    # presumably it is provided by a star import; verify.
    x_test_0, x_test_1, x_test_2 = prepare_test_data(x_test, means, stds, replacing_values)
    print("compute weights...")
    weights = compute_weights(models)

    expanded = []
    for i, (jet, x_subset) in enumerate(zip(jet_keys, (x_test_0, x_test_1, x_test_2))) :
        print("build matrices for predictions {} ...".format(i))
        expanded.append(polynomial_expansion(x_subset, models[jet]["degree"], mixed_columns=models[jet]["mixed"]))

    print("compute predictions...")

    jet_values = x_test[:, JET_COLUMN]
    # Rows with a jet number of 2 or more all belong to the third subset.
    masks = (jet_values == 0, jet_values == 1, jet_values >= 2)

    y_pred = np.zeros((len(x_test), 1))
    for w, x_expanded, mask in zip(weights, expanded, masks) :
        # y_pred[mask] has shape (n, 1); reshaping makes the assignment valid
        # whether predict_labels returns shape (n,) or (n, 1).
        y_pred[mask] = np.reshape(predict_labels(w, x_expanded), (-1, 1))

    return y_pred

In [None]:
# Train on the selected configuration, predict the test set and write the
# submission file next to this notebook.
OUTPUT_PATH="out-check.csv"
y_pred = predict(models_statistics_31oct_15, tX_test)
# create_csv_submission is not imported by name in this notebook; presumably it
# comes from the `from proj1_helpers import *` line -- TODO confirm.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

# Estimation
It is possible to run this code to estimate the accuracy of a given model on the test dataset without uploading a CSV submission to AIcrowd. 

In [None]:
def estimation(models) : 
    """ Estimates the accuracy of our model on the test dataset without uploading to AIcrowd.

    The estimate is the average of the three per-subset "accuracy" values
    stored in `models`, weighted by how many rows of the notebook-level
    `tX_test` fall in each jet subset. The result is printed, not returned.

    Args:
        models: dict with the keys "jet0", "jet1", "jet2", each providing an
            "accuracy" value.
    """
    jet_values = tX_test[:, JET_COLUMN]
    # count_nonzero counts the True entries in one vectorised pass, instead of
    # looping over every row in Python as the builtin sum() would.
    n_0 = np.count_nonzero(jet_values == 0)
    n_1 = np.count_nonzero(jet_values == 1)
    n_2 = np.count_nonzero(jet_values >= 2)  # jet numbers of 2 or more share one model

    accuracy = (n_0*models["jet0"]["accuracy"] + n_1*models["jet1"]["accuracy"] + n_2*models["jet2"]["accuracy"])/(len(tX_test))

    print("The estimate accuracy with the given model is", round(accuracy, 5))

In [None]:
# Prints the weighted accuracy estimate for the configuration used for the submission.
estimation(models_statistics_31oct_15)

# Plots

It is possible to use this code to plot the accuracy of the hyper-parameters for a trained model.

In [None]:
# NOTE(review): star imports hide where names come from. load_np_array is
# presumably provided by disk_helpers, and plot_degree_errors_plt is already
# imported by name in the first cell -- consider explicit imports there instead.
from disk_helpers import *
from plots import *

# Name of a results folder written by an earlier hyper-parameter search.
folder = "20211031-135405-jet0-least-squares"
# "l", "d" and "r" select the saved lambdas, degrees and results arrays.
lambdas = load_np_array(folder, "l")
degrees = load_np_array(folder, "d")
results = load_np_array(folder, "r")
plot_degree_errors_plt(degrees, lambdas, results)