In [49]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from implementations import *
from helper import *
from feature_analysis import *
from expansions import polynomial_expansion
import seaborn as sns
import pandas as pd #temporary
from logistic_regression import *
import disk_helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from proj1_helpers import *

# Locations of the train / test CSV files, relative to the notebook directory.
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download train data and supply path here
DATA_TEST_PATH = '../data/test.csv'
# Load the training set once; `y` and `tX` are later split per jet by
# prepare_y_data / prepare_x_data.
# NOTE(review): presumably (labels, feature matrix, event ids) -- confirm against proj1_helpers.load_csv_data.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

def prepare_y_data(y, x, i_PRI=22):
    """Split the labels `y` into three column vectors, one per jet group.

    The group of each row is decided by column `i_PRI` of `x`:
    value == 0, value == 1, and value > 1.

    Parameters
    ----------
    y : array indexed by row, aligned with the rows of `x`.
    x : 2-D array whose column `i_PRI` holds the jet number.
    i_PRI : index of the jet-number column in `x`.

    Returns
    -------
    (y_jet0, y_jet1, y_jet2), each reshaped to (rows_in_group, 1).
    """
    jet_number = x[:, i_PRI]
    group_masks = (jet_number == 0, jet_number == 1, jet_number > 1)

    columns = []
    for mask in group_masks:
        selected = y[mask]
        # Force an explicit column shape so the labels line up with 2-D features.
        columns.append(selected.reshape((len(selected), 1)))

    y_jet0, y_jet1, y_jet2 = columns
    return y_jet0, y_jet1, y_jet2


def prepare_x_data(x, i_PRI=22):
    """Split the feature matrix by jet group and prepare each part.

    Rows are grouped by column `i_PRI` of `x` (== 0, == 1, > 1). For each
    group a fixed list of columns is removed, the result is passed through
    `fix_missing_values` then `fix_nan_values`, and finally through
    `featureExpand` together with the group number (0, 1 or 2).

    Returns
    -------
    (tx_0, tx_1, tx_2): the prepared matrices for the three groups.
    """
    jet_number = x[:, i_PRI]
    row_masks = (jet_number == 0, jet_number == 1, jet_number > 1)

    # Columns dropped for each group. These indices are hard-coded (including
    # 22) and do not follow `i_PRI`.
    dropped_columns = (
        [4,5,6,12,22,23,24,25,26,27,28],
        [4,5,6,12,22,26,27,28],
        [22],
    )

    # Each stage is run for all three groups before the next stage starts.
    subsets = [x[mask] for mask in row_masks]
    filtered = [np.delete(subset, columns, axis=1)
                for subset, columns in zip(subsets, dropped_columns)]
    cleaned = [fix_nan_values(fix_missing_values(part)) for part in filtered]
    tx_0, tx_1, tx_2 = [featureExpand(part, jet) for jet, part in enumerate(cleaned)]

    return tx_0, tx_1, tx_2

def remove_outliers(y, tx, keep=0.95):
    """Drop every row that has at least one feature outside its central quantile band.

    For each column of `tx`, values below the (1-keep)/2 quantile or above the
    (1+keep)/2 quantile of that column are treated as outliers. A row is
    removed as soon as any of its columns is an outlier.

    Parameters
    ----------
    y : 2-D array of labels, row-aligned with `tx`.
    tx : 2-D feature matrix.
    keep : central fraction of each column's distribution that is kept.

    Returns
    -------
    (y, tx) restricted to the rows that were kept.
    """
    lower_q = (1-keep)/2
    upper_q = (1+keep)/2

    is_outlier = np.zeros(tx.shape[0], dtype=bool)  # nothing is rejected at first
    for feature in tx.T:
        low = np.quantile(feature, lower_q)
        high = np.quantile(feature, upper_q)
        is_outlier |= (feature < low) | (feature > high)

    inlier_rows = ~is_outlier
    return y[inlier_rows, :], tx[inlier_rows, :]

In [5]:
# Split the training features and labels into the three jet groups
# (jet number == 0, == 1, > 1) used throughout the rest of the notebook.
tx_train_0, tx_train_1, tx_train_2 = prepare_x_data(tX)
y_0, y_1, y_2 = prepare_y_data(y, tX)

# Drop outlier rows per group (default keep=0.95). Note these cells rebind
# y_* / tx_train_*, so re-running the cell filters the data a second time.
y_0, tx_train_0 = remove_outliers(y_0, tx_train_0)
y_1, tx_train_1 = remove_outliers(y_1, tx_train_1)
y_2, tx_train_2 = remove_outliers(y_2, tx_train_2)

# Some tests

In [None]:
# Visual sanity check: one box plot per feature column of the jet-0 training set.
db = pd.DataFrame(tx_train_0)
for column in db : 
    plt.figure()  # a fresh figure for each column
    db.boxplot([column])


In [None]:
# Summary statistics for every feature column of the jet-0 training set.
db.describe()

# Let's find the best model

In [None]:
def find_best_logistic_regression_model(y, tx, max_iter=20, gamma=0.05, k_fold=3):
    """Grid-search the polynomial degree and lambda for logistic regression.

    Every (degree, lambda) pair is scored with
    `cross_validate_logistic_regression`; the score is printed as "accuracy"
    and the whole grid is handed to `disk_helpers.save_data`.

    Parameters
    ----------
    y, tx : labels and features forwarded to the cross-validation helper.
    max_iter, gamma, k_fold : forwarded unchanged to the cross-validation helper.

    Returns
    -------
    (degree, lambda, score) of the grid cell with the highest score.
    """
    lambdas = np.logspace(-10, 4, 15)
    degrees = np.arange(1, 7)
    results = np.zeros((len(degrees), len(lambdas)))

    for row in range(len(degrees)):
        degree = degrees[row]
        # The expansion depends only on the degree, so build it once per row.
        expanded_tx = polynomial_expansion(tx, degree)
        for col in range(len(lambdas)):
            lambda_ = lambdas[col]
            results[row, col] = cross_validate_logistic_regression(y, expanded_tx, max_iter, gamma, lambda_, k_fold)
            print("degree={d},\t lambda={l:e},\taccuracy={a}".format(d=degree, l=lambda_, a=results[row, col]))

    # NOTE(review): find_best_least_squares_model passes an extra jet label to
    # disk_helpers.save_data; confirm this 4-argument call matches its signature.
    disk_helpers.save_data('logistic-regression', degrees, lambdas, results)

    best_row, best_col = np.unravel_index(results.argmax(), results.shape)
    return degrees[best_row], lambdas[best_col], results[best_row, best_col]

In [None]:
# Run the logistic-regression grid search on the jet-0 training subset.
find_best_logistic_regression_model(y_0, tx_train_0)

In [54]:
from least_squares import cross_validate_least_squares
from plots import plot_degree_errors_plt


def find_best_least_squares_model(y, tx, jet_string, k_fold=7):
    """Grid-search the polynomial degree and lambda for least squares.

    Every (degree, lambda) pair is scored with `cross_validate_least_squares`
    on a polynomial expansion built with `mixed_columns=True`. A pair whose
    evaluation raises `np.linalg.LinAlgError` is scored 0. The grid is saved
    through `disk_helpers.save_data` and plotted with `plot_degree_errors_plt`.

    Parameters
    ----------
    y, tx : labels and features forwarded to the cross-validation helper.
    jet_string : label forwarded to `disk_helpers.save_data`.
    k_fold : forwarded unchanged to the cross-validation helper.

    Returns
    -------
    (degree, lambda, score) of the grid cell with the highest score.
    """
    lambdas = np.logspace(-4, 9, 9)
    degrees = np.arange(2,10)
    results = np.zeros((len(degrees), len(lambdas)))

    for row in range(len(degrees)):
        degree = degrees[row]
        print("build expanded...")
        expanded_tx = polynomial_expansion(tx, degree, mixed_columns=True)
        print("build model...")
        for col in range(len(lambdas)):
            lambda_ = lambdas[col]
            try:
                results[row, col] = cross_validate_least_squares(y, expanded_tx, lambda_, k_fold)
            except np.linalg.LinAlgError:
                # A singular system counts as the worst possible score.
                results[row, col] = 0
                shown = "0 - singular matrix"
            else:
                shown = results[row, col]
            print("degree={d},\t lambda={l:e},\t accuracy={a}".format(d=degree, l=lambda_, a=shown))

    best_row, best_col = np.unravel_index(results.argmax(), results.shape)
    disk_helpers.save_data('least-squares', jet_string, degrees, lambdas, results)
    plot_degree_errors_plt(degrees, lambdas, results)

    return degrees[best_row], lambdas[best_col], results[best_row, best_col]

In [None]:
# Run the least-squares grid search once per jet group; the string is the
# label each run hands to disk_helpers.save_data.
find_best_least_squares_model(y_0, tx_train_0, 'jet0')
find_best_least_squares_model(y_1, tx_train_1, 'jet1')
find_best_least_squares_model(y_2, tx_train_2, 'jet2')

build expanded...
build model...
degree=2,	 lambda=1.000000e-04,	 accuracy=0.8464761904761906
degree=2,	 lambda=4.216965e-03,	 accuracy=0.8470129870129871
degree=2,	 lambda=1.778279e-01,	 accuracy=0.8467532467532467
degree=2,	 lambda=7.498942e+00,	 accuracy=0.84604329004329
degree=2,	 lambda=3.162278e+02,	 accuracy=0.8457489177489178
degree=2,	 lambda=1.333521e+04,	 accuracy=0.8444329004329004
degree=2,	 lambda=5.623413e+05,	 accuracy=0.842926406926407
degree=2,	 lambda=2.371374e+07,	 accuracy=0.8413852813852812
degree=2,	 lambda=1.000000e+09,	 accuracy=0.8363290043290043
build expanded...
build model...
degree=3,	 lambda=1.000000e-04,	 accuracy=0.7826839826839828
degree=3,	 lambda=4.216965e-03,	 accuracy=0.8472554112554113
degree=3,	 lambda=1.778279e-01,	 accuracy=0.847047619047619
degree=3,	 lambda=7.498942e+00,	 accuracy=0.846077922077922
degree=3,	 lambda=3.162278e+02,	 accuracy=0.8460259740259739
degree=3,	 lambda=1.333521e+04,	 accuracy=0.8450735930735931
degree=3,	 lambda=5.6234

In [None]:
def find_best_model(y, tx, jet_string="unnamed"):
    """Finds the best model for the given `y` and `tx`.

    Runs both grid searches (logistic regression and least squares), prints
    the best configuration of each, and returns the one with the higher score.

    Parameters
    ----------
    y, tx : labels and features forwarded to both grid searches.
    jet_string : label forwarded to `find_best_least_squares_model`, which
        requires it as a positional argument. It has a default here so that
        existing two-argument callers keep working.

    Returns
    -------
    (model_name, degree, lambda, score) where model_name is either
    "logistic regression" or "least squares". Ties go to least squares.
    """
    degree_logistic, lambda_logistic, acc_logistic = find_best_logistic_regression_model(y, tx)
    print("Logistic regression: (degree: {d}, lambda: {l}, accuracy: {a})".format(d=degree_logistic, l=lambda_logistic, a=acc_logistic))

    # The least-squares search takes a mandatory label; omitting it raised a
    # TypeError before any least-squares model could be evaluated.
    degree_ls, lambda_ls, acc_ls = find_best_least_squares_model(y, tx, jet_string)
    print("Least squares: (degree: {d}, lambda: {l}, accuracy: {a})".format(d=degree_ls, l=lambda_ls, a=acc_ls))

    if acc_logistic > acc_ls:
        return "logistic regression", degree_logistic, lambda_logistic, acc_logistic
    return "least squares", degree_ls, lambda_ls, acc_ls

In [None]:
def find_model_for_higgs_dataset():
    """Finds and prints the best model for each jet group of the train dataset.

    Uses the notebook-level training subsets (y_0/tx_train_0, y_1/tx_train_1,
    y_2/tx_train_2) and calls `find_best_model` on each in turn. Nothing is
    returned; the selected configuration is only printed.
    """
    # (report template, labels, features) for each jet group, in processing order.
    per_jet = (
        ("Jet0: model={m}, degree={d}, lambda={l}, accuracy={a}", y_0, tx_train_0),
        ("Jet1: model={m}, degree={d}, lambda={l}, accuracy={a}", y_1, tx_train_1),
        ("Jet2: model={m}, degree={d}, lambda={l}, accuracy={a}", y_2, tx_train_2),
    )

    for report, labels, features in per_jet:
        chosen_model, chosen_degree, chosen_lambda, chosen_accuracy = find_best_model(labels, features)
        print(report.format(m=chosen_model, d=chosen_degree, l=chosen_lambda, a=chosen_accuracy))


In [None]:
# Run the full model search (both model families, all three jet groups).
find_model_for_higgs_dataset()

# Generate predictions

In [None]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here
# Load the test set; the first returned value is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Per-jet model configurations. Each entry is read by compute_weights / predict
# ("model", "degree", "lambda", "mixed") and by estimation ("accuracy").
# "mixed" is passed as the third argument of polynomial_expansion.
# This first set is not referenced by the cells below; an "accuracy" of 0 makes
# that jet contribute nothing to the estimate printed by estimation().
models = {
    "jet0" : {
        "model" : "least squares",
        "degree" : 5,
        "lambda" : 1e-3,
        "mixed" : False,
        "accuracy" : 0
    },
    "jet1" : {
        "model" : "least squares",
        "degree" : 7,
        "lambda" : 1e-6,
        "mixed" : False,
        "accuracy" : 0.7976 # tx not mixed
    },
    "jet2" : {
        "model" : "least squares",
        "degree" : 6,
        "lambda" : 1e-4,
        "mixed" : False,
        "accuracy" : 0
    }
}

# Configuration actually passed to predict() and estimation() further down.
models_new = {
    "jet0" : {
        "model" : "least squares",
        "degree" : 3,
        "lambda" : 0.1,
        "mixed" : False,
        "accuracy" : 0.8355219697727956
    },
    "jet1" : {
        "model" : "least squares",
        "degree" : 7,
        "lambda" : 1e-5,
        "mixed" : False,
        "accuracy" : 0.8043203507866906  
    },
    "jet2" : {
        "model" : "least squares",
        "degree" : 5,
        "lambda" : 0.001,
        "mixed" : False,
        "accuracy" : 0.8258478081058727 
    }
}

In [None]:
def compute_weights(models):
    """Computes the weights for the given models.

    Each jet group ("jet0", "jet1", "jet2") is trained on its own
    notebook-level training subset, using the polynomial degree, the "mixed"
    flag, the lambda and the model family given in `models[jet]`.

    The training subset is selected by jet key rather than by the iteration
    position of `models`, so the result does not depend on the order in which
    the dict was built.

    Parameters
    ----------
    models : dict with the keys "jet0", "jet1" and "jet2"; each value is a
        dict providing "model", "degree", "lambda" and "mixed".

    Returns
    -------
    (w_jet0, w_jet1, w_jet2)

    Raises
    ------
    KeyError if one of the three jet keys is missing.
    Exception if a "model" value is neither "least squares" nor
    "logistic regression".
    """
    training_sets = {
        "jet0": (y_0, tx_train_0),
        "jet1": (y_1, tx_train_1),
        "jet2": (y_2, tx_train_2),
    }

    weights = []
    for jet, (y_jet, tx_jet) in training_sets.items():
        model = models[jet]
        x_expanded = polynomial_expansion(tx_jet, model["degree"], model["mixed"])
        if model["model"] == "least squares":
            w, _err = ridge_regression(y_jet, x_expanded, model["lambda"])
        elif model["model"] == "logistic regression":
            # NOTE(review): 0.01 and 30 are fixed positional arguments of the
            # helper, presumably the step size and the iteration count -- confirm.
            w = logistic_regression_penalized_gradient_descent(y_jet, x_expanded, 0.01, model["lambda"], 30)
        else:
            raise Exception("Model not recognised")
        weights.append(w)
        print("weights computed for", jet)

    return weights[0], weights[1], weights[2]

In [None]:
def predict(models, x_test):
    """Makes the prediction given the models chosen and the test dataset.

    The test set is split per jet group with `prepare_x_data`, the weights are
    trained with `compute_weights`, and each group is expanded with the degree
    AND the "mixed" flag of its model before `predict_labels` is applied.
    `compute_weights` trains with the "mixed" flag, so the test features have
    to be expanded with it as well.

    Parameters
    ----------
    models : dict with the keys "jet0", "jet1" and "jet2"; each value provides
        at least "degree" and "mixed" (plus whatever `compute_weights` reads).
    x_test : 2-D raw test feature matrix; column 22 holds the jet number.

    Returns
    -------
    Array of shape (len(x_test), 1) with the predictions written back at the
    original row positions.
    """
    i_PRI = 22
    print("prepare dataset...")
    x_test_0, x_test_1, x_test_2 = prepare_x_data(x_test, i_PRI)
    print("compute weights...")
    w_0, w_1, w_2 = compute_weights(models)
    print("compute predictions...")

    jet_number = x_test[:, i_PRI]
    # (model key, prepared features, weights, rows of x_test in that group)
    per_jet = (
        ("jet0", x_test_0, w_0, jet_number == 0),
        ("jet1", x_test_1, w_1, jet_number == 1),
        ("jet2", x_test_2, w_2, jet_number >= 2),
    )

    y_pred = np.zeros((len(x_test), 1))
    for jet, features, weights, rows in per_jet:
        # Same expansion arguments as used for training in compute_weights.
        expanded = polynomial_expansion(features, models[jet]["degree"], models[jet]["mixed"])
        y_pred[rows] = predict_labels(weights, expanded)

    return y_pred

In [None]:
# Train on the per-jet training subsets with the `models_new` configuration,
# predict the test set and hand ids + predictions to create_csv_submission.
OUTPUT_PATH="out3.csv"
y_pred = predict(models_new, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

# Estimation
We can estimate the accuracy of our model on the test dataset without uploading data to AIcrowd.

In [None]:
def estimation(models):
    """Estimates the accuracy of our model on the test dataset without uploading to AIcrowd.

    The estimate is the average of the per-jet "accuracy" values stored in
    `models`, weighted by how many rows of the notebook-level `tX_test` fall
    into each jet group (column 22 == 0, == 1, >= 2). The result is printed,
    rounded to 5 decimals; nothing is returned.
    """
    i_PRI = 22
    jet_number = tX_test[:, i_PRI]

    # Number of test rows in each jet group.
    n_0 = sum(jet_number == 0)
    n_1 = sum(jet_number == 1)
    n_2 = sum(jet_number >= 2)

    weighted_total = (
        n_0*models["jet0"]["accuracy"]
        + n_1*models["jet1"]["accuracy"]
        + n_2*models["jet2"]["accuracy"]
    )
    accuracy = weighted_total/(len(tX_test))

    print("The estimate accuracy with the given model is", round(accuracy, 5))

In [None]:
# Print the weighted accuracy estimate for the configuration used in the submission.
estimation(models_new)