In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from implementations import *
from helper import *
from feature_analysis import *
from expansions import polynomial_expansion

import pandas as pd

In [None]:
from proj1_helpers import *

# Relative paths to the training and test CSV files.
# NOTE: DATA_TEST_PATH is assigned again (to the same value) in the
# "Generate predictions" section further down.
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download train data and supply path here
DATA_TEST_PATH = '../data/test.csv'
# load_csv_data comes from proj1_helpers (not shown here); presumably it returns
# (labels, feature matrix, sample ids) -- confirm against that module.
# tX is indexed as a 2-D array (x[:, i_PRI]) by the prepare_* helpers below.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

def prepare_y_data(y, x, i_PRI=22):
    """Split the labels into three column vectors keyed on one column of ``x``.

    Rows are grouped by the value found in column ``i_PRI`` of ``x``:
    equal to 0, equal to 1, and strictly greater than 1.

    Args:
        y: array of labels, one entry per row of ``x``.
        x: 2-D feature array; only column ``i_PRI`` is read.
        i_PRI: index of the column used to group the rows (default 22).

    Returns:
        Tuple ``(y_jet0, y_jet1, y_jet2)``; each element has shape ``(k, 1)``
        where ``k`` is the number of rows in that group.
    """
    jet_column = x[:, i_PRI]

    def as_column(mask):
        # Select the labels of one group and turn them into a (k, 1) column.
        selected = y[mask]
        return selected.reshape((len(selected), 1))

    return as_column(jet_column == 0), as_column(jet_column == 1), as_column(jet_column > 1)


def prepare_x_data(x, i_PRI=22):
    """Split the feature matrix into three subsets and preprocess each one.

    Rows are grouped by the value in column ``i_PRI`` (== 0, == 1, > 1). A fixed
    list of columns is then removed from each group, the result is passed through
    ``fix_missing_values`` and ``fix_nan_values``, and finally through
    ``featureExpand`` together with the group number (0, 1 or 2).

    Args:
        x: 2-D feature array.
        i_PRI: index of the column used to group the rows (default 22).
            Note that the dropped-column lists below hard-code index 22.

    Returns:
        Tuple ``(tx_0, tx_1, tx_2)`` with the processed features of each group.
    """
    jet_column = x[:, i_PRI]
    subsets = [x[jet_column == 0], x[jet_column == 1], x[jet_column > 1]]

    # Column indices removed per group.
    # NOTE(review): presumably these are features that carry no information for
    # the given group -- confirm against the dataset description.
    dropped_columns = [
        [4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28],
        [4, 5, 6, 12, 22, 26, 27, 28],
        [22],
    ]

    # Each stage is applied to all three groups before the next stage starts.
    filtered = [
        np.delete(subset, columns, axis=1)
        for subset, columns in zip(subsets, dropped_columns)
    ]
    cleaned = [fix_nan_values(fix_missing_values(features)) for features in filtered]
    tx_0, tx_1, tx_2 = [
        featureExpand(features, jet) for jet, features in enumerate(cleaned)
    ]
    return tx_0, tx_1, tx_2

In [None]:
# NOTE(review): tx_train_0 is first assigned in the following cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError unless one of the
# star-imports above happens to define that name. `db` is not used anywhere else
# in the visible notebook.
db = pd.DataFrame(tx_train_0)

In [None]:
# Split the training features and labels into the three subsets keyed on
# column 22 (== 0, == 1, > 1). These module-level names are read as globals by
# find_model_for_higgs_dataset() and compute_weights() further down.
tx_train_0, tx_train_1, tx_train_2 = prepare_x_data(tX)
y_0, y_1, y_2 = prepare_y_data(y, tX)

In [None]:
from logistic_regression import *

In [None]:
def find_best_logistic_regression_model(y, tx, max_iter=20, gamma=0.05, k_fold=3):
    """Grid-search polynomial degree and lambda for logistic regression.

    Every degree in 1..19 is combined with 15 log-spaced lambdas between 1e-10
    and 1e4. Each combination is scored with
    ``cross_validate_logistic_regression`` and the score is printed (labelled
    "accuracy"). The combination with the highest score wins; on ties the first
    one in (degree, lambda) order is returned.

    Args:
        y: labels passed through to the cross-validation routine.
        tx: feature matrix; expanded with ``polynomial_expansion`` per degree.
        max_iter, gamma, k_fold: forwarded unchanged to
            ``cross_validate_logistic_regression``.

    Returns:
        Tuple ``(best_degree, best_lambda, best_score)``.
    """
    degrees = np.arange(1, 20)
    lambdas = np.logspace(-10, 4, 15)
    results = np.zeros((len(degrees), len(lambdas)))

    for row, degree in enumerate(degrees):
        # The expansion only depends on the degree, so it is built once per row.
        expanded_tx = polynomial_expansion(tx, degree)
        for col, lambda_ in enumerate(lambdas):
            results[row, col] = cross_validate_logistic_regression(
                y, expanded_tx, max_iter, gamma, lambda_, k_fold
            )
            print("degree={d},\t lambda={l:e},\taccuracy={a}".format(d=degree, l=lambda_, a=results[row, col]))

    # Position of the maximum in the flattened grid, mapped back to (row, col).
    best_row, best_col = np.unravel_index(results.argmax(), results.shape)
    return degrees[best_row], lambdas[best_col], results[best_row, best_col]

In [None]:
# Run the logistic-regression grid search on the first subset only; the returned
# (degree, lambda, score) tuple is displayed as the cell output.
find_best_logistic_regression_model(y_0, tx_train_0)

In [None]:
from least_squares import cross_validate_least_squares 

def find_best_least_squares_model(y, tx, k_fold=5):
    """Grid-search polynomial degree and lambda for the least-squares model.

    Every degree in 1..19 is combined with 15 log-spaced lambdas between 1e-10
    and 1e4 and scored with ``cross_validate_least_squares``. A combination that
    raises ``np.linalg.LinAlgError`` is given a score of 0 and reported as
    singular instead of aborting the search.

    Args:
        y: labels passed through to the cross-validation routine.
        tx: feature matrix; expanded with ``polynomial_expansion`` per degree.
        k_fold: forwarded unchanged to ``cross_validate_least_squares``.

    Returns:
        Tuple ``(best_degree, best_lambda, best_score)``; on ties the first
        combination in (degree, lambda) order is returned.
    """
    degrees = np.arange(1, 20)
    lambdas = np.logspace(-10, 4, 15)
    results = np.zeros((len(degrees), len(lambdas)))
    report = "degree={d},\t lambda={l:e},\t accuracy={a}"

    for row, degree in enumerate(degrees):
        expanded_tx = polynomial_expansion(tx, degree)
        for col, lambda_ in enumerate(lambdas):
            try:
                results[row, col] = cross_validate_least_squares(y, expanded_tx, lambda_, k_fold)
            except np.linalg.LinAlgError:
                # The solver could not handle this combination: score it as 0.
                results[row, col] = 0
                print(report.format(d=degree, l=lambda_, a="0 - singular matrix"))
            else:
                print(report.format(d=degree, l=lambda_, a=results[row, col]))

    # Position of the maximum in the flattened grid, mapped back to (row, col).
    best_row, best_col = np.unravel_index(results.argmax(), results.shape)
    return degrees[best_row], lambdas[best_col], results[best_row, best_col]

In [None]:
# Run the least-squares grid search on the third subset only; the returned
# (degree, lambda, score) tuple is displayed as the cell output.
find_best_least_squares_model(y_2, tx_train_2)

In [None]:
def find_best_model(y, tx):
    """Run both grid searches and return whichever model scored higher.

    The logistic-regression search runs first, then the least-squares search;
    the best configuration of each is printed. Both searches use their own
    default ``k_fold`` (3 for logistic regression, 5 for least squares), so the
    two scores are not computed on identical folds.

    Args:
        y: labels forwarded to both searches.
        tx: feature matrix forwarded to both searches.

    Returns:
        Tuple ``(model_name, degree, lambda, score)`` where ``model_name`` is
        ``"logistic regression"`` or ``"least squares"``. Least squares is
        returned when the scores are equal.
    """
    log_degree, log_lambda, log_score = find_best_logistic_regression_model(y, tx)
    print("Logistic regression: (degree: {d}, lambda: {l}, accuracy: {a})".format(d=log_degree, l=log_lambda, a=log_score))

    ls_degree, ls_lambda, ls_score = find_best_least_squares_model(y, tx)
    print("Least squares: (degree: {d}, lambda: {l}, accuracy: {a})".format(d=ls_degree, l=ls_lambda, a=ls_score))

    # Logistic regression is chosen only when it is strictly better.
    if log_score > ls_score:
        return "logistic regression", log_degree, log_lambda, log_score
    return "least squares", ls_degree, ls_lambda, ls_score

In [None]:
def find_model_for_higgs_dataset():
    """Search the best model for each of the three training subsets.

    Reads the notebook globals ``y_0``/``tx_train_0``, ``y_1``/``tx_train_1`` and
    ``y_2``/``tx_train_2``, calls ``find_best_model`` on each pair in that order
    and prints the winning configuration. Nothing is returned.
    """
    # One (report template, labels, features) entry per subset, processed in order.
    searches = (
        ("Jet0: model={m}, degree={d}, lambda={l}, accuracy={a}", y_0, tx_train_0),
        ("Jet1: model={m}, degree={d}, lambda={l}, accuracy={a}", y_1, tx_train_1),
        ("Jet2: model={m}, degree={d}, lambda={l}, accuracy={a}", y_2, tx_train_2),
    )
    for report, labels, features in searches:
        best_model, best_degree, best_lambda, best_accuracy = find_best_model(labels, features)
        print(report.format(m=best_model, d=best_degree, l=best_lambda, a=best_accuracy))


In [None]:
# Runs both grid searches on all three subsets (19 degrees x 15 lambdas each);
# results are only printed, the function returns nothing.
find_model_for_higgs_dataset()

### Generate predictions

In [None]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here
# The first value returned by load_csv_data is discarded here; only the feature
# matrix and the ids are kept for prediction.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Per-subset model configuration consumed by compute_weights() and predict().
#   "model":  "least squares" or "logistic regression"; any other value makes
#             compute_weights() raise.
#   "degree": passed to polynomial_expansion for both training and test features.
#   "lambda": passed to the fitting routine selected by "model".
# The keys "jet0" / "jet1" / "jet2" are looked up by name in predict().
# NOTE(review): the values below are presumably the winners of the grid search
# above -- confirm before changing them.
models = {
    "jet0" : {
        "model" : "least squares",
        "degree" : 5,
        "lambda" : 1e-3
    },
    "jet1" : {
        "model" : "least squares",
        "degree" : 7,
        "lambda" : 1e-6
    },
    "jet2" : {
        "model" : "least squares",
        "degree" : 6,
        "lambda" : 1e-4
    }
}

In [None]:
def compute_weights(models):
    """Fit one weight vector per training subset using the given configuration.

    Reads the notebook globals ``y_0``/``tx_train_0``, ``y_1``/``tx_train_1`` and
    ``y_2``/``tx_train_2``. Each subset is paired with its configuration by key
    (``"jet0"``, ``"jet1"``, ``"jet2"``), the same way ``predict`` looks them up,
    so the result does not depend on the order in which ``models`` was built or
    on any additional keys it may contain.

    Args:
        models: dict with the keys ``"jet0"``, ``"jet1"`` and ``"jet2"``; each
            value is a dict with ``"model"`` (``"least squares"`` or
            ``"logistic regression"``), ``"degree"`` and ``"lambda"``.

    Returns:
        Tuple ``(w_0, w_1, w_2)`` with the fitted weights of each subset.

    Raises:
        KeyError: if one of the three jet keys is missing from ``models``.
        ValueError: if a ``"model"`` value is not one of the two supported names.
    """
    training_sets = (
        ("jet0", y_0, tx_train_0),
        ("jet1", y_1, tx_train_1),
        ("jet2", y_2, tx_train_2),
    )

    weights = []
    for jet, y_jet, tx_jet in training_sets:
        model = models[jet]
        x_expanded = polynomial_expansion(tx_jet, model["degree"])
        if model["model"] == "least squares":
            # ridge_regression returns (weights, error); only the weights are kept.
            w, _ = ridge_regression(y_jet, x_expanded, model["lambda"])
        elif model["model"] == "logistic regression":
            # NOTE(review): 0.01 and 30 are presumably the step size and the
            # iteration count -- confirm against logistic_regression.py.
            w = logistic_regression_penalized_gradient_descent(y_jet, x_expanded, 0.01, model["lambda"], 30)
        else:
            raise ValueError("Model not recognised")
        weights.append(w)
        print("weights computed for ", jet)

    return weights[0], weights[1], weights[2]

In [None]:
def predict(models, x_test):
    """Predict labels for ``x_test`` with one model per subset.

    The test rows are split and preprocessed with ``prepare_x_data``, the weights
    are fitted on the training subsets by ``compute_weights``, and each test
    subset is expanded to the configured polynomial degree before being passed to
    ``predict_labels``. The per-subset predictions are written back to the
    positions of their rows in the original ordering.

    Args:
        models: per-subset configuration dict with keys ``"jet0"``, ``"jet1"``
            and ``"jet2"`` (see ``compute_weights``).
        x_test: 2-D test feature array; column 22 selects the subset.

    Returns:
        Array of shape ``(len(x_test), 1)``. Rows whose column-22 value matches
        none of the three groups (== 0, == 1, > 1) keep the initial value 0.
    """
    i_PRI = 22
    print("prepare dataset...")
    x_test_0, x_test_1, x_test_2 = prepare_x_data(x_test, i_PRI)
    print("compute weights...")
    w_0, w_1, w_2 = compute_weights(models)
    print("compute predictions...")

    # The masks must select exactly the rows prepare_x_data put in each subset
    # (== 0, == 1, > 1); otherwise the assignment below would not line up.
    jet_column = x_test[:, i_PRI]
    per_jet = (
        ("jet0", jet_column == 0, x_test_0, w_0),
        ("jet1", jet_column == 1, x_test_1, w_1),
        ("jet2", jet_column > 1, x_test_2, w_2),
    )

    y_pred = np.zeros((len(x_test), 1))
    for jet, mask, features, weights in per_jet:
        expanded = polynomial_expansion(features, models[jet]["degree"])
        labels = predict_labels(weights, expanded)
        # Force a column so both 1-D and (k, 1) predictions fit the (k, 1) slot.
        y_pred[mask] = np.reshape(labels, (-1, 1))

    return y_pred



In [None]:
# Predict on the test features and hand the result to create_csv_submission
# (from proj1_helpers; presumably writes the submission CSV to OUTPUT_PATH).
# predict() also refits the weights on the training subsets, so this cell needs
# y_0..y_2 and tx_train_0..tx_train_2 to be defined.
OUTPUT_PATH="out2.csv"
y_pred = predict(models, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)
