In [1]:
import matplotlib.pyplot as plt
import numpy as np

from helpers import *
from implementations import ridge_regression
from functions import abs_dif, mult 
from feature_eng import build_x
from predictions import predict_labels

In [2]:
# Loading the data sets
# load_csv_data returns three values; here the first is used as the labels,
# the second as the raw feature matrix and the third as the row identifiers.
y_train, x_brute_train, _ = load_csv_data("../data/train.csv")
_, x_brute_test, indices_test = load_csv_data("../data/test.csv")
# Train rows come first, test rows second: the row masks below are computed on
# the stacked matrix and later split back with `train_size`.
x_brute = np.concatenate((x_brute_train, x_brute_test))
train_size = x_brute_train.shape[0]
test_size = x_brute_test.shape[0]

# Constants
PHI_features = [15, 18, 20, 25, 28]
invalid_value = -999

# Mask to subdivide in different models
# Mask for the data (rows): one sub-model per value group of column 22
# (0, 1, and anything greater than 1).
# NOTE(review): column 22 is presumably a discrete count feature - confirm.
data_masks = [
    x_brute[:, 22] == 0,
    x_brute[:, 22] == 1,
    x_brute[:, 22] > 1
]
num_models = len(data_masks)

# Mask for the features (columns)
# The PHI exclusion does not depend on the row subset, so it is computed once.
# NOTE(review): assumes range_mask(30, PHI_features) returns a boolean mask over
# the 30 columns that is True at the PHI_features indices - confirm in helpers.
non_phi_columns = ~range_mask(30, PHI_features)

# For every row subset keep a column only if it varies inside the subset, holds
# at least one value different from `invalid_value`, and is not a PHI column.
features_masks = [
    (x_brute[m].std(axis=0) != 0)
    & np.any(x_brute[m] != invalid_value, axis=0)
    & non_phi_columns
    for m in data_masks
]

# Separate X and Y using the masks
# The first `train_size` entries of each row mask address the train rows, the
# remaining entries address the test rows.
ys_train = [y_train[mask[:train_size]] for mask in data_masks]
xs_brute_train = [x_brute_train[d_m[:train_size]][:, f_m] for d_m, f_m in zip(data_masks, features_masks)]
xs_brute_test = [x_brute_test[d_m[train_size:]][:, f_m] for d_m, f_m in zip(data_masks, features_masks)]

In [None]:
# FINAL PARAMETERS
# Every list holds one entry per sub-model, indexed like `xs_brute_train`
# (build_ith_x(i) reads position i of each list and forwards it to build_x).

degrees = [9, 11, 12]
roots = [3, 4, 3]
tanh_degrees = [3, 4, 3]
log_degrees = [5, 6, 5]
inv_log_degrees = [5, 6, 5]
fn_tanh = [True] * num_models
fn_log = [True] * num_models
fn_inv_log = [True] * num_models
# One independent list per model, so that changing the function set of one
# model can never silently change the others.
functions = [[mult, abs_dif] for _model in range(num_models)]

def build_ith_x(i):
    """Run `build_x` for sub-model `i` using the notebook-level parameter lists.

    The raw train/test matrices and every tuning parameter are looked up at
    position `i` of the corresponding global list at call time, so rebinding
    one of those lists changes what the next call builds.

    Parameters
    ----------
    i : int
        Index of the sub-model.

    Returns
    -------
    Whatever `build_x` returns, unchanged. Callers in this notebook unpack it
    as a pair and use the first item as the training matrix.
    """
    feature_options = dict(
        log_degree=log_degrees[i],
        tanh_degree=tanh_degrees[i],
        inv_log_degree=inv_log_degrees[i],
        fn_log=fn_log[i],
        fn_inv_log=fn_inv_log[i],
        fn_tanh=fn_tanh[i],
        functions=functions[i],
        print_=True,
    )
    raw_train = xs_brute_train[i]
    raw_test = xs_brute_test[i]
    return build_x(raw_train, raw_test, degrees[i], roots[i], **feature_options)

# PLOTS

In [None]:
def box_plot(data, filename=None):
    """Draw a box plot of `data` and optionally persist the data and the figure.

    Parameters
    ----------
    data : array-like
        Values handed as-is to `plt.boxplot`; for a 2-D array matplotlib draws
        one box per column. Outlier markers are hidden and the mean of every
        box is shown.
    filename : str, optional
        Base name without extension. When given, `data` is saved to
        ``matrices/<filename>.npy`` and the figure to ``plots/<filename>.png``.
        Both directories are created if they do not exist yet.
    """
    import os

    plt.figure(figsize=(15, 8))
    # Plot the argument itself (not a notebook global) so the function works
    # for any caller and on a fresh kernel.
    plt.boxplot(data, notch=False, sym='', showmeans=True)

    # Persist only after drawing, otherwise the saved PNG would be empty.
    if filename is not None:
        os.makedirs("matrices", exist_ok=True)
        os.makedirs("plots", exist_ok=True)
        np.save("matrices/{}.npy".format(filename), data)
        plt.savefig("plots/{}.png".format(filename))

    plt.show()

## LAMBDA

In [None]:
# Sweep the ridge penalty for one sub-model and collect, for every candidate
# lambda, the scores returned by repeated k-fold cross-validation.
model_selected = 0
lambdas_tuning = np.logspace(-8, -1, 15)

lambdas_tuning_scores = []
iters = 2
k_fold = 4

# The design matrix does not depend on lambda, so it is built once instead of
# once per candidate value.
x_train, _ = build_ith_x(model_selected)

for lambda_ in lambdas_tuning:
    scores = []
    for it in range(iters):
        # Each repetition uses its own seed (100, 101, ...). The labels must be
        # those of the selected sub-model, matching the rows of x_train.
        score = cross_validation_ridge(ys_train[model_selected], x_train, k_fold, lambda_, seed=100+it)
        scores.extend(score)
    # One list of scores per lambda, in the order of `lambdas_tuning`.
    lambdas_tuning_scores.append(scores)
    

In [None]:
# Hand the lambda-sweep scores (one column per lambda after the transpose) to
# box_plot under a file name that identifies the tuned sub-model.
box_plot(
    np.transpose(lambdas_tuning_scores),
    filename="lambdas_tuning_model-{}".format(model_selected),
)

## DEGREE

In [None]:
# Sweep the value stored in `degrees` for one sub-model and collect, for every
# candidate degree, the scores returned by repeated k-fold cross-validation.
# Relies on `iters` and `k_fold` defined in the lambda-tuning cell.
model_selected = 0
# NOTE(review): placeholder - presumably meant to be replaced by the lambda
# chosen from the lambda sweep; confirm cross_validation_ridge accepts None.
lambda_ = None

degrees_tuning = range(1, 15)
degrees_tuning_scores = []

# build_ith_x reads the global `degrees`, so the sweep has to rebind it.
# Remember the configured values and put them back afterwards, even if a run
# fails, so later cells do not silently use the last swept degree.
configured_degrees = degrees
try:
    for deg in degrees_tuning:
        degrees = [deg] * num_models

        x_train, _ = build_ith_x(model_selected)
        scores = []
        for it in range(iters):
            # Labels must be those of the selected sub-model, matching x_train.
            score = cross_validation_ridge(ys_train[model_selected], x_train, k_fold, lambda_, seed=100+it)
            scores.extend(score)
        # Keep every score (not just the mean) so the box plot can show one
        # distribution per degree, like the lambda sweep does.
        degrees_tuning_scores.append(scores)
finally:
    degrees = configured_degrees
    

In [None]:
# Hand the degree-sweep scores to box_plot under a file name that identifies
# the tuned sub-model.
box_plot(
    np.transpose(degrees_tuning_scores),
    filename="degrees_tuning_model-{}".format(model_selected),
)

## Different Models

In [None]:
# FINAL PARAMETERS
# Re-stated here so the cells below run with the final values regardless of
# what the tuning sweeps above did to `degrees`.
# Every list holds one entry per sub-model, indexed like `xs_brute_train`.

degrees = [9, 11, 12]
roots = [3, 4, 3]
tanh_degrees = [3, 4, 3]
log_degrees = [5, 6, 5]
inv_log_degrees = [5, 6, 5]
fn_tanh = [True] * num_models
fn_log = [True] * num_models
fn_inv_log = [True] * num_models
# One independent list per model, so that changing the function set of one
# model can never silently change the others.
functions = [[mult, abs_dif] for _model in range(num_models)]

def build_ith_x(i):
    """Run `build_x` for sub-model `i` using the notebook-level parameter lists.

    Same definition as the one given earlier in this notebook, repeated next
    to the re-stated final parameters. All inputs are looked up at position
    `i` of the corresponding global list at call time.

    Parameters
    ----------
    i : int
        Index of the sub-model.

    Returns
    -------
    Whatever `build_x` returns, unchanged.
    """
    feature_options = dict(
        log_degree=log_degrees[i],
        tanh_degree=tanh_degrees[i],
        inv_log_degree=inv_log_degrees[i],
        fn_log=fn_log[i],
        fn_inv_log=fn_inv_log[i],
        fn_tanh=fn_tanh[i],
        functions=functions[i],
        print_=True,
    )
    raw_train = xs_brute_train[i]
    raw_test = xs_brute_test[i]
    return build_x(raw_train, raw_test, degrees[i], roots[i], **feature_options)