In [1]:
import numpy as np

from helpers import *
from implementations import ridge_regression
from cross_validation import cross_validation_ridge
from functions import abs_dif, mult
from feature_eng import build_x
from predictions import predict_labels

In [2]:
# Loading the data sets
y_train, x_brute_train, _ = load_csv_data("../data/train.csv")
_, x_brute_test, indices_test = load_csv_data("../data/test.csv")
# Train rows are stacked first and test rows second, so every row mask built
# below can be split back at `train_size`.
x_brute = np.concatenate((x_brute_train, x_brute_test))
train_size = x_brute_train.shape[0]
test_size = x_brute_test.shape[0]

# Constants
PHI_features = [15, 18, 20, 25, 28]  # column indices excluded from every model
invalid_value = -999                 # sentinel: columns made only of it are dropped
split_column = 22                    # column whose value selects the sub-model

# Mask to subdivide in different models
# Mask for the data (rows): one boolean mask per sub-model, over train + test
split_values = x_brute[:, split_column]
data_masks = [
    split_values == 0,
    split_values == 1,
    split_values > 1,
]
num_models = len(data_masks)

# Mask for the features (columns): for each sub-model keep a column only if
#   * it is not constant on that sub-model's rows,
#   * it holds at least one value different from `invalid_value`,
#   * it is not listed in PHI_features.
# NOTE(review): assumes range_mask(30, idx) returns a length-30 boolean array
# that is True at `idx` and has no side effects - confirm in helpers.
excluded_features = range_mask(30, PHI_features)
features_masks = [
    (rows.std(axis=0) != 0)
    & np.any(rows != invalid_value, axis=0)
    & ~excluded_features
    for rows in (x_brute[m] for m in data_masks)
]

# Separate X and Y using the masks
# The first `train_size` entries of a row mask address the train rows,
# the remaining ones address the test rows.
ys_train = [y_train[mask[:train_size]] for mask in data_masks]
xs_brute_train = [
    x_brute_train[d_m[:train_size]][:, f_m]
    for d_m, f_m in zip(data_masks, features_masks)
]
xs_brute_test = [
    x_brute_test[d_m[train_size:]][:, f_m]
    for d_m, f_m in zip(data_masks, features_masks)
]

In [21]:
# Per-model ridge penalty (passed to ridge_regression / cross_validation_ridge)
lambdas = [5e-04, 1e-05, 5e-03]
# Number of folds handed to cross_validation_ridge
k_fold = 4

num_models = len(data_masks)
# Models variables: one entry per sub-model, all forwarded to build_x
degrees = [9, 11, 12]
roots = [3, 4, 3]
tanh_degrees = [3, 4, 3]
log_degrees = [3, 4, 3]
inv_log_degrees = [3, 4, 3]
fn_tanh = [True] * num_models
fn_log = [False] * num_models
fn_inv_log = [True] * num_models
# Build a fresh inner list per model: `[[...]] * n` would make every model
# share the same list object, so editing one entry would change all of them.
functions = [[mult, abs_dif] for _ in range(num_models)]

# Fail early if a per-model setting does not have exactly one entry per model.
assert all(
    len(setting) == num_models
    for setting in (
        lambdas, degrees, roots, tanh_degrees, log_degrees, inv_log_degrees,
        fn_tanh, fn_log, fn_inv_log, functions,
    )
), "every per-model setting needs exactly num_models entries"

def build_ith_x(i):
    """Run build_x with the settings of the i-th sub-model.

    Reads the i-th entry of the notebook-level per-model lists (raw train/test
    features, degrees, roots, the log / tanh / inverse-log degrees with their
    on/off flags, and the pairwise functions) and forwards them to build_x
    with printing enabled.

    Returns whatever build_x returns; the training loop unpacks it as
    ``(x_train, x_test)``.
    """
    keyword_options = dict(
        log_degree=log_degrees[i],
        tanh_degree=tanh_degrees[i],
        inv_log_degree=inv_log_degrees[i],
        fn_log=fn_log[i],
        fn_inv_log=fn_inv_log[i],
        fn_tanh=fn_tanh[i],
        functions=functions[i],
        print_=True,
    )
    return build_x(
        xs_brute_train[i],
        xs_brute_test[i],
        degrees[i],
        roots[i],
        **keyword_options
    )

# Cross Validation

In [22]:
final_scores = []   # mean cross-validation score (x100) of each sub-model
ys_sub = []         # test-set predictions of each sub-model
iters = 2           # number of cross-validation repetitions (one seed each)
for i in range(len(data_masks)):
    # Expand the raw features of sub-model i into its train / test matrices.
    x_train, x_test = build_ith_x(i)
    print("x[{}] DONE".format(i))

    y_i, lambda_i = ys_train[i], lambdas[i]

    # Fit on the whole training subset; keep the test predictions for the
    # submission cell.
    w, _ = ridge_regression(y_i, x_train, lambda_i)
    ys_sub.append(predict_labels(w, x_test))

    # Repeat the k-fold cross-validation with seeds 100, 101, ... and scale
    # each returned score by 100.
    scores = [
        cross_validation_ridge(y_i, x_train, k_fold, lambda_i, seed=100 + it) * 100
        for it in range(iters)
    ]
    final_scores.append(np.mean(scores))

    # The expanded matrices are large; release them before the next model.
    del x_train, x_test

final_scores

9 3 3 3 3 False True True [<function mult at 0x7fac3c0adae8>, <function abs_dif at 0x7fac3c0ada60>]
Starting pre-processing
Starting poly
Starting combinations
Final shape: (327371, 2281)
x[0] DONE
11 4 4 4 4 False True True [<function mult at 0x7fac3c0adae8>, <function abs_dif at 0x7fac3c0ada60>]
Starting pre-processing
Starting poly
Starting combinations
Final shape: (252882, 3331)
x[1] DONE
12 3 3 3 3 False True True [<function mult at 0x7fac3c0adae8>, <function abs_dif at 0x7fac3c0ada60>]
Starting pre-processing
Starting poly
Starting combinations
Final shape: (237985, 6126)
x[2] DONE


[85.164444711345979, 81.747137109254112, 85.027570995312928]

In [23]:
# Display the per-model mean cross-validation scores (already scaled by 100).
final_scores

[85.164444711345979, 81.747137109254112, 85.027570995312928]

In [24]:
def avg_score(scores):
    """Combine per-model scores into one size-weighted average.

    Each score is weighted by the share of all rows (train + test) that the
    corresponding entry of ``data_masks`` selects.

    Parameters
    ----------
    scores : sequence of float
        One score per sub-model, in the same order as ``data_masks``.

    Returns
    -------
    numpy.float64
        Sum over the models of ``score * rows_in_model / (train_size + test_size)``.
    """
    # Count the selected rows mask by mask instead of stacking all masks into
    # one (num_models, N) array first.
    rows_per_model = np.array([mask.sum() for mask in data_masks])
    weights = rows_per_model / (train_size + test_size)
    # Use the argument, not the notebook-level `final_scores`, so the function
    # scores whatever list the caller passes in.
    return (np.asarray(scores) * weights).sum()

In [25]:
avg_score(final_scores)

84.068492833845397

# Submission

In [26]:
# Put each sub-model's predictions back at the positions of its test rows.
# Rows selected by no mask keep the initial value 0.
y_submission = np.zeros(test_size)
for model_predictions, full_mask in zip(ys_sub, data_masks):
    # Masks span train + test; the part after `train_size` addresses test rows.
    test_rows = full_mask[train_size:]
    y_submission[test_rows] = model_predictions

create_csv_submission(indices_test, y_submission, "submissions/final_sub_03.csv")