In [2]:
import numpy as np

from helpers import *
from implementations import ridge_regression
from functions import abs_dif, inv_log, mult 
from preprocessing import replace_invalid, standardize
from feature_eng import build_poly, build_x
from cross_validation import cross_validation_ridge
from predictions import predict_labels

In [3]:
# Load the training CSV: labels and the raw (unprocessed) feature matrix.
# The third value returned by load_csv_data is discarded for the training set.
y_train, x_brute_train, _ = load_csv_data("../data/train.csv")

In [4]:
# Load the test CSV: the first returned value is discarded; `indices_test` is
# kept because it is passed to create_csv_submission at the end of the notebook.
_, x_brute_test, indices_test = load_csv_data("../data/test.csv")

In [5]:
# Sanity check: display the shapes of the labels and of both raw feature matrices.
tuple(arr.shape for arr in (y_train, x_brute_train, x_brute_test))

((250000,), (250000, 30), (568238, 30))

In [6]:
# Stack the train rows on top of the test rows so that masks computed on
# `x_brute` cover both sets (train rows first, then test rows).
x_brute = np.concatenate([x_brute_train, x_brute_test], axis=0)
x_brute.shape

(818238, 30)

In [7]:
# Row counts of each set; `train_size` is the offset at which the test rows
# start inside the concatenated `x_brute`.
train_size, test_size = len(x_brute_train), len(x_brute_test)

(train_size, test_size)

(250000, 568238)

In [8]:
# Sentinel value marking an invalid/missing measurement in the raw data
# (the feature-mask cell below compares columns against -999).
invalid_value = -999

In [9]:
# Column names of the raw feature matrix, in column order (index i names column i).
features_name = [
    # Derived ("DER_") quantities
    "DER_mass_MMC",
    "DER_mass_transverse_met_lep",
    "DER_mass_vis",
    "DER_pt_h",
    "DER_deltaeta_jet_jet",
    "DER_mass_jet_jet",
    "DER_prodeta_jet_jet",
    "DER_deltar_tau_lep",
    "DER_pt_tot",
    "DER_sum_pt",
    "DER_pt_ratio_lep_tau",
    "DER_met_phi_centrality",
    "DER_lep_eta_centrality",
    # Primitive ("PRI_") quantities
    "PRI_tau_pt",
    "PRI_tau_eta",
    "PRI_tau_phi",
    "PRI_lep_pt",
    "PRI_lep_eta",
    "PRI_lep_phi",
    "PRI_met",
    "PRI_met_phi",
    "PRI_met_sumet",
    "PRI_jet_num",
    "PRI_jet_leading_pt",
    "PRI_jet_leading_eta",
    "PRI_jet_leading_phi",
    "PRI_jet_subleading_pt",
    "PRI_jet_subleading_eta",
    "PRI_jet_subleading_phi",
    "PRI_jet_all_pt",
]

In [10]:
# Column indices of the features whose name contains "_phi" but not "_phi_"
# (so "DER_met_phi_centrality" is not selected).
PHI_features = [
    col
    for col, name in enumerate(features_name)
    if "_phi" in name and "_phi_" not in name
]

PHI_features

[15, 18, 20, 25, 28]

# Conditioning on feature 22 (`PRI_jet_num`)

In [11]:
def verify_masks(masks, expected_total=None):
    """Check that `masks` split the rows into non-overlapping, exhaustive groups.

    Prints the number of selected rows of every mask, then asserts that
    (1) the counts add up to `expected_total` and (2) every row is selected
    by exactly one mask. The second check matters because a matching total
    alone can hide an overlap that happens to compensate for a gap.

    Parameters
    ----------
    masks : sequence of boolean numpy arrays, all of the same length.
    expected_total : int, optional
        Number of rows the masks must cover. Defaults to the number of rows
        of the module-level `x_brute`.

    Returns
    -------
    int
        The number of masks.

    Raises
    ------
    AssertionError
        If the counts do not add up or a row is covered zero or several times.
    """
    if expected_total is None:
        expected_total = x_brute.shape[0]

    total = 0
    for mask in masks:
        num = mask.sum()
        print(num)
        total += num
    assert total == expected_total, \
        "masks select {} rows in total, expected {}".format(total, expected_total)

    if len(masks) > 0:
        # Number of masks selecting each row; a proper partition gives 1 everywhere.
        coverage = np.sum(masks, axis=0)
        assert np.all(coverage == 1), "masks overlap or leave some rows uncovered"
    return len(masks)

In [12]:
# Column 22 is "PRI_jet_num" in `features_name`; the rows (train + test) are
# split into three groups according to its value: 0, 1, and more than 1.
jet_num = x_brute[:, 22]
data_masks = [jet_num == 0, jet_num == 1, jet_num > 1]

# Prints the size of each group and displays the number of groups.
verify_masks(data_masks)

327371
252882
237985


3

### Mask on Y

In [13]:
# Training labels of each group: only the first `train_size` entries of a
# mask refer to training rows.
ys_train = [y_train[subset_rows[:train_size]] for subset_rows in data_masks]

[np.shape(labels) for labels in ys_train]

[(99913,), (77544,), (72543,)]

### Mask on X

In [14]:
# Mask over the feature columns built from the "_phi" column indices.
# The length is taken from the data instead of a hard-coded 30 so that it
# always matches the per-column masks it is combined with.
# NOTE(review): presumably range_mask returns a boolean array that is True at
# the given indices (it is negated with `~` where it is used) - confirm in helpers.
mask_phi_features = range_mask(x_brute.shape[1], PHI_features)

In [15]:
def _usable_features(x_subset):
    """Return the column mask of the features worth keeping for one row group.

    A column is kept when it is not constant within the group, holds at least
    one value different from `invalid_value`, and is not flagged by
    `mask_phi_features`.
    """
    varies = x_subset.std(axis=0) != 0
    has_valid_value = np.any(x_subset != invalid_value, axis=0)
    return varies & has_valid_value & ~mask_phi_features


# One column mask per row group. The rows of a group are extracted only once
# (boolean indexing copies the data) and the sentinel comes from
# `invalid_value` instead of a repeated literal.
features_masks = [_usable_features(x_brute[m]) for m in data_masks]

In [16]:
# Raw training matrix of each group: keep the group's training rows, then
# the columns retained for that group.
xs_brute_train = [
    x_brute_train[row_mask[:train_size]][:, col_mask]
    for row_mask, col_mask in zip(data_masks, features_masks)
]

[np.shape(x_subset) for x_subset in xs_brute_train]

[(99913, 15), (77544, 18), (72543, 25)]

In [17]:
# Raw test matrix of each group: the entries of a mask from `train_size`
# onwards refer to test rows; the same per-group columns are kept.
xs_brute_test = [
    x_brute_test[row_mask[train_size:]][:, col_mask]
    for row_mask, col_mask in zip(data_masks, features_masks)
]

[np.shape(x_subset) for x_subset in xs_brute_test]

[(227458, 15), (175338, 18), (165442, 25)]

In [18]:
# Per-group settings: entry i of every list below belongs to data_masks[i].
n = len(data_masks)

# Cross-validation / ridge settings.
k_fold = 5
lambdas = [5e-4, 1e-5, 5e-3]

# Feature-expansion settings forwarded to build_x by build_ith_x.
degrees = [8, 11, 12]
roots = n * [3]
log_degrees = n * [5]
inv_log_degrees = n * [5]
fn_log = n * [True]
fn_inv_log = n * [True]
# Note: list repetition makes every entry refer to the same inner list object.
functions = n * [[mult, abs_dif]]

def build_ith_x(i):
    """Build the expanded feature matrices of group `i`.

    Reads entry `i` of the module-level setting lists at call time (so
    rebinding those lists changes the result) and forwards them to
    `build_x` together with `print_=True`.

    Returns whatever `build_x` returns; callers unpack it into two values.
    """
    positional = (xs_brute_train[i], xs_brute_test[i], degrees[i], roots[i])
    keywords = dict(
        log_degree=log_degrees[i],
        inv_log_degree=inv_log_degrees[i],
        fn_log=fn_log[i],
        fn_inv_log=fn_inv_log[i],
        functions=functions[i],
        print_=True,
    )
    return build_x(*positional, **keywords)

# Cross Validation

In [19]:
# For every group: cross-validate the ridge model, then fit it on the whole
# group and predict the labels of the group's test rows.
scores, ys_sub = [], []
for i in range(len(data_masks)):
    x_train, x_test = build_ith_x(i)
    print("x[{}] DONE".format(i))

    # Cross-validation scores, rescaled by 100 before being stored.
    score = 100 * cross_validation_ridge(ys_train[i], x_train, k_fold, lambdas[i], seed=None)
    scores.append(score)

    # Only the first value returned by ridge_regression (the weights) is used.
    w, _ = ridge_regression(ys_train[i], x_train, lambdas[i])
    ys_sub.append(predict_labels(w, x_test))

    # Drop the references to the expanded matrices before the next group is built.
    del x_train, x_test

scores

8 3 mf None 5 5 True True [<function mult at 0x7f9e600f5f28>, <function abs_dif at 0x7f9e600f5ea0>]
Starting pre-processing
Starting poly
Starting combinations


KeyboardInterrupt: 

In [None]:
# Mean cross-validation score of each group.
[np.mean(fold_scores) for fold_scores in scores]

In [None]:
def avg_score(scores):
    """Combine the per-group scores into a single size-weighted average.

    The mean score of each group is weighted by the number of rows
    (train + test) selected by that group's mask in `data_masks`, and the
    weighted sum is divided by the total number of rows.

    Assumes `scores` holds one equally long sequence of scores per group,
    in the same order as `data_masks`.
    """
    mean_per_group = np.array(scores).T.mean(axis=0)
    rows_per_group = np.array(data_masks).T.sum(axis=0)
    weighted = mean_per_group * rows_per_group / (train_size + test_size)
    return np.sum(weighted)

In [None]:
# Size-weighted average of the cross-validation scores over all groups.
avg_score(scores)

# Params Tuning

In [None]:
# Settings of the tuning experiment below.
i = 2                                # index of the group being tuned
iters = 3                            # cross-validation repetitions per candidate
lambda_ = 1e-4                       # ridge penalty used during tuning
seed = np.random.randint(1000000)    # base seed, drawn anew at every run of this cell

def none_if_in(a, seq):
    """Return `None` when `a` is contained in `seq`, otherwise `a` itself."""
    return None if a in seq else a
# Candidate values for the parameter under study; alternative grids are kept
# below for reference.
#tuning = range(1, 15)
#tuning = [none_if_in(a, [1]) for a in range(1, 11)]
#tuning = [none_if_in(a, [0]) for a in range(0, 11)]
tuning = [False, True]
#tuning = [None, [mult, abs_dif], [abs_dif], [mult, abs_dif]]
param_tuning_scores = []

# NOTE(review): `v` is not referenced in the loop body; presumably one entry
# of the setting lists below is meant to be edited by hand to use `v` - confirm.
for idx, v in enumerate(tuning):
    # These rebind the module-level setting lists that build_ith_x reads.
    degrees = [8, 11, 13]
    roots = [None, 6, 2]
    log_degrees = [1, 7, 1]
    inv_log_degrees = [1, 1, 1]
    fn_log = [True, True, False]
    fn_inv_log = [True, True, False]
    functions = [[mult, abs_dif], [mult, abs_dif], [mult, abs_dif]]
    
    x_train, _ = build_ith_x(i)
    scores = np.array([])
    for it in range(iters):
        # Offset the seed by the repetition counter `it`. The previous offset
        # was the constant group index `i`, so every repetition received the
        # same seed.
        score = cross_validation_ridge(ys_train[i], x_train, k_fold=4, lambda_=lambda_, seed=seed+it)
        scores = np.concatenate((scores, score))
    param_tuning_scores.append(scores)
    # Number of expanded features, then the mean score over all repetitions.
    print("{}".format(x_train.shape[1]))
    print("{}".format(np.mean(scores)))


In [None]:
# Box plot of the tuning scores, one box per candidate value. The label,
# with spaces turned into underscores, is used as the output file name.
# NOTE: box_plot is defined in a later cell of this notebook; that cell must
# have been run before this one.
box_plot(
    np.array(param_tuning_scores).T,
    "_".join("13 2 mf None 1 1 False COMB-INV-LOG (mult,abs-dif)".split(" ")),
)

# HyperParams Tuning

In [None]:
# Grid search over the ridge penalty for one group.
hyper_tuning_scores = []
tuning_lambdas = np.logspace(-6, -1, num=11)
i = 0
seed = np.random.randint(1000000)

# The expanded matrix does not depend on lambda_, so it is built once
# instead of once per candidate.
x_train, _ = build_ith_x(i)

# NOTE(review): `tqdm` is not imported explicitly in this notebook;
# presumably it comes from `from helpers import *` - confirm.
for lambda_ in tqdm(tuning_lambdas, ncols=100):
    # The same seed is used for every candidate.
    score = cross_validation_ridge(ys_train[i], x_train, k_fold=4, lambda_=lambda_, seed=seed)
    # Collect into the list that is initialised above and displayed in the next
    # cell (the previous code appended to an undefined `testing_scores`).
    hyper_tuning_scores.append(score)

In [None]:
# Display the scores collected for every candidate lambda (same order as tuning_lambdas).
hyper_tuning_scores

# Visualization

In [None]:
# `plt` is otherwise only imported in a later cell; importing it here lets
# this cell run on a fresh kernel when the notebook is executed top to bottom.
import matplotlib.pyplot as plt

# NOTE(review): `scores` is rebound by the parameter-tuning cell above, so
# what is plotted depends on which cell ran last - confirm the intended input.
plt.boxplot(np.array(scores).T)
plt.show()

In [None]:

import matplotlib.pyplot as plt
def box_plot(data, filename=None):
    """Draw a box plot of `data` and optionally save the data and the figure.

    Parameters
    ----------
    data : array-like
        Values handed to `plt.boxplot`; a 2-D array gives one box per column.
    filename : str, optional
        When given, `data` is saved to "data/matrices/<filename>.npy" and the
        figure to "data/plots/<filename>.png". Both directories must exist.

    The boxes are drawn without notches and without outlier markers, with
    the mean of each box shown.
    """
    plt.figure(figsize=(15,8))
    # Plot the argument itself; the previous code ignored `data` and always
    # plotted the global `param_tuning_scores`.
    plt.boxplot(data, notch=False, sym='', showmeans=True)

    if filename is not None:
        np.save("data/matrices/{}.npy".format(filename), data)
        # Save only once the boxes are drawn, otherwise the PNG is empty.
        plt.savefig("data/plots/{}.png".format(filename))

    plt.show()


# Submission

In [None]:
# Scatter the per-group predictions back to their original test positions.
# The entries of a mask from `train_size` onwards select that group's test rows.
y_submission = np.zeros(test_size)
for subset_pred, subset_mask in zip(ys_sub, data_masks):
    y_submission[subset_mask[train_size:]] = subset_pred

In [None]:
# Write the submission file from the test indices and the assembled predictions.
create_csv_submission(indices_test, y_submission, "submissions/pred29.csv")