In [None]:
import numpy as np
import itertools as it

from helpers import *

from tqdm import tqdm
from implementations import *
from cross_validation import build_k_fold_sets
from feature_eng import *
from data_cleaning import replace_invalid
from plots import display_features

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training CSV; the third value returned by load_csv_data is not used here.
y_train, x_brute_train, _ = load_csv_data("../data/train.csv")

In [None]:
# Load the test CSV the same way; the third value returned by load_csv_data is not used here.
y_test, x_brute_test, _ = load_csv_data("../data/test.csv")

In [None]:
# Stack the raw feature matrices row-wise: all train rows first, then all test rows.
# Later cells rely on this ordering when they slice with `train_size`.
x_brute = np.concatenate([x_brute_train, x_brute_test], axis=0)
x_brute.shape

In [5]:
# Entries equal to -999.0 are treated as invalid measurements.
mask_invalid = np.equal(x_brute, -999.0)
# presumably replaces each invalid entry by the mean of its column — confirm in data_cleaning
x_replaced_invalid = replace_invalid(x_brute, mask_invalid, replace_by="mean")
x_replaced_invalid.shape

(818238, 30)

In [6]:
def verify_masks(masks):
    """Check that the masks select, in total, as many rows as `x_brute` has.

    Only the summed count of selected entries is compared with
    `x_brute.shape[0]`; the check does not prove the masks are disjoint.

    Returns:
        The number of masks, when the assertion holds.
    """
    selected = sum(mask.sum() for mask in masks)
    assert selected == x_brute.shape[0]
    return len(masks)

In [7]:
# Per-feature minimum computed over valid entries only (invalid ones are masked out).
x_min_values = np.ma.masked_array(x_brute, mask=mask_invalid).min(axis=0)
x_min_values.shape

(30,)

In [8]:
# Threshold on the shifted log of feature 8 used to split the rows in two groups.
limit = 2.7

# Shift every feature so its valid minimum maps to 1, take the log, keep column 8.
# Computed once: the original evaluated this full-matrix log twice.
log_feat_8 = np.log(x_replaced_invalid - x_min_values + 1)[:, 8]

masks_feat_8 = [
    log_feat_8 <= limit,
    log_feat_8 > limit,
]

# A NaN in log_feat_8 would belong to neither mask; verify_masks would then fail.
verify_masks(masks_feat_8)

2

In [9]:
# Split the rows by the value of feature 22: exactly 0, exactly 1, or greater than 1.
feat_22 = x_replaced_invalid[:, 22]
masks_feat_22 = [feat_22 == 0, feat_22 == 1, feat_22 > 1]

verify_masks(masks_feat_22)

3

In [10]:
# Cross the feature-8 split with the feature-22 split: one row mask per pair,
# ordered with the feature-8 mask as the outer index.
data_masks = []
for mask_i, mask_j in it.product(masks_feat_8, masks_feat_22):
    joint_mask = mask_i & mask_j
    data_masks.append(joint_mask)
    print(joint_mask.sum())

verify_masks(data_masks)

192086
133623
92524
135285
119259
145461


6

In [11]:
# One boolean vector per row subset. NOTE: despite the name, an entry is True when the
# feature is NOT entirely invalid within that subset (its invalid count differs from
# the subset's row count), i.e. True marks features worth keeping.
invalid_features_mask = [mask_invalid[mask].sum(axis=0) != mask.sum() for mask in data_masks]

In [12]:
xs_replaced_invalid = []
xs_features_mask = []

for subset_mask, has_valid_values in zip(data_masks, invalid_features_mask):
    subset = x_replaced_invalid[subset_mask]

    # Drop features that are constant inside this subset (e.g. feature 22 itself,
    # since the subsets are conditioned on it) ...
    varies = subset.std(axis=0) != 0
    # ... and features that contain only invalid values inside this subset.
    features_mask = has_valid_values & varies

    xs_features_mask.append(features_mask)
    xs_replaced_invalid.append(subset[:, features_mask])

[x.shape for x in xs_replaced_invalid]

[(192086, 18),
 (133623, 22),
 (92524, 30),
 (135285, 18),
 (119259, 22),
 (145461, 30)]

In [14]:
# Same row/feature selection as above, applied to the raw matrix (invalid values kept).
xs_brute = [
    x_brute[rows][:, cols]
    for rows, cols in zip(data_masks, xs_features_mask)
]

[x.shape for x in xs_brute]

[(192086, 18),
 (133623, 22),
 (92524, 30),
 (135285, 18),
 (119259, 22),
 (145461, 30)]

In [61]:
xs_log = []

for x in xs_replaced_invalid:
    # Keep only the columns in which no entry is <= 0, so the log is defined everywhere.
    has_non_positive = (x <= 0).any(axis=0)
    loggable = x[:, np.logical_not(has_non_positive)]
    xs_log.append(np.log(loggable))

[f.shape for f in xs_log]

[(192086, 10),
 (133623, 12),
 (92524, 16),
 (135285, 12),
 (119259, 12),
 (145461, 16)]

In [64]:
# Append the log-transformed columns to the right of the cleaned features of each subset.
xs_mix = [
    np.hstack((cleaned, logged))
    for cleaned, logged in zip(xs_replaced_invalid, xs_log)
]

[f.shape for f in xs_mix]

[(192086, 28),
 (133623, 34),
 (92524, 46),
 (135285, 30),
 (119259, 34),
 (145461, 46)]

In [66]:
# Standardize each subset independently with the project's `standardize` helper.
xs_standardized = list(map(standardize, xs_mix))

[x.shape for x in xs_standardized]

[(192086, 28),
 (133623, 34),
 (92524, 46),
 (135285, 30),
 (119259, 34),
 (145461, 46)]

In [67]:
# One polynomial degree per row subset (currently the same degree for all of them).
degrees = [11] * len(data_masks)

poly_features = []
for x, degree in tqdm(zip(xs_standardized, degrees), ncols=100):
    poly_features.append(build_poly(x, degree))

[f.shape for f in poly_features]

6it [00:26,  4.49s/it]


[(192086, 336),
 (133623, 408),
 (92524, 552),
 (135285, 360),
 (119259, 408),
 (145461, 552)]

In [68]:
combinations_over = xs_standardized

# Pairwise products of distinct columns, one output matrix per subset.
# NOTE(review): ordered pairs are used, so both x_i * x_j and x_j * x_i are emitted;
# every product therefore appears twice among the output columns.
combinations_features = []
for x in tqdm(combinations_over, ncols=100):
    n_rows, n_cols = x.shape
    combinations = [
        (x[:, i] * x[:, j]).reshape((n_rows, 1))
        for i, j in it.permutations(range(n_cols), 2)
    ]
    combinations_features.append(np.concatenate(combinations, axis=1))


[f.shape for f in combinations_features]

100%|█████████████████████████████████████████████████████████████████| 6/6 [01:13<00:00, 12.31s/it]


[(192086, 756),
 (133623, 1122),
 (92524, 2070),
 (135285, 870),
 (119259, 1122),
 (145461, 2070)]

In [73]:
train_size = x_brute_train.shape[0]
test_size = x_brute_test.shape[0]

# True for the leading train rows of the stacked matrix, False for the trailing test rows.
train_mask = np.r_[[True] * train_size, [False] * test_size]

# Number of training rows that fall into each subset.
xs_train_size = [(mask & train_mask).sum() for mask in data_masks]

list(xs_train_size)

[58823, 40904, 28236, 41090, 36640, 44307]

# Parameter tuning

In [108]:
def to_csv(scores, params, filename):
    """Append (score, params) rows to a CSV file.

    Args:
        scores: iterable of scores, one per row.
        params: iterable of parameter descriptions, paired element-wise with `scores`.
        filename: path of the CSV file; it is opened in append mode, so repeated
            calls accumulate rows.
    """
    # Local import: the notebook's import cell does not import `csv`.
    import csv

    # newline="" is what the csv module requires to avoid blank lines on some platforms.
    with open(filename, "a", encoding="utf8", newline="") as output_file:
        # The original passed the undefined name `csvfile` here (NameError).
        writer = csv.writer(output_file, delimiter=',')
        writer.writerows(zip(scores, params))

In [122]:
# Number of top results kept while tuning.
remember_n = 5
# Parallel lists: best_scores[k] was obtained with best_params[k].
# Slots start at score 0 / params None until a better score replaces them.
best_scores = [0] * remember_n
best_params = [None] * remember_n

def reset_n_best(filename):
    """Write the current best scores/params to `filename`, then clear them.

    The rows are written through `to_csv`. Afterwards the module-level
    `best_scores` and `best_params` are reset to `remember_n` empty slots
    (score 0, params None). Slots that were never filled are written as-is.
    """
    # Without this declaration the assignments below make both names local,
    # and the to_csv call fails with UnboundLocalError.
    global best_scores, best_params

    to_csv(best_scores, best_params, filename)
    best_scores = [0] * remember_n
    best_params = [None] * remember_n
    
def remember_n_best(score, params):
    """Record `score`/`params` if the score beats the weakest remembered one.

    The slot holding the lowest score in `best_scores` (first one on ties) is
    overwritten in place, together with the matching slot of `best_params`.

    Returns:
        True if the entry was stored, False otherwise.
    """
    weakest = int(np.argmin(best_scores))
    if not best_scores[weakest] < score:
        return False

    best_scores[weakest] = score
    best_params[weakest] = params
    return True

In [113]:
def cross_validation_ridge(y_train, x_train, k_fold, lambda_, seed=None):
    """Estimate ridge-regression accuracy with k-fold cross-validation.

    Args:
        y_train: labels, aligned row by row with `x_train`.
        x_train: feature matrix.
        k_fold: number of folds, forwarded to `build_k_fold_sets`.
        lambda_: regularisation strength, forwarded to `ridge_regression`.
        seed: forwarded unchanged to `build_k_fold_sets`. The default is None,
            which is the value the previous default expression
            `np.random.seed()` evaluated to; writing None directly avoids
            touching the global RNG state when the function is defined.

    Returns:
        (mean, std) of the per-fold validation accuracy, both in percent.
    """
    accuracies = []
    for x_tr, x_va, y_tr, y_va in build_k_fold_sets(y_train, x_train, k_fold, seed):
        w = ridge_regression(y_tr, x_tr, lambda_)

        # Only the held-out fold is scored; the training accuracy that used to be
        # computed here was never used.
        y_va_pred = predict_labels(w, x_va)
        accuracies.append((y_va_pred == y_va).mean())

    accuracies = np.array(accuracies)
    return accuracies.mean() * 100, accuracies.std() * 100

In [114]:
# Module-level settings read by find_best_standardized:
# number of cross-validation folds and number of repeated cross-validation runs.
k_fold = 5
iters = 4

# Candidate ridge regularisation strengths.
lambdas = [0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]
def create_masks(length, size):
    """Return every `size`-element combination of the indices 0..length-1.

    The result is a list of index tuples in the order produced by
    `itertools.combinations`.
    """
    indices = range(length)
    return [combo for combo in it.combinations(indices, size)]
def poly_mask_from_to(idx):
    """Return a (start, stop) pair of column bounds for `poly_features[idx]`.

    `start` is twice the column count of `xs_standardized[idx]` and `stop` is
    the column count of `poly_features[idx]`.
    """
    # presumably skips the first two blocks of polynomial columns — confirm against build_poly
    start = xs_standardized[idx].shape[1] * 2
    stop = poly_features[idx].shape[1]
    return (start, stop)
def combinations_features_from_to(idx):
    """Return (0, n_columns) for `combinations_features[idx]`, i.e. its full column range."""
    n_columns = combinations_features[idx].shape[1]
    return (0, n_columns)

In [115]:
# Training labels of each subset: the first `train_size` entries of every row mask
# correspond to the train rows, because train was stacked before test.
ys_train = [y_train[mask[:train_size]] for mask in data_masks]

In [118]:
def find_best_standardized(lambdas, xs_standardized):
    """Grid-search ridge `lambda_` values on each standardized subset.

    For every subset, every lambda and every index mask, the cross-validated
    accuracy is averaged over `iters` runs and offered to `remember_n_best`.
    After each subset the retained results are written to
    "best_std_{i}_{s}.csv" and cleared through `reset_n_best`.

    Args:
        lambdas: iterable of regularisation strengths to try.
        xs_standardized: list of feature matrices, one per subset, with the
            training rows first; paired positionally with the module-level
            `ys_train`.

    Reads the module-level `ys_train`, `iters` and `k_fold`.
    """
    mask_size = 1
    n_subsets = min(len(ys_train), len(xs_standardized))
    for i_x, (y, x) in tqdm(enumerate(zip(ys_train, xs_standardized)), total=n_subsets, ncols=100):
        # `x` holds train rows followed by test rows, whereas `y` only has the
        # train labels: keep just the leading rows that have a label.
        x_tr = x[:len(y)]

        for lambda_ in lambdas:
            # NOTE(review): `m` is recorded with the score but never applied to the
            # features, so every mask is evaluated on the same matrix — confirm intent.
            for m in create_masks(x_tr.shape[1], mask_size):
                scores = []
                stds = []
                for _ in range(iters):
                    score, std = cross_validation_ridge(y, x_tr, k_fold, lambda_)
                    scores.append(score)
                    stds.append(std)
                score = np.array(scores).mean()
                # Was `np.array(std)`: averaged only the last run's std.
                std = np.array(stds).mean()
                remember_n_best(score, [lambda_, m])
        reset_n_best("best_std_{i}_{s}.csv".format(i=i_x+1, s=mask_size))
                

In [123]:
# Run the lambda search on every standardized subset; results are written to
# best_std_*.csv files. The stored output below shows this run was interrupted.
find_best_standardized(lambdas, xs_standardized)

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [None]:
lambda_ = 0.001

# NOTE: these rebind the module-level settings that find_best_standardized also reads.
k_fold = 6
iters = 2

losses = []

ys_train = [y_train[mask[:train_size]] for mask in data_masks]

# NOTE(review): `xs_train` is not defined in any cell shown above — it must come from
# elsewhere (one training feature matrix per subset, aligned with `ys_train`).
for y_tr, x_tr in zip(ys_train, xs_train):
    v_losses = []
    for _ in range(iters):
        # cross_validation_ridge returns (mean, std) already in percent; only the mean
        # is collected. The original extended the list with both values, mixing
        # accuracies and standard deviations in the same average.
        mean_score, _std_score = cross_validation_ridge(y_tr, x_tr, k_fold, lambda_)
        v_losses.append(mean_score)
    np_v_losses = np.array(v_losses)

    # Weight each subset by its share of the training rows.
    losses.append(np_v_losses.mean() * x_tr.shape[0] / train_size)

    # NOTE(review): the printed values are percentages of correct predictions
    # (cross_validation_ridge scores `prediction == label`), despite the "Error" label.
    print("Test Error Mean = {}".format(np_v_losses.mean()))
    print("Test Error St.D = {}".format(np_v_losses.std()))

np_losses = np.array(losses)
print("Final Test Error = {}".format(np_losses.sum()))