In [1]:
import numpy as np
import itertools as it
import sys

from helpers import *

from tqdm import tqdm
from implementations import *
from cross_validation import build_k_fold_sets
from feature_eng import *
from data_cleaning import replace_invalid
from plots import display_features

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training set; the third value returned by load_csv_data is discarded.
y_train, x_brute_train, _ = load_csv_data("../data/train.csv")

In [3]:
# Load the test set the same way; the third return value is again discarded.
y_test, x_brute_test, _ = load_csv_data("../data/test.csv")

In [4]:
# Stack the training rows first and the test rows after them, so the feature
# engineering below is applied to both sets at once.
x_brute = np.concatenate((x_brute_train, x_brute_test))
x_brute.shape

(818238, 30)

In [5]:
# Flag every entry equal to the -999.0 sentinel, then hand the flags to replace_invalid
# (called with replace_by="mean"; the exact fill rule lives in data_cleaning.replace_invalid).
mask_invalid = x_brute == -999.0
x_replaced_invalid = replace_invalid(x_brute, mask_invalid, replace_by="mean")
x_replaced_invalid.shape

(818238, 30)

In [6]:
def verify_masks(masks):
    """Check that the masks' True counts add up to the number of rows of `x_brute`.

    Only the counts are compared: the assertion does not prove that the masks
    are pairwise disjoint, just that their sizes sum to the row count.

    Args:
        masks: sequence of boolean row masks.

    Returns:
        The number of masks received.

    Raises:
        AssertionError: if the summed counts differ from `x_brute.shape[0]`.
    """
    covered_rows = sum(mask.sum() for mask in masks)
    assert covered_rows == x_brute.shape[0]
    return len(masks)

In [7]:
# Per-feature minimum taken through a masked array, so entries flagged in
# mask_invalid are excluded from the minimum.
x_min_values = np.ma.array(x_brute, mask=mask_invalid).min(axis=0)
x_min_values.shape

(30,)

In [8]:
# Threshold applied to the log-shifted value of feature 8 to split the rows in two.
limit = 2.7

# Shift feature 8 so its smallest valid value maps to 1, then take the log.
# Only column 8 is needed, so it is computed once on that column instead of
# log-transforming the whole matrix twice.
feat_8_log = np.log(x_replaced_invalid[:, 8] - x_min_values[8] + 1)

masks_feat_8 = [
    feat_8_log <= limit,
    feat_8_log > limit
]

# Sanity check: the two halves must account for every row.
verify_masks(masks_feat_8)

2

In [9]:
# Partition the rows by the value of feature 22: exactly 0, exactly 1, or greater than 1.
masks_feat_22 = [
    x_replaced_invalid[:, 22] == 0,
    x_replaced_invalid[:, 22] == 1,
    x_replaced_invalid[:, 22] > 1
]

# Sanity check: the three groups must account for every row.
verify_masks(masks_feat_22)

3

In [10]:
# Cross the feature-8 masks with the feature-22 masks; the feature-8 mask varies slowest,
# so the subsets are ordered (8-low, 22=0), (8-low, 22=1), ..., (8-high, 22>1).
data_masks = [mask_i & mask_j for mask_i, mask_j in it.product(masks_feat_8, masks_feat_22)]

# Report how many rows land in each combined subset.
for data_mask in data_masks:
    print(data_mask.sum())

verify_masks(data_masks)

192086
133623
92524
135285
119259
145461


6

In [11]:
# One boolean vector per data subset, indexed by feature. An entry is True when the number
# of invalid values of that feature inside the subset differs from the subset size, i.e. the
# feature has at least one valid value there. Despite the name, True marks a feature to KEEP.
invalid_features_mask = [mask_invalid[mask].sum(axis=0) != mask.sum() for mask in data_masks]

In [12]:
# Per subset, keep a feature only when
#   (a) invalid_features_mask allows it (the feature is not entirely invalid in the subset), and
#   (b) its values are not constant inside the subset (std != 0), which can happen
#       because of the conditioning on feature 22, for example.
xs_features_mask = [
    invalid_features_mask[idx] & (x_replaced_invalid[mask].std(axis=0) != 0)
    for idx, mask in enumerate(data_masks)
]

# Select each subset's rows first, then its retained feature columns.
xs_replaced_invalid = [
    x_replaced_invalid[mask][:, features_mask]
    for mask, features_mask in zip(data_masks, xs_features_mask)
]

[x.shape for x in xs_replaced_invalid]

[(192086, 18),
 (133623, 22),
 (92524, 30),
 (135285, 18),
 (119259, 22),
 (145461, 30)]

In [13]:
# Same row and feature selection as xs_replaced_invalid, but taken from x_brute,
# i.e. from the matrix in which invalid entries were not replaced.
xs_brute = [x_brute[data_mask][:, f_mask] for data_mask, f_mask in zip(data_masks, xs_features_mask)]

[x.shape for x in xs_brute]

[(192086, 18),
 (133623, 22),
 (92524, 30),
 (135285, 18),
 (119259, 22),
 (145461, 30)]

In [14]:
# For every subset, log-transform only the columns that contain no value <= 0
# (the logarithm is undefined for the other columns, so they are left out).
xs_log = [
    np.log(x[:, ~(x <= 0).any(axis=0)])
    for x in xs_replaced_invalid
]

[f.shape for f in xs_log]

[(192086, 10),
 (133623, 12),
 (92524, 16),
 (135285, 12),
 (119259, 12),
 (145461, 16)]

In [16]:
# Append each subset's log-transformed columns (xs_log) to the right of its original columns.
xs_mix = [np.concatenate((x1, x2), axis=1) for x1, x2 in zip(xs_replaced_invalid, xs_log)]

[f.shape for f in xs_mix]

[(192086, 28),
 (133623, 34),
 (92524, 46),
 (135285, 30),
 (119259, 34),
 (145461, 46)]

In [None]:
# Column indices to drop from each subset's xs_mix matrix (one list per subset, same order).
mix_to_mask = [
    [4, 7, 20, 21],
    [4, 11, 25],
    [9, 10, 15, 23, 38],
    [0, 1, 7, 8, 12, 15, 22, 25, 26, 28, 29],
    [9, 12, 23, 31],
    [2, 4, 24, 25, 27, 29, 37, 44],
]

# np.delete removes the listed columns directly. The previous version went through
# range_mask, which is only defined further down the notebook, so this cell raised a
# NameError on a fresh kernel. Unlike range_mask, np.delete raises IndexError for an
# out-of-range column index instead of silently ignoring it.
xs_mix_best_only = [np.delete(x, m, axis=1) for x, m in zip(xs_mix, mix_to_mask)]

[f.shape for f in xs_mix_best_only]

In [16]:
# Apply standardize to each subset's feature matrix separately.
xs_standardized = [standardize(x) for x in xs_mix]

[x.shape for x in xs_standardized]

[(192086, 28),
 (133623, 34),
 (92524, 46),
 (135285, 30),
 (119259, 34),
 (145461, 46)]

In [17]:
# One polynomial degree per data subset (currently the same value, 11, for all of them).
degrees =  [11] * len(data_masks)
# Expand every standardized matrix with build_poly; tqdm only reports progress.
poly_features = [build_poly(x, degree) for x, degree in tqdm(zip(xs_standardized, degrees), ncols=100)]

[f.shape for f in poly_features]

6it [00:29,  4.94s/it]


[(192086, 336),
 (133623, 408),
 (92524, 552),
 (135285, 360),
 (119259, 408),
 (145461, 552)]

In [18]:
# Matrices whose columns are multiplied pairwise.
combinations_over = xs_standardized

# For every subset, build one column per ordered pair (i, j) with i != j, in the order
# (0, 1), (0, 2), ..., (1, 0), (1, 2), ...
# NOTE: both (i, j) and (j, i) are emitted, so every product appears twice in the output.
combinations_features = []
for x in tqdm(combinations_over, ncols=100):
    pair_products = [
        x[:, [i]] * x[:, [j]]  # list-indexing keeps each column 2-D: shape (n_rows, 1)
        for i, j in it.permutations(range(x.shape[1]), 2)
    ]
    combinations_features.append(np.concatenate(pair_products, axis=1))


[f.shape for f in combinations_features]

100%|█████████████████████████████████████████████████████████████████| 6/6 [01:31<00:00, 15.26s/it]


[(192086, 756),
 (133623, 1122),
 (92524, 2070),
 (135285, 870),
 (119259, 1122),
 (145461, 2070)]

In [19]:
train_size, test_size = x_brute_train.shape[0], x_brute_test.shape[0]

# True for the leading train_size rows (x_brute holds the training rows first), False for
# the test rows. Built arithmetically so no large Python lists are materialised and the
# dtype is always bool, even if one of the two sets is empty.
train_mask = np.arange(train_size + test_size) < train_size

# Number of training rows that fall into each data subset.
xs_train_size = [(mask & train_mask).sum() for mask in data_masks]

xs_train_size

[58823, 40904, 28236, 41090, 36640, 44307]

# Parameter tuning

In [199]:
def to_csv(scores, params, masks, filename):
    """Append one CSV row per (score, params, mask) triple to `filename`.

    Args:
        scores: iterable of scores.
        params: iterable of parameter descriptions, aligned with `scores`.
        masks: iterable of mask identifiers, aligned with `scores`.
        filename: path of the CSV file. It is opened in append mode, so existing
            rows are preserved; a missing parent directory is created.
    """
    # Local imports: the notebook does not import these modules explicitly.
    import csv
    import os

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline="" is what the csv module requires of files handed to csv.writer;
    # without it, extra blank lines appear between rows on some platforms.
    with open(filename, "a", encoding="utf8", newline="") as output_file:
        writer = csv.writer(output_file, delimiter=',')
        writer.writerows(zip(scores, params, masks))

In [154]:
class Remember:
    """Fixed-size record of the highest scores seen, with their mask index and parameters."""

    def __init__(self, n):
        """Create a record with `n` slots, all of them initially empty."""
        self.remember_n = n
        self.reset()

    def reset(self):
        """Put every slot back to its placeholder: score 0, mask (-1,), params None."""
        slots = self.remember_n
        self.best_scores = [0] * slots
        self.best_masks = [(-1,)] * slots
        self.best_params = [None] * slots

    def add_score(self, score, mask_index, params):
        """Store `score` if it beats the lowest score currently remembered.

        The slot holding the lowest score (the first one, on ties) is overwritten.

        Returns:
            True when the score was stored, False when it was not high enough.
        """
        lowest = np.min(self.best_scores)
        if not lowest < score:
            return False

        slot = self.best_scores.index(lowest)
        self.best_scores[slot] = score
        self.best_masks[slot] = mask_index
        self.best_params[slot] = params
        return True

    def best_mask(self):
        """Return the mask index stored next to the highest remembered score."""
        highest = np.max(self.best_scores)
        return self.best_masks[self.best_scores.index(highest)]

In [155]:
def cross_validation_ridge(y_train, x_train, k_fold, lambda_, seed=None):
    """Estimate the validation accuracy of ridge regression by k-fold cross-validation.

    Args:
        y_train: labels of the training samples.
        x_train: feature matrix of the training samples.
        k_fold: number of folds, forwarded to build_k_fold_sets.
        lambda_: ridge regularisation strength, forwarded to ridge_regression.
        seed: forwarded to build_k_fold_sets. The default is None, which is the value
            the former default expression `np.random.seed()` evaluated to; writing
            None directly avoids reseeding NumPy's global generator as a side effect
            of defining this function.

    Returns:
        A tuple (mean, std) of the per-fold validation accuracy, both in percent.
    """
    fold_accuracies = []
    for x_tr, x_va, y_tr, y_va in build_k_fold_sets(y_train, x_train, k_fold, seed):
        w = ridge_regression(y_tr, x_tr, lambda_)

        # Only the validation accuracy is reported; the training-set prediction that
        # used to be computed here was never used and has been dropped.
        y_va_pred = predict_labels(w, x_va)
        fold_accuracies.append((y_va_pred == y_va).mean())

    fold_accuracies = np.array(fold_accuracies)
    return fold_accuracies.mean() * 100, fold_accuracies.std() * 100

In [156]:
# Number of cross-validation folds passed to cross_validation_ridge.
k_fold = 5
# How many times each cross-validation is repeated before its scores are averaged.
iters = 2

# Candidate ridge regularisation strengths explored during tuning.
lambdas = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]


def range_mask(length, seq):
    """Return a boolean array of size `length` that is True at the positions listed in `seq`.

    Values of `seq` outside `0 .. length - 1` (such as the -1 placeholder used by
    create_masks) match no position and are ignored.

    Args:
        length: size of the mask.
        seq: container of positions to flag.

    Returns:
        A 1-D boolean numpy array of size `length`.
    """
    # The explicit dtype keeps the result boolean when length == 0; otherwise the
    # empty array would default to float64 and `~mask` in the callers would fail.
    return np.array([i in seq for i in range(length)], dtype=bool)
def create_masks(length, size, exceptions=(-1,)):
    """Build the column-selection masks obtained by dropping `size` extra columns.

    One mask is produced per combination of `size` positions out of `length`, in the
    order given by itertools.combinations; each keeps (True) every column except that
    combination and the positions in `exceptions`. A final mask that drops only the
    `exceptions` is appended at the end.

    Args:
        length: number of columns.
        size: number of columns dropped by each combination.
        exceptions: tuple of positions dropped by every mask.

    Returns:
        A list of boolean arrays of size `length`.
    """
    masks = []
    for dropped in it.combinations(range(length), size):
        masks.append(~range_mask(length, dropped + exceptions))

    # Baseline: nothing dropped beyond the exceptions themselves.
    masks.append(~range_mask(length, list(exceptions)))
    return masks

def poly_mask_from_to(idx):
    """Return a (start, stop) pair of column positions for subset `idx`.

    `start` is twice the number of columns of `xs_standardized[idx]`; `stop` is the
    number of columns of `poly_features[idx]`.
    """
    start = 2 * xs_standardized[idx].shape[1]
    stop = poly_features[idx].shape[1]
    return (start, stop)

def combinations_features_from_to(idx):
    """Return (0, n) where n is the number of columns of `combinations_features[idx]`."""
    column_count = combinations_features[idx].shape[1]
    return (0, column_count)

In [157]:
# Training labels of each subset. The first train_size entries of every row mask address
# the training rows, because x_brute was built with the training rows first.
ys_train = [y_train[mask[:train_size]] for mask in data_masks]

In [182]:
#personal_lambdas = [1e-04, 5e-05, 1e-05, 0.1, 5e-05, 1e-04]
#personal_masks = [(4, 7), (25, 4), (10,), (22,29), (23,), (4, 44)]

In [209]:
def find_best_standardized(lambdas, xs_standardized, output_dir="tuning"):
    """Greedy backward feature elimination per data subset, scored by ridge cross-validation.

    For every subset, each pass tries every lambda together with the removal of one more
    feature (in addition to those already removed) and remembers the best-scoring
    attempts. If the best attempt of the pass removes a new feature, that feature is
    added to the subset's removal list and the remembered attempts are appended to a CSV
    file; otherwise the subset is finished. Passes repeat until all subsets are finished.

    Args:
        lambdas: candidate ridge regularisation strengths.
        xs_standardized: one feature matrix per data subset, aligned with `ys_train`.
        output_dir: directory receiving the CSV files. It is created if missing.

    Returns:
        None. Results are written to `<output_dir>/best_std_<subset>_<size>-<removed>.csv`.
    """
    import os  # local import: the notebook does not import os at the top

    mask_size = 1
    remember_size = 10
    rem = Remember(remember_size)

    # Create the output directory up front, so a missing folder does not abort the run
    # only after the first (long) pass has finished.
    os.makedirs(output_dir, exist_ok=True)

    # Pair labels and features once. Sizing the bookkeeping lists from these pairs
    # guarantees every tracked subset is visited, so the loop below always terminates.
    subsets = list(zip(ys_train, xs_standardized))
    personal_masks = [[-1] for _ in subsets]  # -1 is a placeholder matching no column
    masks_incomplete = [True] * len(subsets)

    while any(masks_incomplete):
        for x_idx, (y, x) in tqdm(enumerate(subsets), ncols=100):
            if not masks_incomplete[x_idx]:
                continue

            rem.reset()
            for lambda_ in lambdas:
                candidate_masks = create_masks(x.shape[1], mask_size, tuple(personal_masks[x_idx]))
                for m_idx, m in enumerate(candidate_masks):
                    # Average the accuracy over `iters` repeated cross-validations.
                    # The per-run standard deviation is not used anywhere.
                    scores = [
                        cross_validation_ridge(y, x[:, m], k_fold, lambda_)[0]
                        for _ in range(iters)
                    ]
                    rem.add_score(np.mean(scores), m_idx, [lambda_, m])

            # With mask_size == 1, m_idx is the index of the additionally removed feature;
            # the last candidate (m_idx == x.shape[1]) removes nothing new. So the search
            # for this subset stops when the best attempt is that baseline or a feature
            # that is already on the removal list.
            best_mask = rem.best_mask()
            if best_mask < x.shape[1] and best_mask not in personal_masks[x_idx]:  # only valid for mask_size = 1
                to_csv(
                    rem.best_scores,
                    rem.best_params,
                    rem.best_masks,
                    "{d}/best_std_{i}_{s}-{pm}.csv".format(
                        d=output_dir, i=x_idx + 1, s=mask_size, pm=personal_masks[x_idx]
                    ),
                )
                personal_masks[x_idx].append(best_mask)
            else:
                masks_incomplete[x_idx] = False
                             


In [210]:
# Launch the tuning run. This is long-running: see the per-pass timings in the output.
find_best_standardized(lambdas, xs_standardized)

6it [14:58, 149.81s/it]
6it [13:47, 137.95s/it]
6it [15:07, 151.17s/it]
6it [15:35, 155.89s/it]
6it [10:57, 109.58s/it]
6it [07:22, 73.76s/it]
6it [05:25, 54.19s/it]
6it [05:40, 56.73s/it]
6it [05:27, 54.66s/it]
6it [01:09, 11.60s/it]
6it [01:06, 11.10s/it]
6it [01:02, 10.44s/it]


In [None]:
def find_best_standardized(lambdas, xs_standardized, output_dir="tuning2"):
    """Greedy backward feature elimination per data subset, scored by ridge cross-validation.

    NOTE: this definition shadows the earlier `find_best_standardized` of the notebook;
    it differs from it only in the directory the results are written to.

    For every subset, each pass tries every lambda together with the removal of one more
    feature (in addition to those already removed) and remembers the best-scoring
    attempts. If the best attempt of the pass removes a new feature, that feature is
    added to the subset's removal list and the remembered attempts are appended to a CSV
    file; otherwise the subset is finished. Passes repeat until all subsets are finished.

    Args:
        lambdas: candidate ridge regularisation strengths.
        xs_standardized: one feature matrix per data subset, aligned with `ys_train`.
        output_dir: directory receiving the CSV files. It is created if missing.

    Returns:
        None. Results are written to `<output_dir>/best_std_<subset>_<size>-<removed>.csv`.
    """
    import os  # local import: the notebook does not import os at the top

    mask_size = 1
    remember_size = 10
    rem = Remember(remember_size)

    # Create the output directory up front, so a missing folder does not abort the run
    # only after the first (long) pass has finished.
    os.makedirs(output_dir, exist_ok=True)

    # Pair labels and features once. Sizing the bookkeeping lists from these pairs
    # guarantees every tracked subset is visited, so the loop below always terminates.
    subsets = list(zip(ys_train, xs_standardized))
    personal_masks = [[-1] for _ in subsets]  # -1 is a placeholder matching no column
    masks_incomplete = [True] * len(subsets)

    while any(masks_incomplete):
        for x_idx, (y, x) in tqdm(enumerate(subsets), ncols=100):
            if not masks_incomplete[x_idx]:
                continue

            rem.reset()
            for lambda_ in lambdas:
                candidate_masks = create_masks(x.shape[1], mask_size, tuple(personal_masks[x_idx]))
                for m_idx, m in enumerate(candidate_masks):
                    # Average the accuracy over `iters` repeated cross-validations.
                    # The per-run standard deviation is not used anywhere.
                    scores = [
                        cross_validation_ridge(y, x[:, m], k_fold, lambda_)[0]
                        for _ in range(iters)
                    ]
                    rem.add_score(np.mean(scores), m_idx, [lambda_, m])

            # With mask_size == 1, m_idx is the index of the additionally removed feature;
            # the last candidate (m_idx == x.shape[1]) removes nothing new. So the search
            # for this subset stops when the best attempt is that baseline or a feature
            # that is already on the removal list.
            best_mask = rem.best_mask()
            if best_mask < x.shape[1] and best_mask not in personal_masks[x_idx]:  # only valid for mask_size = 1
                to_csv(
                    rem.best_scores,
                    rem.best_params,
                    rem.best_masks,
                    "{d}/best_std_{i}_{s}-{pm}.csv".format(
                        d=output_dir, i=x_idx + 1, s=mask_size, pm=personal_masks[x_idx]
                    ),
                )
                personal_masks[x_idx].append(best_mask)
            else:
                masks_incomplete[x_idx] = False
                             


In [None]:
# Launch the tuning run again with the definition from the previous cell.
# This is long-running: see the per-pass timings in the output.
find_best_standardized(lambdas, xs_standardized)

6it [17:05, 170.99s/it]
6it [16:32, 165.42s/it]
6it [16:02, 160.48s/it]
6it [15:28, 154.72s/it]
6it [13:06, 131.05s/it]
6it [07:32, 75.38s/it] 
6it [06:41, 66.93s/it]
6it [06:29, 64.98s/it]
6it [06:33, 65.52s/it]
6it [01:25, 14.18s/it]
6it [01:18, 13.10s/it]


In [None]:
def grid_search_parameters(parameters):
    """Enumerate every combination of candidate values for a grid search.

    The original cell was an unfinished stub that did not parse (an `assert` statement
    inside a list comprehension and a `for` loop without a body). This version keeps the
    intended validation and returns the Cartesian product of the candidate lists.

    Args:
        parameters: list of lists; `parameters[k]` holds the candidate values of the
            k-th parameter.

    Returns:
        A list of tuples, one per combination, with the last parameter varying fastest.

    Raises:
        TypeError: if any element of `parameters` is not a list.
    """
    for position, candidates in enumerate(parameters):
        if not isinstance(candidates, list):
            raise TypeError(
                "parameters[{i}] must be a list, got {t}".format(
                    i=position, t=type(candidates).__name__
                )
            )
    return list(it.product(*parameters))

In [None]:
# Scratch cell; its value is not used by any other cell.
1 + 1