# TL;DR

This notebook adds a few small improvements to the original kernel: it trains a 5-fold cross-validated ensemble instead of a single model, and it runs a simple threshold search instead of using a fixed decision threshold.  

In [1]:
import pandas as pd
import pyarrow.parquet as pq
import os
import numpy as np
from keras.layers import *
from keras.models import Model
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras import optimizers
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from keras.callbacks import *
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
# Number of folds; used for the StratifiedKFold split and for the number of
# saved weight files that are averaged at prediction time.
N_SPLITS = 5
random_seed = 2019 # With the seed fixed, LB 6.08 (translated from Korean; presumably a leaderboard score of 0.608 -- TODO confirm)
# Seeds NumPy's global RNG; no other library is seeded in this notebook.
np.random.seed(random_seed)

In [3]:
def matthews_correlation(y_true, y_pred):
    '''Matthews correlation coefficient (MCC) expressed with Keras backend ops.

    Labels and predictions are clipped to [0, 1] and rounded, so probabilities
    are binarised at 0.5 before the confusion-matrix counts are taken.

    Returns the backend value (tp*tn - fp*fn) / (sqrt(...) + K.epsilon());
    the epsilon keeps the ratio finite when a confusion-matrix margin is empty.
    '''
    # Hard 0/1 indicators for the positive class and their complements.
    predicted_pos = K.round(K.clip(y_pred, 0, 1))
    actual_pos = K.round(K.clip(y_true, 0, 1))
    predicted_neg = 1 - predicted_pos
    actual_neg = 1 - actual_pos

    # Confusion-matrix counts.
    tp = K.sum(actual_pos * predicted_pos)
    tn = K.sum(actual_neg * predicted_neg)
    fp = K.sum(actual_neg * predicted_pos)
    fn = K.sum(actual_pos * predicted_neg)

    covariance = tp * tn - fp * fn
    scale = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return covariance / (scale + K.epsilon())

In [4]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    '''Attention pooling over the step axis of a 3-D input.

    For an input of shape (batch, step_dim, features) every step is scored by
    a dot product with the learned vector ``W`` (plus an optional learned
    per-step bias), passed through ``tanh`` and ``exp``, normalised over the
    step axis, and used to weight the steps. The output is the weighted sum of
    the steps, shape (batch, features).

    Args:
        step_dim: number of steps on axis 1 of the input; used to reshape the
            scores back to (batch, step_dim).
        W_regularizer, b_regularizer: regularizer identifiers for ``W`` / ``b``.
        W_constraint, b_constraint: constraint identifiers for ``W`` / ``b``.
        bias: whether to add the per-step bias ``b`` to the scores.
        **kwargs: forwarded to the base ``Layer`` constructor.
    '''

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        '''Create ``W`` (one weight per feature) and, if enabled, ``b`` (one per step).'''
        assert len(input_shape) == 3

        # shape/name are passed by keyword: the positional (shape, ...) form
        # only works through Keras' legacy-signature shim.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The step axis is reduced away by call(), so no mask is propagated.
        return None

    def call(self, x, mask=None):
        '''Return the attention-weighted sum of ``x`` over the step axis.'''
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten (batch, steps) so a single matrix product scores every step,
        # then fold the scores back to (batch, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over the step axis; epsilon guards against an all-zero row.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        '''Return the constructor arguments so the layer can be re-created from a config.'''
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [5]:
# Training metadata, keyed by (id_measurement, phase) so that the three phase
# rows of one measurement can be looked up together.
df_train = (
    pd.read_csv('../input/metadata_train.csv')
    .set_index(['id_measurement', 'phase'])
)
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,signal_id,target
id_measurement,phase,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
0,1,1,0
0,2,2,0
1,0,3,1
1,1,4,1


In [6]:
# Bounds handed to min_max_transf() by transform_ts() as max_data / min_data.
# They equal the signed 8-bit integer range -- presumably the dtype of the raw
# signal samples; TODO confirm against the parquet schema.
max_num = 127
min_num = -128

In [7]:
def min_max_transf(ts, min_data, max_data, range_needed=(-1,1)):
    '''Linearly rescale ``ts`` from [min_data, max_data] onto ``range_needed``.

    ts: scalar or array-like supporting arithmetic.
    min_data, max_data: bounds of the input scale.
    range_needed: (low, high) bounds of the output scale.

    Returns ``ts`` mapped so that min_data -> low and max_data -> high.
    '''
    low = range_needed[0]
    high = range_needed[1]

    # Step 1: map onto [0, 1]. A negative minimum is handled by adding its
    # magnitude rather than subtracting the value itself.
    if min_data < 0:
        shift = abs(min_data)
        unit = (ts + shift) / (max_data + shift)
    else:
        unit = (ts - min_data) / (max_data - min_data)

    # Step 2: stretch onto the requested range.
    if low < 0:
        return unit * (high + abs(low)) + low
    return unit * (high - low) + low

In [8]:
from scipy.signal import periodogram
from scipy.signal import find_peaks
from scipy.signal import peak_widths

def transform_ts(ts, n_dim=160, min_max=(-1,1), sample_size=800000):
    '''Compress one raw signal into per-bucket summary statistics.

    The signal is rescaled with min_max_transf() (input bounds min_num/max_num)
    and cut into consecutive buckets of ``int(sample_size / n_dim)`` samples.
    Each bucket yields 16 features, in this order:

        mean, mean + std, mean - std, max - min, std / mean,
        mean - 75th percentile, peaks per sample,
        widest peak / bucket size, narrowest peak / bucket size,
        percentiles 0, 1, 25, 50, 75, 99, 100

    ts: the raw signal; callers in this notebook pass a pandas Series column.
        mean()/std() are called on slices of the rescaled signal, so the std
        convention is that of the input container.
    n_dim: number of buckets (when it divides ``sample_size`` evenly).
    min_max: output range of the rescaling, forwarded to min_max_transf().
    sample_size: number of samples expected in ``ts``.

    Returns an array of shape (number of buckets, 16).
    '''
    # Forward min_max so the parameter actually takes effect; its default
    # matches min_max_transf's own default range.
    ts_std = min_max_transf(ts, min_data=min_num, max_data=max_num, range_needed=min_max)
    bucket_size = int(sample_size / n_dim)
    new_ts = []
    for i in range(0, sample_size, bucket_size):
        ts_range = ts_std[i:i + bucket_size]
        mean = ts_range.mean()
        std = ts_range.std()
        std_top = mean + std
        std_bot = mean - std
        percentil_calc = np.percentile(ts_range, [0, 1, 25, 50, 75, 99, 100])
        max_range = percentil_calc[-1] - percentil_calc[0]
        covar = std / mean
        # percentil_calc[4] is the 75th percentile.
        asymmetry = mean - percentil_calc[4]

        peaks = find_peaks(ts_range)[0]
        peak_num = peaks.shape[0] / bucket_size
        if peaks.shape[0]:
            # Peak widths relative to the bucket length.
            width = peak_widths(ts_range, peaks)[0] / bucket_size
            max_width = np.max(width)
            min_width = np.min(width)
        else:
            # A bucket without any local maximum (e.g. a flat segment) has no
            # widths to reduce; report zero instead of raising on an empty array.
            max_width = 0.0
            min_width = 0.0

        new_ts.append(np.concatenate([np.asarray([mean, std_top, std_bot, max_range, covar, asymmetry, peak_num
                                                 , max_width
                                                 , min_width
                                                 ]),percentil_calc]))
    return np.asarray(new_ts)

In [9]:
def prep_data(start, end):
    '''Build features and targets for the training signals in [start, end).

    start, end: range of signal ids; the parquet columns named str(start) ..
        str(end - 1) are read, and the measurement ids at positions
        int(start/3) .. int(end/3) are processed (three phases per measurement).

    Returns (X, y): X stacks, per measurement, the three phases' transform_ts()
    outputs concatenated along the feature axis; y holds the target read from
    the phase-0 row of each measurement.
    '''
    wanted_columns = [str(i) for i in range(start, end)]
    praq_train = pq.read_pandas('../input/train.parquet', columns=wanted_columns).to_pandas()

    measurement_ids = df_train.index.levels[0].unique()[int(start/3):int(end/3)]
    features = []
    targets = []
    for id_measurement in tqdm(measurement_ids):
        per_phase = []
        for phase in [0,1,2]:
            signal_id, target = df_train.loc[id_measurement].loc[phase]
            # One target per measurement: taken from the first phase.
            if phase == 0:
                targets.append(target)
            per_phase.append(transform_ts(praq_train[str(signal_id)]))
        features.append(np.concatenate(per_phase, axis=1))
    return np.asarray(features), np.asarray(targets)

In [10]:
def load_all():
    '''Build the full training set by processing the signals in two halves.

    Each half is read and transformed by prep_data() separately (presumably to
    limit how much of the parquet file is held in memory at once -- TODO
    confirm), then the halves are concatenated.

    Returns (X, y) as arrays covering every measurement in df_train.
    '''
    total_size = len(df_train)
    half = int(total_size / 2)
    X_parts = []
    y_parts = []
    for ini, end in [(0, half), (half, total_size)]:
        X_temp, y_temp = prep_data(ini, end)
        X_parts.append(X_temp)
        y_parts.append(y_temp)
    return np.concatenate(X_parts), np.concatenate(y_parts)

# The function returns its result instead of appending to module-level lists,
# so it stays callable after X and y have been bound to arrays.
X, y = load_all()

100%|██████████| 1452/1452 [12:50<00:00,  1.85it/s]
100%|██████████| 1452/1452 [12:46<00:00,  1.91it/s]


In [11]:
# Sanity check of the feature tensor built by load_all().
X.shape

(2904, 160, 48)

In [28]:
# Feature vector of the first bucket of the first measurement.
X[0][0]

array([ 1.43223529e-01,  1.50358821e-01,  1.36088238e-01,  4.70588235e-02,
        4.98192709e-02, -1.87450980e-03,  3.12800000e-01,  4.49166667e-03,
        1.33333333e-04,  1.21568627e-01,  1.29411765e-01,  1.37254902e-01,
        1.45098039e-01,  1.45098039e-01,  1.60784314e-01,  1.68627451e-01,
        8.50509804e-03,  1.54080907e-02,  1.60210540e-03,  5.49019608e-02,
        8.11629991e-01, -3.25960784e-03,  3.11400000e-01,  3.45000000e-03,
        1.33333333e-04, -1.96078431e-02, -3.92156863e-03,  3.92156863e-03,
        1.17647059e-02,  1.17647059e-02,  1.96078431e-02,  3.52941176e-02,
       -1.50202353e-01, -1.42045959e-01, -1.58358747e-01,  7.05882353e-02,
       -5.43027037e-02, -5.10431373e-03,  3.00600000e-01,  6.50000000e-03,
        1.33333333e-04, -1.84313725e-01, -1.68627451e-01, -1.52941176e-01,
       -1.52941176e-01, -1.45098039e-01, -1.29411765e-01, -1.13725490e-01])

In [12]:
def model_lstm(input_shape):
    '''Build and compile the bidirectional-LSTM + attention classifier.

    input_shape: shape of the training array; only input_shape[1] and
        input_shape[2] are used, as the Input layer's two dimensions.

    Returns a compiled Model with a single sigmoid output, trained with binary
    cross-entropy and reporting matthews_correlation as a metric.
    '''
    n_steps = input_shape[1]
    n_features = input_shape[2]

    inputs = Input(shape=(n_steps, n_features,))
    # Two stacked bidirectional recurrent layers, both returning full sequences
    # so the attention layer can pool over every step.
    hidden = Bidirectional(CuDNNLSTM(128, return_sequences=True))(inputs)
    hidden = Bidirectional(CuDNNLSTM(64, return_sequences=True))(hidden)
    pooled = Attention(n_steps)(hidden)
    dense = Dense(64, activation="relu")(pooled)
    outputs = Dense(1, activation="sigmoid")(dense)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[matthews_correlation])
    return model

In [None]:
# Stratified K-fold training. For every fold a fresh model is trained, the
# weights of its best epoch (by validation MCC) are saved to weights_<fold>.h5,
# and the out-of-fold predictions are collected for the threshold search.
splits = list(StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=random_seed).split(X, y))

preds_val = []
y_val = []
e_val = []
for idx, (train_idx, val_idx) in enumerate(splits):
    # Drop the previous fold's graph before building a new model.
    K.clear_session()
    print("Beginning fold {}".format(idx+1))
    train_X, train_y, val_X, val_y = X[train_idx], y[train_idx], X[val_idx], y[val_idx]
    model = model_lstm(train_X.shape)
    weights_path = 'weights_{}.h5'.format(idx)
    ckpt = ModelCheckpoint(weights_path, save_best_only=True, save_weights_only=True, verbose=1, monitor='val_matthews_correlation', mode='max')
    early_stop = EarlyStopping(monitor='val_matthews_correlation', mode='max', verbose=True, restore_best_weights=False, patience=30)
    # validation_data is documented as a tuple (x_val, y_val).
    model.fit(train_X, train_y, batch_size=128, epochs=200, validation_data=(val_X, val_y), callbacks=[ckpt, early_stop])
    # Early stopping does not restore weights here, so reload the best
    # checkpoint explicitly before predicting and scoring.
    model.load_weights(weights_path)
    preds_val.append(model.predict(val_X, batch_size=512))
    # evaluate() returns [loss, metric]; keep the metric.
    e_val.append(model.evaluate(val_X, val_y)[-1])
    y_val.append(val_y)

print(e_val)
print(sum(e_val) / len(e_val))
# Flatten the per-fold results into one out-of-fold vector each.
preds_val = np.concatenate(preds_val)[...,0]
y_val = np.concatenate(y_val)
preds_val.shape, y_val.shape

Beginning fold 1


In [None]:
# Pasted for reference: five per-fold scores and their mean, in the format
# printed at the end of the training cell (presumably from an earlier run).
[0.6582337490881431, 0.6328802896653929, 0.6546059262116477, 0.649783076911137, 0.4846157267175872]
0.6160237537187816

In [14]:
# Experiment log kept by the author: for each variant, the per-fold
# cross-validation (CV) scores, their mean, and the leaderboard (LB) score
# where one was recorded.

# Baseline -- CV:
[0.6836715137835631, 0.6373722286519824, 0.6253042532942176, 0.6382983437899885, 0.4866593619872784]
0.6142611403014059
# Baseline -- LB: 0.608

# With the peak-count feature ("peak num") -- CV:
[0.7324769439566177, 0.6466588514396944, 0.6504106250769505, 0.5966032801003291, 0.4520105374270472]
0.6156320476001278
# With the peak-count feature -- LB: 0.620

# With dropout -- CV:
[0.7371539020866054, 0.6015985065401114, 0.6219396574772164, 0.5904187784112733, 0.46720499293557527]
0.6036631674901564

# NOTE(review): unlabeled run -- the variant that produced these scores is not recorded.
[0.7056771373421056, 0.627739349864231, 0.6946000002337405, 0.6305348889581088, 0.4459432509438745]
0.620898925468412

0.620898925468412

In [15]:
def threshold_search(y_true, y_proba):
    '''Grid-search the decision threshold that maximises the Matthews correlation.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in increasing order; a
    probability counts as positive when it is strictly greater than the
    threshold. The score uses the same formula as matthews_correlation(), but
    is evaluated directly in NumPy so that no backend graph ops are created
    per threshold.

    y_true: array-like of 0/1 labels.
    y_proba: array-like of predicted probabilities, same shape as y_true.

    Returns {'threshold': ..., 'matthews_correlation': ...} for the first
    threshold reaching the best score. If no threshold scores above 0, both
    entries are 0.
    '''
    # Same stabiliser as matthews_correlation(); 1e-7 is the default value of
    # Keras' K.epsilon() (assumes the backend epsilon was not changed).
    eps = 1e-7

    y_pos = np.round(np.clip(np.asarray(y_true, dtype=np.float64), 0, 1))
    y_neg = 1 - y_pos
    proba = np.asarray(y_proba)

    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(100)]:
        pred_pos = (proba > threshold).astype(np.float64)
        pred_neg = 1 - pred_pos

        tp = np.sum(y_pos * pred_pos)
        tn = np.sum(y_neg * pred_neg)
        fp = np.sum(y_neg * pred_pos)
        fn = np.sum(y_pos * pred_neg)

        numerator = tp * tn - fp * fn
        denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        score = numerator / (denominator + eps)

        # Strict comparison: ties keep the lowest threshold found so far.
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'matthews_correlation': best_score}
    return search_result

In [16]:
# Pick the decision threshold on the pooled out-of-fold predictions; it is
# reused below to binarise the averaged test predictions.
best_threshold = threshold_search(y_val, preds_val)['threshold']

100%|██████████| 100/100 [00:21<00:00,  2.76it/s]


In [17]:
%%time
# Load the test-set metadata (one row per signal).
# Author's timing note: about 25 ms in a Kaggle kernel.
meta_test = pd.read_csv('../input/metadata_test.csv')

CPU times: user 4 ms, sys: 8 ms, total: 12 ms
Wall time: 12.2 ms


In [18]:
# Index the test metadata by signal_id so a signal's (id_measurement, phase)
# row can be looked up directly. The guard keeps the cell re-runnable: once
# signal_id has been moved into the index, set_index() would fail on a second run.
if meta_test.index.name != 'signal_id':
    meta_test = meta_test.set_index(['signal_id'])
meta_test.head()

Unnamed: 0_level_0,id_measurement,phase
signal_id,Unnamed: 1_level_1,Unnamed: 2_level_1
8712,2904,0
8713,2904,1
8714,2904,2
8715,2905,0
8716,2905,1


In [19]:
%%time
# About 10min in Kernel
first_sig = meta_test.index[0]
n_parts = 10
max_line = len(meta_test)
part_size = int(max_line / n_parts)
last_part = max_line % n_parts
print(first_sig, n_parts, max_line, part_size, last_part, n_parts * part_size + last_part)
start_end = [[x, x+part_size] for x in range(first_sig, max_line + first_sig, part_size)]
start_end = start_end[:-1] + [[start_end[-1][0], start_end[-1][0] + last_part]]
print(start_end)
X_test = []
for start, end in start_end:
    subset_test = pq.read_pandas('../input/test.parquet', columns=[str(i) for i in range(start, end)]).to_pandas()
    for i in tqdm(subset_test.columns):
        id_measurement, phase = meta_test.loc[int(i)]
        subset_test_col = subset_test[i]
        subset_trans = transform_ts(subset_test_col)
        X_test.append([i, id_measurement, phase, subset_trans])

8712 10 20337 2033 7 20337
[[8712, 10745], [10745, 12778], [12778, 14811], [14811, 16844], [16844, 18877], [18877, 20910], [20910, 22943], [22943, 24976], [24976, 27009], [27009, 29042], [29042, 29049]]


100%|██████████| 2033/2033 [05:45<00:00,  5.84it/s]
100%|██████████| 2033/2033 [05:52<00:00,  5.74it/s]
100%|██████████| 2033/2033 [05:57<00:00,  5.69it/s]
100%|██████████| 2033/2033 [05:55<00:00,  5.62it/s]
100%|██████████| 2033/2033 [05:56<00:00,  5.45it/s]
100%|██████████| 2033/2033 [06:00<00:00,  5.45it/s]
100%|██████████| 2033/2033 [06:00<00:00,  5.57it/s]
100%|██████████| 2033/2033 [06:02<00:00,  5.46it/s]
100%|██████████| 2033/2033 [05:59<00:00,  5.34it/s]
100%|██████████| 2033/2033 [06:07<00:00,  5.71it/s]
100%|██████████| 7/7 [00:01<00:00,  5.51it/s]

CPU times: user 1h 1min 20s, sys: 37.1 s, total: 1h 1min 57s
Wall time: 1h 1min 44s





In [20]:
# Regroup the per-signal feature matrices into measurements: every three
# consecutive entries of X_test are joined along the feature axis, mirroring
# how the training features are laid out.
signal_features = [record[3] for record in X_test]
X_test_input = np.asarray([
    np.concatenate(
        [signal_features[k], signal_features[k + 1], signal_features[k + 2]],
        axis=1,
    )
    for k in range(0, len(X_test), 3)
])
np.save("X_test.npy",X_test_input)
X_test_input.shape

(6779, 160, 48)

In [21]:
# Load the submission template; its `target` column is overwritten with the
# predictions further down.
submission = pd.read_csv('../input/sample_submission.csv')
print(len(submission))
submission.head()

20337


Unnamed: 0,signal_id,target
0,8712,0
1,8713,0
2,8714,0
3,8715,0
4,8716,0


In [22]:
# Predict the test set with each fold's saved weights.
# NOTE: `model` is the network object left over from the last training fold;
# only its weights are swapped here, so the training cell must have run first.
preds_test = []
for fold in range(N_SPLITS):
    model.load_weights('weights_{}.h5'.format(fold))
    pred = model.predict(X_test_input, batch_size=300, verbose=1)
    # One prediction is made per measurement; repeat each row three times in
    # place so there is one value per signal (three phases per measurement).
    preds_test.append(np.repeat(pred, 3, axis=0))



In [23]:
# Average the fold predictions and binarise them with the searched threshold.
# The builtin `int` replaces the `np.int` alias, which was removed from NumPy;
# the resulting dtype is the same.
preds_test = (np.squeeze(np.mean(preds_test, axis=0)) > best_threshold).astype(int)
preds_test.shape

(20337,)

In [24]:
# Fill the template with the binary predictions and write the submission file.
submission['target'] = preds_test
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,signal_id,target
0,8712,0
1,8713,0
2,8714,0
3,8715,0
4,8716,0
