In [None]:
import pandas as pd
import pyarrow.parquet as pq
import os

import gc
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Show what the input directory contains so the file names used below can be checked.
input_entries = os.listdir('../input')
print(input_entries)

# memory reduce function
# https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    inspected; every other column is left untouched. The frame is modified
    in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    verbose : bool
        When true, print the resulting memory footprint and the reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are tried from narrowest to widest; the first that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: float16/float32 keep the range but not the precision of the
            # original values; this matches the referenced recipe.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero should the reported baseline ever be 0.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Read both metadata tables and shrink their numeric dtypes straight away.
df_train = reduce_mem_usage(pd.read_csv("../input/metadata_train.csv"))
print("train : ", df_train.shape)

df_test = reduce_mem_usage(pd.read_csv("../input/metadata_test.csv"))
print("test  : ", df_test.shape)

# Preview of the training metadata (rendered as the cell output).
df_train.head()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.exceptions import UndefinedMetricWarning

def MCC(tn, fp, fn, tp) :
    """Matthews correlation coefficient from the four confusion-matrix counts.

    Parameters
    ----------
    tn, fp, fn, tp : number
        True-negative, false-positive, false-negative and true-positive counts.

    Returns
    -------
    float
        The coefficient in [-1, 1], or NaN when it is undefined because at
        least one row or column of the confusion matrix is empty.
        ``eval_predict`` in this notebook filters on that NaN.
    """
    # Work in floating point: with numpy integer counts the product of the
    # four marginals silently wraps around once it exceeds the int64 range.
    tn, fp, fn, tp = float(tn), float(fp), float(fn), float(tp)
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    if denominator == 0 :
        # An empty marginal also makes the numerator 0, i.e. 0/0.
        return float('nan')
    return (tp * tn - fp * fn) / denominator

def eval_predict(predict, target) :
    """Scan decision thresholds and report the one with the highest MCC.

    Thresholds 0.000, 0.001, ... below 1.0 are tried; a sample is predicted
    positive when ``predict > threshold``. Thresholds whose MCC is NaN are
    skipped. The best threshold's MCC, precision, recall and F1 are printed.

    Parameters
    ----------
    predict : array-like of float
        Predicted scores, compared against each threshold.
    target : array-like
        Ground truth; entries equal to 1 are treated as positives.

    Returns
    -------
    (mcc, threshold)
        Best MCC and the threshold achieving it (the first one on ties), or
        ``(0.0, 5.0)`` when no threshold produced a defined MCC.
    """
    actual = (target == 1)
    scored = []    # (threshold, mcc) pairs whose MCC is defined

    for t in np.arange(0.0, 1.0, 0.001) :
        # Fix the label set so the matrix is always 2x2; otherwise it collapses
        # to 1x1 when only one class is present and the unpacking below fails.
        tn, fp, fn, tp = confusion_matrix(actual, predict > t, labels = [False, True]).ravel()
        score = MCC(tn, fp, fn, tp)
        if np.isnan(score) : continue
        scored.append((t, score))

    if len(scored) == 0 :
        print("no valid result!")
        # NOTE(review): 5.0 lies outside the scanned range; possibly 0.5 was meant - confirm.
        return 0.0, 5.0

    best_idx = np.argmax(np.array([score for _, score in scored]))
    best_threshold, best_mcc = scored[best_idx]

    # The remaining metrics are only reported for the winner, so compute them once.
    best_predict = predict > best_threshold
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category = UndefinedMetricWarning)
        best_precision = precision_score(actual, best_predict)
        best_recall = recall_score(actual, best_predict)
        best_f1 = f1_score(actual, best_predict)

    print( 'best result - threshold : ', "{0:.3f}".format(best_threshold)
          ,' MCC : ', "{0:.2f}".format(best_mcc)
          ,' precision : ', "{0:.2f}".format(best_precision)
          ,' recall : ', "{0:.2f}".format(best_recall)
          ,' f1_score : ', "{0:.2f}".format(best_f1)
         )
    return best_mcc, best_threshold

In [None]:
from multiprocess import Pool, current_process
from sklearn.preprocessing import MinMaxScaler

import numba
import pywt
from scipy.signal import periodogram
from scipy.signal import find_peaks
from scipy.signal import peak_widths
from tqdm import tqdm

@numba.jit
def get_features(x):
    """Summarise one window of samples relative to its median.

    Returns a four-element list:
    distance from the median down to the minimum, distance from the median
    up to the maximum, the standard deviation, and the absolute difference
    between those two distances.
    """
    highest = x[0]
    lowest = x[0]
    # Single pass tracking both extremes.
    for sample in x[1:]:
        if sample > highest:
            highest = sample
        elif sample < lowest:
            lowest = sample

    # The median is the reference level (np.mean is the disabled alternative).
    center = np.median(x)
    below = abs(lowest - center)
    above = highest - center
    return [
        below
        , above
        , np.std(x)
        , abs(below - above)
    ]

@numba.jit
def get_stats(x):
    """Return ``[mean, maximum, minimum, standard deviation]`` of ``x``."""
    running = x[0]
    highest = x[0]
    lowest = x[0]
    # One pass accumulates the sum and tracks both extremes.
    for sample in x[1:]:
        running += sample
        if sample > highest:
            highest = sample
        elif sample < lowest:
            lowest = sample
    # Turn the accumulated sum into the mean.
    running /= x.shape[0]
    return [
          running,
          highest,
          lowest,
          np.std(x)
    ]

def get_signal(idx, is_train = True, offset = 100, pool = 100) :
    """Load signals from parquet, denoise them and extract windowed features.

    Parameters
    ----------
    idx : sequence
        Signal ids; each is converted with ``str`` and used as a parquet column name.
    is_train : bool
        Read ``../input/train.parquet`` when true, ``../input/test.parquet`` otherwise.
    offset : int
        Step between the start positions of consecutive windows.
    pool : int
        Number of samples per window.

    Returns
    -------
    (local_stats, global_stats)
        ``local_stats`` is a float32 array reshaped to
        ``(len(idx), n_windows, n_features)`` holding ``get_features`` of every
        window. ``global_stats`` is a float32 array that is currently always
        empty because the whole-signal statistics are disabled.
    """
    if is_train :
        pq_dir = "../input/train.parquet"
    else :
        pq_dir = "../input/test.parquet"
    idx = np.array(idx)

    chunk = pq.read_pandas(pq_dir, columns=[str(i) for i in idx]).to_pandas()
    # Transpose so that iterating the array yields one signal at a time.
    chunk = chunk.values.T

    global_stats = []
    local_stats = []
    for data in tqdm(chunk, mininterval = 1) :

        # Denoise: single-level Haar DWT, soft-threshold both coefficient sets
        # at half their standard deviation, then reconstruct the signal.
        (ca, cd) = pywt.dwt(data,'haar')
        cat = pywt.threshold(ca, np.std(ca)/2, 'soft')
        cdt = pywt.threshold(cd, np.std(cd)/2, 'soft')
        data = pywt.idwt(cat, cdt, 'haar')

        # Disabled whole-signal statistics (kept for reference):
        # peaks = find_peaks(data)[0]
        # peak_num = peaks.shape[0] / data.shape[0]
        # width, height, _, _ = peak_widths(data, peaks)
        # width /= data.shape[0]
        # _, den = periodogram(data, 10e3)
        #
        # global_stats.append(
        #     [peak_num] + get_stats(width) + get_stats(height) + get_stats(den)
        # )

        for i in range(0, data.shape[0], offset) :
            local_stats.append(get_features(data[i : i + pool]))

    # Release the raw signals before building the feature arrays.
    del chunk
    gc.collect()

    local_stats = np.array(local_stats, np.float32)
    local_stats = local_stats.reshape(idx.shape[0], -1, len(local_stats[0]))

    global_stats = np.array(global_stats, np.float32)

    return local_stats, global_stats

# Split the training rows into chunks of signal ids; each chunk is one worker job.
chunk_size = 1100
idx = df_train.index.values

job_list = [
    df_train.loc[idx[s_idx : s_idx + chunk_size], 'signal_id']
    for s_idx in range(0, idx.shape[0], chunk_size)
]

# Extract features in two worker processes. The pool is always closed and
# joined, so no worker is left behind if a job raises.
pool = Pool(2)
try :
    train_x = pool.map(get_signal, job_list)
finally :
    pool.close()
    pool.join()

# Each job returns (local_stats, global_stats); stitch the chunks back together.
train_signal = [x[0] for x in train_x]
train_meta   = [x[1] for x in train_x]

train_signal = np.concatenate(train_signal)
train_meta = np.concatenate(train_meta)
train_y = df_train['target']

In [None]:
# Draw every feature channel of the first training signal on one wide figure.
plt.figure(figsize=(24, 7))
for feature_track in train_signal[0].T :
    plt.plot(feature_track)

In [None]:
# Sum of `target` per measurement id; later used to stratify folds by measurement.
m_idx = df_train.groupby('id_measurement')[['target']].sum()
m_idx.head()

In [18]:
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Flatten, Reshape, Dense, Dropout, Bidirectional, LSTM, CuDNNLSTM, Input, concatenate
from keras.layers import BatchNormalization, Activation, AveragePooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D, CuDNNGRU
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb
class Attention(Layer):
    """Attention pooling over the step axis of a 3-D input.

    A score is computed for every step from a learned weight vector (plus an
    optional per-step bias), squashed with tanh, exponentiated and normalised
    over the steps. The output is the score-weighted sum of the input over the
    step axis, i.e. one vector of ``features_dim`` values per sample.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the configuration; weights are created later in ``build``.

        step_dim is the number of steps used when reshaping the scores in
        ``call``. The regularizer/constraint arguments are resolved through
        ``regularizers.get`` / ``constraints.get``; ``bias`` toggles the
        per-step bias; remaining keyword arguments go to the base ``Layer``.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Placeholder; the real value is taken from the input shape in build().
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector and, if enabled, the per-step bias."""
        assert len(input_shape) == 3

        # One weight per input feature (last axis).
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias per step (axis 1), initialised to zero.
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (-1, features_dim), project every row onto W, then fold
        # the scores back into shape (-1, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the contribution of masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over the step axis; epsilon guards against a zero sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the weights broadcast across the features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is (first input dimension, features_dim)."""
        return input_shape[0],  self.features_dim

def get_callbacks(monitor = 'val_loss', verbose = 1) :
    """Build the training callbacks: early stopping plus best-model checkpointing.

    Both callbacks watch ``monitor`` in 'min' mode. Early stopping uses a
    patience of 30 and does not restore the best weights itself; the
    checkpoint writes only the best model to 'bestModel.md'.
    """
    early_stopping = EarlyStopping(
        monitor=monitor
        , mode='min'
        , verbose=verbose
        , restore_best_weights=False
        , patience=30
    )
    best_checkpoint = ModelCheckpoint(
        filepath='bestModel.md'
        , monitor=monitor
        , mode='min'
        , verbose=0
        , save_best_only = True
    )
    return [early_stopping, best_checkpoint]

def mcc_f(y_true, y_pred):
    """Matthews correlation coefficient written with Keras backend ops.

    Inputs are clipped to [0, 1] and rounded to hard labels before the four
    confusion-matrix sums are formed; epsilon is added to the denominator.
    """
    pred_positive = K.round(K.clip(y_pred, 0, 1))
    true_positive = K.round(K.clip(y_true, 0, 1))

    pred_negative = 1 - pred_positive
    true_negative = 1 - true_positive

    tp = K.sum(true_positive * pred_positive)
    tn = K.sum(true_negative * pred_negative)
    fp = K.sum(true_negative * pred_positive)
    fn = K.sum(true_positive * pred_negative)

    numerator = tp * tn - fp * fn
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + K.epsilon()
    return numerator / denominator

def get_model(input_shape
              , lr = 0.001
              , dropout = 0.0
              , l2_lambda = 0.0
              , verbose = False
              , attention = False
              , kernel_size = 100
              , filter_size = 1) :
    """Build and compile the Conv1D -> bidirectional GRU -> attention classifier.

    Architecture: four identical stages (Conv1D with 24 filters and kernel 20,
    batch norm, ReLU, dropout, average pooling with pool 5 / stride 4), a
    bidirectional CuDNNGRU with 50 units returning sequences, the ``Attention``
    layer, two dense stages (100 and 50 units, each followed by batch norm,
    ReLU and dropout) and a single sigmoid output unit.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed to ``Input``.
    lr : float
        Learning rate given to Adam.
    dropout : float
        Rate used by every Dropout layer.
    l2_lambda : float
        L2 factor for the conv, GRU and dense kernels.
    verbose : bool
        Print the model summary and the hyper-parameters.
    attention, kernel_size, filter_size
        Accepted for call compatibility; not read by the current architecture.

    Returns
    -------
    The model, compiled with binary cross-entropy loss and Adam.
    """
    inp = Input(shape=input_shape)
    #inp_meta = Input(shape=(train_meta.shape[1],))

    # Four identical convolutional stages.
    x = inp
    for _ in range(4) :
        x = Conv1D(kernel_size = (20)
                         , filters = 24
                         , kernel_regularizer = regularizers.l2(l2_lambda))(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = Dropout(dropout)(x)
        x = AveragePooling1D(pool_size = (5), strides=(4))(x)

    #x = Bidirectional(CuDNNLSTM(50, return_sequences=True, kernel_regularizer=regularizers.l2(l2_lambda)))(x)
    x = Bidirectional(CuDNNGRU(50, return_sequences=True, kernel_regularizer=regularizers.l2(l2_lambda)))(x)
    #x = Dropout(dropout)(x)
    # The attention layer needs the number of steps left after pooling.
    x = Attention(x.shape[-2])(x)

    #x = concatenate([inp_meta, x])

    # Two dense stages with the same norm / activation / dropout pattern.
    for units in (100, 50) :
        x = Dense(units, activation = None
                 , kernel_regularizer=regularizers.l2(l2_lambda)
                 )(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = Dropout(dropout)(x)

    x = Dense(1, activation = "sigmoid"
             , kernel_regularizer=regularizers.l2(l2_lambda)
             )(x)

    model = Model(inputs=inp, outputs=x)
    #model = Model(inputs=[inp, inp_meta], outputs=x)

    adam = Adam(lr)
    model.compile(loss='binary_crossentropy', optimizer=adam)

    if verbose :
        # summary() prints by itself and returns None, so it is not wrapped in print().
        model.summary()
        print("lr      : ", lr)
        print("dropout : ", dropout)
        print("l2      : ", l2_lambda)

    return model

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import matthews_corrcoef

# ---- Cross-validated training -------------------------------------------------
batch_size = 128
epoch = 500
k = 5

# With random_seed = None the fold assignment differs between runs.
do_shuffle = True
random_seed = None

#do_shuffle = False
#random_seed = 2019

np.random.seed(random_seed)

models = []
thresholds = []
mccs = []
validate_mccs = []
validate_loss = []

gc.collect()

print("batch size : ", batch_size)
print("epoch : ", epoch)
print("k : ", k)

#kfold = StratifiedKFold(k, shuffle = do_shuffle)
#for i, (train_idx, validate_idx) in enumerate(kfold.split(df_train, df_train['target'])) :
#kfold = KFold(k, shuffle = do_shuffle)
#for i, (train_idx, validate_idx) in enumerate(kfold.split(df_train)) :

# Signal ids belonging to the train / validation part of every fold.
trn_idx = []
val_idx = []

# Plain array of labels so folds can be selected by row position.
target_values = np.asarray(train_y)

# Folds are drawn over measurements (stratified on "has any positive signal")
# so that all signals of one measurement land on the same side of the split.
kfold = StratifiedKFold(k, shuffle = do_shuffle, random_state = random_seed)
for i, (train_groups, validate_groups) in enumerate(kfold.split(m_idx.index.values, m_idx.target.values > 0)) :

    train_mask = df_train['id_measurement'].isin(m_idx.index.values[train_groups]).values
    validate_mask = df_train['id_measurement'].isin(m_idx.index.values[validate_groups]).values

    trn_idx.append(df_train.loc[train_mask, 'signal_id'])
    val_idx.append(df_train.loc[validate_mask, 'signal_id'])

    # train_signal is ordered like the rows of df_train, so select by row
    # position instead of relying on signal_id happening to equal the position.
    train_pos = np.flatnonzero(train_mask)
    validate_pos = np.flatnonzero(validate_mask)

    model = get_model(train_signal[0].shape
                      , lr = 0.0002
                      , dropout = 0.2
                      , verbose = (i==0)
                      , attention = True
                      , filter_size = 5
                      #, l2_lambda = 0.000001
                     )

    print(i + 1, ' fold')

    model.fit(train_signal[train_pos], target_values[train_pos]
        #[train_signal[train_idx], train_meta[train_idx]], train_y[train_idx]
              , epochs = epoch
              , batch_size = batch_size
              , validation_data = (train_signal[validate_pos], target_values[validate_pos])
        #, validation_data = ([train_signal[validate_idx], train_meta[validate_idx]], train_y[validate_idx])
              , callbacks = get_callbacks()
             )

    # Roll back to the checkpoint with the best monitored value.
    model.load_weights('bestModel.md')

    validate_loss.append(model.evaluate(train_signal[validate_pos], target_values[validate_pos], batch_size = batch_size))
    print("loss :", validate_loss)

    predict = model.predict(train_signal[validate_pos], batch_size = batch_size)
    predict = predict.reshape(-1)
    mcc, threshold = eval_predict(predict, target_values[validate_pos])

    models.append(model)
    thresholds.append(threshold)
    mccs.append(mcc)

    print("mcc : " ,mccs)

    # Only drops the loop-local name; the model stays alive inside `models`.
    del model
    gc.collect()

    print('')

print("trained model performances :")
print("thresholds - ")
print(thresholds) 
print("mcc - ")
print(mccs)
print("avg mcc : ", sum(mccs) / len(mccs))
print("loss")
print(validate_loss)

In [None]:
# Print the cross-validation summary again: per-fold thresholds, MCCs and their mean.
print("trained model performances :")
for heading, fold_values in (("thresholds - ", thresholds), ("mcc - ", mccs)) :
    print(heading)
    print(fold_values)
print("avg mcc : ", sum(mccs) / len(mccs))

In [None]:
trained model performances :
thresholds - 
[0.523, 0.426, 0.34800000000000003, 0.23, 0.127]
mcc - 
[0.6940941810723269, 0.7297282540363873, 0.7016325710944952, 0.7267056412680362, 0.7114783084132461]
avg mcc :  0.7127277911768984
loss
[0.09835298062738713, 0.09363567445275432, 0.09560776766276333, 0.09584901007389243, 0.08889788707782482]


In [None]:
# Free the training arrays before the test signals are loaded.
del train_signal
del train_meta
del train_y
gc.collect()

chunk_size = 1100
idx = df_test.index.values
# Per test row: number of fold models that vote "positive".
predict = np.zeros(idx.shape[0])

for s_idx in range(0, idx.shape[0], chunk_size) :
    e_idx = s_idx + chunk_size
    print("predict ", s_idx, '~', e_idx)

    # Split the chunk in two halves, one per worker; is_train = False for both.
    mid_idx = int((s_idx + e_idx)/2)
    job_list = [
        [df_test.loc[idx[s_idx : mid_idx], 'signal_id'].values, False]
        , [df_test.loc[idx[mid_idx : e_idx], 'signal_id'].values, False]
               ]
    # The last chunk may leave a half without any rows.
    job_list = [job for job in job_list if job[0].shape[0] != 0]

    # Close and join the pool on every path so workers do not pile up per chunk.
    pool = Pool(2)
    try :
        x = pool.starmap(get_signal, job_list)
    finally :
        pool.close()
        pool.join()

    signal = [x_[0] for x_ in x]
    meta   = [x_[1] for x_ in x]

    signal = np.concatenate(signal)
    meta = np.concatenate(meta)

    # Every fold model votes with its own tuned threshold.
    for i, (model, threshold) in enumerate(zip(models, thresholds)) :
        #predict[s_idx : e_idx] += model.predict([signal, meta], batch_size = batch_size).reshape(-1) > threshold
        predict[s_idx : e_idx] += model.predict(signal, batch_size = batch_size).reshape(-1) > threshold
        #predict[s_idx : e_idx] += model.predict(signal, batch_size = batch_size).reshape(-1)

    del x
    del signal
    del meta
    gc.collect()

In [None]:
# `validate_threshold` is not assigned in any visible cell, so a fresh
# "Run All" would stop here with a NameError. Use it when it exists, otherwise
# fall back to a simple majority of the fold models.
# NOTE(review): 0.5 is an assumed default - confirm the intended vote fraction.
vote_fraction_threshold = globals().get('validate_threshold', 0.5)

# `predict` holds the number of positive votes; dividing by k gives the vote fraction.
submission = pd.DataFrame()
submission['signal_id'] = df_test['signal_id']
submission['target'] = np.array((predict / k) > vote_fraction_threshold, dtype = np.int8)
submission.to_csv('submission_blend.csv', index = False)

# Share of signals predicted positive, followed by the label histogram.
print(submission[submission['target']==1].shape[0] / submission.shape[0])
submission['target'].hist()
plt.show()
submission.head()

In [None]:
# Write one submission per vote count: a signal is positive when at least
# `vote` of the fold models flagged it.
for vote in range(1, k + 1) :
    print("result with vote threshold", vote)

    submission = pd.DataFrame({
        'signal_id' : df_test['signal_id']
        , 'target' : (predict >= vote).astype(np.int8)
    })
    submission.to_csv('submission' + str(vote) + '.csv', index = False)

    # Share of positive predictions and the label histogram for this setting.
    positive_rows = submission[submission['target']==1]
    print(positive_rows.shape[0] / submission.shape[0])
    submission['target'].hist()
    plt.show()