In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import log_loss

import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import History 
from keras.utils import plot_model
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

In [2]:
# Load the input CSVs from the current working directory.

# Feature tables; both carry a `sig_id` column that is dropped during preprocessing.
data_train = pd.read_csv('train_features.csv')
data_test = pd.read_csv('test_features.csv')

# Target tables for the training rows (scored and non-scored).
data_train_target_s = pd.read_csv('train_targets_scored.csv')
data_train_target_ns = pd.read_csv('train_targets_nonscored.csv')

# Submission frame; its target columns are overwritten with averaged test predictions later on.
ss = pd.read_csv('sample_submission.csv')


#Preprocess
#Categorize data


def preprocess(df):
    """Return a copy of a feature table with its categorical columns integer-encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing the columns ``sig_id``, ``cp_type``,
        ``cp_time`` and ``cp_dose``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) without ``sig_id`` and with

        * ``cp_type``: ``'trt_cp'`` -> 0, ``'ctl_vehicle'`` -> 1
        * ``cp_dose``: ``'D1'`` -> 0, ``'D2'`` -> 1
        * ``cp_time``: 24 -> 0, 48 -> 1, 72 -> 2

        Values that are not listed in a mapping become NaN (``Series.map``).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    encodings = {
        'cp_type': {'trt_cp': 0, 'ctl_vehicle': 1},
        'cp_dose': {'D1': 0, 'D2': 1},
        'cp_time': {24: 0, 48: 1, 72: 2},
    }

    df = df.copy()
    for column, mapping in encodings.items():
        # Replace the whole column instead of using `df.loc[:, column] = ...`:
        # recent pandas versions write `.loc[:, col]` into the existing block,
        # so a string column would keep its object dtype and `df.values` would
        # no longer be a numeric array that can be fed to the network.
        df[column] = df[column].map(mapping)

    del df['sig_id']
    return df



# Encode both feature tables with the same preprocessing.
train, test = (preprocess(frame) for frame in (data_train, data_test))

# Remove the id column from the scored targets in place so only target columns remain.
data_train_target_s.drop(columns=['sig_id'], inplace=True)

# Positional indices of every feature column except the first one.
top_features = list(range(1, train.shape[1]))

In [3]:
# Preview the first five rows of the encoded training features.
train.head(n=5)

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,0,0,0,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,0,2,0,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,0,1,0,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,0,1,0,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,0,2,1,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [4]:
def create_model(num_columns):
    """Build and compile the feed-forward multi-label classifier.

    The network is three repetitions of
    BatchNormalization -> Dropout -> weight-normalised Dense, with 510 and 250
    ReLU units followed by a 206-unit sigmoid output layer.

    Parameters
    ----------
    num_columns : int
        Number of input features per sample.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with AdamW (learning rate 1e-3, weight decay 1e-5,
        gradient values clipped at 700) and binary cross-entropy loss.
    """
    layers = tf.keras.layers

    # (dropout rate applied before the dense layer, units, activation)
    dense_stack = [
        (0.2, 510, 'relu'),
        (0.5, 250, 'relu'),
        (0.5, 206, 'sigmoid'),  # final layer: one sigmoid output per target column
    ]

    model = tf.keras.Sequential()
    # Pass the shape explicitly as a tuple rather than a bare positional int.
    model.add(layers.Input(shape=(num_columns,)))
    for dropout_rate, units, activation in dense_stack:
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout_rate))
        model.add(tfa.layers.WeightNormalization(layers.Dense(units, activation=activation)))

    # `learning_rate` is the documented argument name; `lr` is a deprecated alias.
    optimizer = tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5, clipvalue=700)

    # NOTE(review): with a 206-wide output the logged 'accuracy' stays around
    # 0.01-0.14 in the training output, so it does not look like a per-label
    # accuracy -- judge the model by (val_)loss; confirm before relying on it.
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [5]:
def metric(y_true, y_pred):
    """Mean column-wise log loss between two target frames.

    Parameters
    ----------
    y_true : pandas.DataFrame
        Ground-truth 0/1 labels, one column per target.
    y_pred : pandas.DataFrame
        Predictions with the same column labels as ``y_true``; values are cast
        to float before scoring. Rows are matched by position, not by index.

    Returns
    -------
    float
        Unweighted mean over the columns of ``y_true`` of
        ``log_loss(..., labels=[0, 1])``.
    """
    # Iterate over the columns of the frame that was passed in rather than a
    # notebook-level global, so the function only depends on its arguments.
    column_losses = [
        log_loss(y_true.loc[:, target], y_pred.loc[:, target].astype(float), labels=[0, 1])
        for target in y_true.columns
    ]
    return np.mean(column_losses)

In [None]:
# Repeated multilabel-stratified cross-validation.
# For every (seed, fold) a fresh model is trained; its test predictions are
# summed into `ss` and its out-of-fold predictions into `res`. Both sums are
# turned into averages after the loops.
N_STARTS = 9   # number of CV repetitions, each with a different fold split
N_SPLITS = 7   # folds per repetition

target_cols = data_train_target_s.columns

# Float accumulators: predictions are floats, so start from float zeros instead
# of zeroing the (integer) label frame and relying on implicit upcasting.
res = pd.DataFrame(0.0, index=data_train_target_s.index, columns=target_cols)
ss[target_cols] = 0.0

# NOTE(review): 785 is hard-coded. Columns of `train` beyond position 784 are
# silently ignored, and fewer than 785 columns raises an IndexError -- confirm
# this is the intended feature subset.
top_feats = list(range(0, 785))
historys = dict()

# Convert the frames to arrays once instead of on every fold.
X_train = train.values[:, top_feats]
X_test = test.values[:, top_feats]
y_train = data_train_target_s.values

#print (ss.loc[:, data_train_target_s.columns])

tf.random.set_seed(43)
for seed in range(N_STARTS):
    splitter = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=seed*3, shuffle=True)
    for n, (tr, te) in enumerate(splitter.split(data_train_target_s, data_train_target_s)):
        print(f"======{data_train_target_s.values[tr].shape}========{data_train_target_s.values[te].shape}=====")
        print(f'Seed: {seed} => Fold: {n}')

        model = create_model(len(top_feats))

        # NOTE(review): ':' is not a valid file-name character on every
        # platform -- confirm the target environment before changing the name.
        checkpoint_path = f'repeat:{seed}_Fold:{n}.hdf5'

        # All three callbacks watch the validation loss of the held-out fold.
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5, patience=3, verbose=1, mode='min')
        cb_checkpt = ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 1, save_best_only = True,
                                     save_weights_only = True, mode = 'min')
        early = EarlyStopping(monitor="val_loss", mode="min", restore_best_weights=True, patience= 5, verbose = 1)

        history = model.fit(X_train[tr], y_train[tr],
                            validation_data=(X_train[te], y_train[te]),
                            epochs=100, batch_size=128,
                            callbacks=[reduce_lr_loss, cb_checkpt, early], verbose=2)

        # `history_{seed+1}` is kept for existing consumers (it ends up holding
        # the last fold of the seed); the per-fold key keeps every run instead
        # of letting later folds overwrite earlier ones.
        historys[f'history_{seed+1}'] = history
        historys[f'history_{seed+1}_fold_{n}'] = history
        print("Model History Saved.")

        # Predict with the best weights saved by the checkpoint callback.
        model.load_weights(checkpoint_path)
        test_predict = model.predict(X_test)
        val_predict = model.predict(X_train[te])

        ss.loc[:, target_cols] += test_predict
        res.loc[te, target_cols] += val_predict

        print(f'OOF Metric For SEED {seed} => FOLD {n} : {metric(data_train_target_s.loc[te, data_train_target_s.columns], pd.DataFrame(val_predict, columns=data_train_target_s.columns))}')
        print('+-' * 10)

# Every test row was predicted once per (seed, fold); every training row was
# predicted out-of-fold once per seed. Use the constants rather than the
# leaked loop variable `n` for the divisor.
ss.loc[:, target_cols] /= (N_SPLITS * N_STARTS)
res.loc[:, target_cols] /= N_STARTS

Seed: 0 => Fold: 0
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.13697, saving model to repeat:0_Fold:0.hdf5
160/160 - 2s - loss: 0.5117 - accuracy: 0.0164 - val_loss: 0.1370 - val_accuracy: 0.0068
Epoch 2/100

Epoch 00002: val_loss improved from 0.13697 to 0.03587, saving model to repeat:0_Fold:0.hdf5
160/160 - 2s - loss: 0.0718 - accuracy: 0.0232 - val_loss: 0.0359 - val_accuracy: 0.0259
Epoch 3/100

Epoch 00003: val_loss improved from 0.03587 to 0.02496, saving model to repeat:0_Fold:0.hdf5
160/160 - 2s - loss: 0.0313 - accuracy: 0.0378 - val_loss: 0.0250 - val_accuracy: 0.0617
Epoch 4/100

Epoch 00004: val_loss improved from 0.02496 to 0.02196, saving model to repeat:0_Fold:0.hdf5
160/160 - 2s - loss: 0.0243 - accuracy: 0.0442 - val_loss: 0.0220 - val_accuracy: 0.0453
Epoch 5/100

Epoch 00005: val_loss improved from 0.02196 to 0.02065, saving model to repeat:0_Fold:0.hdf5
160/160 - 2s - loss: 0.0222 - accuracy: 0.0453 - val_loss: 0.0206 - val_accuracy: 0.0359
Epoch 6/10

Epoch 43/100

Epoch 00043: val_loss did not improve from 0.01531
160/160 - 3s - loss: 0.0148 - accuracy: 0.1406 - val_loss: 0.0153 - val_accuracy: 0.1223
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.01531
Restoring model weights from the end of the best epoch.
160/160 - 3s - loss: 0.0148 - accuracy: 0.1386 - val_loss: 0.0153 - val_accuracy: 0.1235
Epoch 00044: early stopping
Model History Saved.
OOF Metric For SEED 0 => FOLD 0 : 0.015305935949132215
+-+-+-+-+-+-+-+-+-+-
Seed: 0 => Fold: 1
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.14407, saving model to repeat:0_Fold:1.hdf5
160/160 - 3s - loss: 0.5142 - accuracy: 0.0136 - val_loss: 0.1441 - val_accuracy: 0.0024
Epoch 2/100

Epoch 00002: val_loss improved from 0.14407 to 0.03515, saving model to repeat:0_Fold:1.hdf5
160/160 - 2s - loss: 0.0715 - accuracy: 0.0212 - val_loss: 0.0351 - val_accuracy: 0.0511
Epoch 3/100

Epoch 00003: val_loss improved from 0.03515 to 0.02458, saving model to repeat:0_Fold:1.hdf5


Epoch 40/100

Epoch 00040: ReduceLROnPlateau reducing learning rate to 1e-05.

Epoch 00040: val_loss improved from 0.01509 to 0.01509, saving model to repeat:0_Fold:1.hdf5
160/160 - 3s - loss: 0.0148 - accuracy: 0.1365 - val_loss: 0.0151 - val_accuracy: 0.1273
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.01509
160/160 - 2s - loss: 0.0149 - accuracy: 0.1371 - val_loss: 0.0151 - val_accuracy: 0.1273
Epoch 42/100

Epoch 00042: val_loss improved from 0.01509 to 0.01509, saving model to repeat:0_Fold:1.hdf5
160/160 - 3s - loss: 0.0148 - accuracy: 0.1369 - val_loss: 0.0151 - val_accuracy: 0.1273
Epoch 43/100

Epoch 00043: val_loss improved from 0.01509 to 0.01509, saving model to repeat:0_Fold:1.hdf5
160/160 - 3s - loss: 0.0148 - accuracy: 0.1376 - val_loss: 0.0151 - val_accuracy: 0.1276
Epoch 44/100

Epoch 00044: val_loss improved from 0.01509 to 0.01508, saving model to repeat:0_Fold:1.hdf5
160/160 - 3s - loss: 0.0148 - accuracy: 0.1359 - val_loss: 0.0151 - val_accuracy: 0.12

Epoch 33/100

Epoch 00033: val_loss improved from 0.01551 to 0.01549, saving model to repeat:0_Fold:2.hdf5
160/160 - 3s - loss: 0.0151 - accuracy: 0.1348 - val_loss: 0.0155 - val_accuracy: 0.1235
Epoch 34/100

Epoch 00034: val_loss improved from 0.01549 to 0.01549, saving model to repeat:0_Fold:2.hdf5
160/160 - 3s - loss: 0.0150 - accuracy: 0.1343 - val_loss: 0.0155 - val_accuracy: 0.1208
Epoch 35/100

Epoch 00035: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 00035: val_loss improved from 0.01549 to 0.01544, saving model to repeat:0_Fold:2.hdf5
160/160 - 3s - loss: 0.0149 - accuracy: 0.1350 - val_loss: 0.0154 - val_accuracy: 0.1287
Epoch 36/100

Epoch 00036: val_loss improved from 0.01544 to 0.01539, saving model to repeat:0_Fold:2.hdf5
160/160 - 3s - loss: 0.0147 - accuracy: 0.1408 - val_loss: 0.0154 - val_accuracy: 0.1308
Epoch 37/100

Epoch 00037: val_loss improved from 0.01539 to 0.01535, saving model to repeat:0_Fold:2.hdf5
160/160 - 3s - loss: 0.0146