In [1]:
# Reference:
# https://www.kaggle.com/demetrypascal/fork-of-2heads-looper-super-puper-plate/notebook

# Environment switch: True selects the Kaggle "../input/..." paths, False the
# local "/workspace/..." paths in the configuration cells further down.
kernel_mode = True

# Preparations

Let’s load the packages and provide some constants for our script:

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from tensorflow.keras import layers, regularizers, Sequential, Model, backend, callbacks, optimizers, metrics, losses
import tensorflow as tf
import sys
import os
import random
import json
sys.path.append('../input/iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import pickle
from pickle import dump, load
import glob

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Resolve the data folder and the folder holding the saved model artifacts
# for the current environment.
if kernel_mode:
    PATH = "../input/lish-moa"
    model_output_folder = "../input/improving-mark-s-2-heads-model"
else:
    PATH = "/workspace/Kaggle/MoA"
    model_output_folder = f"{PATH}/improving-mark-s-2-heads-model"
os.makedirs(model_output_folder, exist_ok=True)

# Every (seed, fold) pair below selects one saved `seed{s}_fold{k}` artifact
# set in the inference loop.
SEEDS = [23, 228, 1488, 1998, 2208, 2077, 404]
KFOLDS = 10

# Batch size passed to `predict`.
batch_size = 256

# Label-smoothing factor; also defines the probability clipping bounds.
label_smoothing_alpha = 0.0005

P_MIN = label_smoothing_alpha
P_MAX = 1 - P_MIN

In [4]:
# Load the raw training features.
train_features = pd.read_csv(f'{PATH}/train_features.csv')

# Index labels of the rows that are NOT control-vehicle samples; the same
# list is reused below to filter the target tables.
non_ctl_idx = train_features.index[
    (train_features['cp_type'] != 'ctl_vehicle').to_numpy()].to_list()

# Training frame without control-vehicle rows, re-indexed from 0.
tr = train_features.iloc[non_ctl_idx].reset_index(drop=True)

# Load the raw test features and keep a separate working copy.
test_features = pd.read_csv(f'{PATH}/test_features.csv')
te = test_features.copy()

In [5]:
# Scored targets: drop the id column, keep the non-control rows, and convert
# to a plain array.
train_targets_scored = pd.read_csv(f'{PATH}/train_targets_scored.csv')
Y = (train_targets_scored
     .drop(columns='sig_id')
     .iloc[non_ctl_idx]
     .reset_index(drop=True)
     .to_numpy())

# Non-scored targets, filtered the same way.
train_targets_nonscored = pd.read_csv(f'{PATH}/train_targets_nonscored.csv')
Y0 = (train_targets_nonscored
      .drop(columns='sig_id')
      .iloc[non_ctl_idx]
      .reset_index(drop=True)
      .to_numpy())

# Submission template: every column after the first is reset to 0.
sub = pd.read_csv(f'{PATH}/sample_submission.csv')
sub.iloc[:, 1:] = 0

# Features from t-test

Here I load the most important predictors, taken from a public kernel.

In [6]:
# Location of the predictor list exported by a public kernel.
if kernel_mode:
    json_file_path = '../input/t-test-pca-rfe-logistic-regression/main_predictors.json'
else:
    json_file_path = "/workspace/Kaggle/MoA/t-test-pca-rfe-logistic-regression/main_predictors.json"

# Only the 'start_predictors' entry of the JSON document is used.
with open(json_file_path, 'r') as j:
    predictors = json.load(j)['start_predictors']

In [7]:
# Inputs for the second model head: only the pre-selected predictor columns,
# extracted as plain arrays.
second_Xtrain = tr.loc[:, predictors].copy().to_numpy()
second_Xtest = te.loc[:, predictors].copy().to_numpy()

second_Xtrain.shape

(21948, 447)

# Keras model

I got the idea of **label smoothing** from this notebook: https://www.kaggle.com/kailex/moa-transfer-recipe-with-smoothing

In [8]:
def logloss(y_true, y_pred):
    """Mean binary cross-entropy with predictions clipped to [P_MIN, P_MAX].

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted probabilities.

    Returns:
        Scalar tensor: the negated mean of the per-element log-likelihood.
    """
    # Clip first so neither logarithm below is evaluated at 0.
    clipped = tf.clip_by_value(y_pred, P_MIN, P_MAX)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

# Inference

In [9]:
# All columns except the sample id, then the two measured feature families,
# picked out by their column-name prefix.
numeric_features = [col for col in train_features.columns if col != "sig_id"]
gene_experssion_features = [
    col for col in numeric_features if col.startswith("g-")
]
cell_viability_features = [
    col for col in numeric_features if col.startswith("c-")
]

len(gene_experssion_features), len(cell_viability_features)

(772, 100)

In [10]:
# One-hot encode the `cp_time` and `cp_dose` columns of the training frame.
# NOTE(review): this rebinds `tr`, so the cell should not be re-run without
# first re-running the cell that loads `tr`.
tr = pd.get_dummies(tr, columns=["cp_time", "cp_dose"])

In [11]:
# Remove the identifier and perturbation-type columns from both frames.
# `te` is rebuilt from the raw `test_features` here, replacing the earlier copy.
tr = tr.drop(['sig_id', 'cp_type'], axis=1)
te = test_features.drop(['sig_id', 'cp_type'], axis=1)

In [12]:
# One-hot encode the same two columns on the test frame.
# NOTE(review): train and test are encoded independently; this assumes both
# contain the same set of `cp_time`/`cp_dose` values so the columns line up.
te = pd.get_dummies(te, columns=["cp_time", "cp_dose"])

In [13]:
def preprocessor_1(test, seed, scaler=None, pca_gs=None, pca_cs=None):
    """Build the first-head feature matrix for inference.

    Appends the per-row means of the gene-expression and cell-viability
    columns, applies the fitted scaler to the widened table, and finally
    appends the PCA projections of both (scaled) feature families.

    Args:
        test: DataFrame containing at least the columns named in the
            module-level ``gene_experssion_features`` and
            ``cell_viability_features`` lists.
        seed: Unused; kept so existing call sites keep working.
        scaler: Fitted transformer exposing ``transform``; applied to all
            original columns plus the two mean columns.
        pca_gs: Fitted transformer applied to the scaled gene-expression
            columns.
        pca_cs: Fitted transformer applied to the scaled cell-viability
            columns.

    Returns:
        2-D NumPy array: scaled columns followed by the ``pca_gs`` and
        ``pca_cs`` outputs.

    Raises:
        ValueError: If any of ``scaler``, ``pca_gs`` or ``pca_cs`` is None.
    """
    if scaler is None or pca_gs is None or pca_cs is None:
        raise ValueError(
            "scaler, pca_gs and pca_cs must all be fitted transformers")

    test_columns = test.columns.tolist()

    # Row-wise means as NumPy arrays. The conversion must happen before the
    # new axis is added: multi-dimensional indexing on a Series
    # (`series[:, np.newaxis]`) is rejected by pandas >= 2.0.
    test_g_mean = test[gene_experssion_features].mean(axis=1).to_numpy()
    test_c_mean = test[cell_viability_features].mean(axis=1).to_numpy()

    test = np.concatenate(
        (test.to_numpy(), test_g_mean[:, np.newaxis],
         test_c_mean[:, np.newaxis]),
        axis=1)

    # Scale, then restore column labels so the feature groups can be selected
    # by name for the PCA step.
    test = pd.DataFrame(data=scaler.transform(test),
                        columns=test_columns + ["g_mean", "c_mean"])
    test_pca_gs = pca_gs.transform(test[gene_experssion_features].values)
    test_pca_cs = pca_cs.transform(test[cell_viability_features].values)

    # Append the PCA components to the scaled features.
    return np.concatenate((test, test_pca_gs, test_pca_cs), axis=1)


def preprocessor_2(test, scaler=None):
    """Scale the second-head inputs with an already fitted scaler.

    Args:
        test: Data accepted by ``scaler.transform``.
        scaler: Fitted transformer exposing ``transform``.

    Returns:
        Tuple ``(scaled_data, scaler)``; the scaler is handed back unchanged.
    """
    scaled = scaler.transform(test)
    return scaled, scaler


def save_pickle(obj, model_output_folder, name):
    """Pickle ``obj`` to ``{model_output_folder}/{name}.pkl``.

    Args:
        obj: Any picklable object.
        model_output_folder: Existing directory to write into.
        name: File name without the ``.pkl`` extension.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises; a bare `open(...)` would leave that to the garbage
    # collector.
    with open(f"{model_output_folder}/{name}.pkl", 'wb') as handle:
        dump(obj, handle, pickle.HIGHEST_PROTOCOL)


def load_pickle(model_output_folder, name):
    """Load the object stored in ``{model_output_folder}/{name}.pkl``.

    Args:
        model_output_folder: Directory containing the pickle file.
        name: File name without the ``.pkl`` extension.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by a trusted training run.
    # The context manager closes the handle as soon as the object is read.
    with open(f"{model_output_folder}/{name}.pkl", 'rb') as handle:
        return load(handle)


def mean_logloss(y_pred, y_true):
    """Mean binary log loss over all elements (NumPy implementation).

    Note the argument order: predictions first, targets second.

    Args:
        y_pred: Array of predicted probabilities.
        y_true: Array of binary targets, broadcastable against ``y_pred``.

    Returns:
        Scalar mean of the element-wise negative log-likelihood.
    """
    eps = 1e-15  # keeps both logarithms finite at exactly 0 or 1
    negative_term = (1 - y_true) * np.log(1 - y_pred + eps)
    positive_term = y_true * np.log(y_pred + eps)
    return np.mean(-(negative_term + positive_term))

In [14]:
# Sanity check: the train and test frames should have the same column count.
tr.shape, te.shape

((21948, 877), (3982, 877))

In [15]:
def _build_two_head_model(n_features, n_features_2, n_outputs, seed):
    """Recreate the two-input network so saved weights can be loaded into it.

    Args:
        n_features: Width of the first-head input.
        n_features_2: Width of the second-head input.
        n_outputs: Number of sigmoid outputs.
        seed: Seed passed to the `lecun_normal` initializers.

    Returns:
        A compiled `tf.keras.Model` taking `[input1, input2]`.
    """
    input1_ = layers.Input(shape=(n_features, ))
    input2_ = layers.Input(shape=(n_features_2, ))

    output1 = Sequential([
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.Dense(512, activation="elu"),
        layers.BatchNormalization(),
        layers.Dense(256, activation="elu")
    ])(input1_)

    answer1 = Sequential([
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(512, "relu")
    ])(layers.Concatenate()([output1, input2_]))

    answer2 = Sequential([
        layers.BatchNormalization(),
        layers.Dense(512, "elu"),
        layers.BatchNormalization(),
        layers.Dense(256, "relu")
    ])(layers.Concatenate()([output1, input2_, answer1]))

    answer3 = Sequential(
        [layers.BatchNormalization(),
         layers.Dense(256,
                      "elu")])(layers.Concatenate()([answer1, answer2]))

    answer3_ = Sequential([
        layers.BatchNormalization(),
        layers.Dense(256, "relu")
    ])(layers.Concatenate()([answer1, answer2, answer3]))

    # NOTE(review): the 206-unit hidden layer is hard-coded; presumably it has
    # to match the trained checkpoints, so it is left untouched.
    answer4 = Sequential([
        layers.BatchNormalization(),
        layers.Dense(
            256,
            kernel_initializer=tf.keras.initializers.lecun_normal(seed=seed),
            activation='selu',
            name='last_frozen'),
        layers.BatchNormalization(),
        layers.Dense(
            206,
            kernel_initializer=tf.keras.initializers.lecun_normal(seed=seed),
            activation='selu')
    ])(layers.Concatenate()([output1, answer2, answer3, answer3_]))

    # Output head for the scored targets.
    answer5 = Sequential(
        [layers.BatchNormalization(),
         layers.Dense(n_outputs, "sigmoid")])(answer4)

    model = tf.keras.Model([input1_, input2_], answer5)
    model.compile(optimizer=optimizers.Adam(learning_rate=0.001),
                  loss=losses.BinaryCrossentropy(
                      label_smoothing=label_smoothing_alpha),
                  metrics=[logloss])
    return model


# Accumulator for the ensemble average; its width follows the scored targets
# so it always agrees with the model's output layer.
n_targets = Y.shape[1]
y_pred = np.zeros((te.shape[0], n_targets))
n_models = KFOLDS * len(SEEDS)

for s in SEEDS:

    random.seed(s)
    np.random.seed(s)
    tf.random.set_seed(s)

    # Only the fold number is needed to locate the saved artifacts, so no
    # train/validation split is computed here.
    for k in range(KFOLDS):
        file_name = f"seed{s}_fold{k}"

        print(f"Inferencing on seed{s} fold{k} ......")

        # Drop the Keras state left by the previous fold's model so memory
        # does not grow across all seed/fold combinations.
        backend.clear_session()

        # First-head inputs: per-fold scaler and PCA objects.
        scaler_1 = load_pickle(model_output_folder, f"{file_name}_scaler_1")
        pca_gs = load_pickle(model_output_folder, f"{file_name}_pca_gs")
        pca_cs = load_pickle(model_output_folder, f"{file_name}_pca_cs")
        X_test_1 = preprocessor_1(te, s, scaler_1, pca_gs, pca_cs)

        # Second-head inputs: per-fold scaler only.
        scaler_2 = load_pickle(model_output_folder, f"{file_name}_scaler_2")
        X_test_2, scaler_2 = preprocessor_2(second_Xtest, scaler_2)

        n_features = X_test_1.shape[1]
        n_features_2 = X_test_2.shape[1]

        m_nn = _build_two_head_model(n_features, n_features_2, n_targets, s)

        # Load final model
        m_nn.load_weights(f'{model_output_folder}/{file_name}_final.h5')

        # Generate Submission Prediction #
        fold_submit_preds = m_nn.predict([X_test_1, X_test_2],
                                         batch_size=batch_size)
        # Each model contributes an equal share to the final average.
        y_pred += fold_submit_preds / n_models
        print(fold_submit_preds[:5, :])

        print('\n')

Inferencing on seed23 fold0 ......
[[0.00056139 0.00106284 0.00074208 ... 0.0017476  0.00190859 0.00198264]
 [0.00044496 0.00164593 0.0003279  ... 0.00054286 0.00064338 0.00151831]
 [0.00123659 0.00060897 0.00295358 ... 0.00730278 0.00293992 0.00370432]
 [0.00192729 0.00121335 0.00502865 ... 0.0034216  0.00514488 0.00325764]
 [0.00173937 0.00081508 0.00058252 ... 0.00229477 0.00017667 0.00084661]]


Inferencing on seed23 fold1 ......
[[0.00088986 0.00062576 0.00082023 ... 0.00344536 0.00368247 0.00334869]
 [0.00045452 0.00181322 0.003652   ... 0.00191084 0.01221672 0.00291771]
 [0.00087412 0.00036393 0.00089527 ... 0.00245913 0.00028768 0.00266225]
 [0.00072424 0.00023941 0.00276455 ... 0.00154375 0.00113876 0.00187372]
 [0.00134499 0.00050059 0.00070841 ... 0.00081508 0.00017123 0.00056079]]


Inferencing on seed23 fold2 ......
[[0.00027126 0.00026722 0.00301203 ... 0.00290554 0.01218982 0.00088345]
 [0.00071893 0.00058022 0.00054488 ... 0.00183034 0.00128198 0.00550056]
 [0.00060586 

[[6.6851306e-04 6.8165292e-04 4.5393365e-03 ... 2.8555575e-03
  4.1142199e-03 1.7234654e-03]
 [2.0515923e-04 4.1247279e-04 5.2264961e-04 ... 5.7782239e-04
  1.4366378e-03 6.2748429e-04]
 [9.3984802e-04 3.9600043e-04 2.6818949e-03 ... 1.3533479e-02
  3.2255857e-04 3.7169382e-03]
 [9.9689921e-04 5.7448249e-04 1.1608492e-03 ... 3.8452810e-03
  4.7078604e-04 3.4585553e-03]
 [5.6362241e-03 6.0083214e-03 8.7874208e-04 ... 1.2059963e-03
  8.6105363e-05 2.8704066e-04]]


Inferencing on seed1488 fold1 ......
[[0.00069824 0.00220826 0.00139089 ... 0.00191868 0.0035075  0.00073152]
 [0.00046635 0.00061942 0.00072947 ... 0.00021044 0.01261906 0.00305229]
 [0.00025992 0.00059526 0.00925999 ... 0.00485722 0.00738091 0.00548732]
 [0.00063595 0.00066471 0.00488478 ... 0.00213963 0.00222874 0.00407957]
 [0.00033608 0.00029401 0.00106942 ... 0.00107173 0.00017009 0.00033358]]


Inferencing on seed1488 fold2 ......
[[2.4655586e-04 7.9634361e-04 9.9731958e-04 ... 3.4513258e-04
  6.4332742e-04 2.9010139e-0

[[0.00053387 0.00098781 0.00116067 ... 0.00143225 0.00290785 0.00157618]
 [0.00019077 0.00045143 0.00053434 ... 0.00036127 0.0036521  0.00434996]
 [0.00026985 0.00058486 0.00089723 ... 0.00265015 0.00080359 0.00400145]
 [0.0003048  0.0003757  0.00094874 ... 0.00070652 0.00034327 0.00227137]
 [0.00063622 0.00125182 0.00199942 ... 0.00094559 0.00010069 0.0004768 ]]


Inferencing on seed2208 fold0 ......
[[1.8029681e-03 1.4691623e-03 1.8985847e-03 ... 1.5083007e-03
  5.5724983e-03 7.2043939e-03]
 [2.1021471e-05 4.9318969e-05 3.6021913e-04 ... 5.2477084e-05
  5.3349574e-04 1.6149454e-03]
 [7.6167600e-04 3.0411969e-04 4.8203641e-03 ... 1.3832078e-02
  1.1352553e-03 1.9509779e-02]
 [6.4497150e-04 5.5810797e-04 8.5345404e-03 ... 5.7523758e-03
  3.4632967e-03 3.3110438e-03]
 [1.7337831e-03 1.8423384e-03 4.2355014e-03 ... 8.7277626e-04
  2.7134601e-04 4.5802942e-04]]


Inferencing on seed2208 fold1 ......
[[5.3902116e-04 3.6198512e-04 3.1842699e-04 ... 2.6404322e-03
  1.6169139e-03 2.9013597e-0

[[0.00033101 0.00026384 0.00075961 ... 0.002214   0.01034177 0.00142648]
 [0.00036337 0.00047175 0.00035107 ... 0.00023396 0.00023206 0.00736987]
 [0.00046579 0.00015604 0.00178421 ... 0.00523895 0.00100294 0.00816562]
 [0.00037205 0.00061859 0.00141    ... 0.0008066  0.00390044 0.00737727]
 [0.00047332 0.00078267 0.00108487 ... 0.00159876 0.00029815 0.00061256]]


Inferencing on seed2077 fold8 ......
[[0.00036644 0.00040903 0.00126464 ... 0.00167799 0.02209555 0.00535716]
 [0.00027572 0.00362468 0.00175643 ... 0.00233876 0.00345927 0.00331694]
 [0.00041548 0.00030213 0.00634272 ... 0.00756555 0.0023473  0.00175078]
 [0.00145607 0.00078461 0.00142282 ... 0.00254556 0.00061528 0.00213723]
 [0.0013206  0.00077488 0.00045527 ... 0.00053492 0.00023741 0.00053314]]


Inferencing on seed2077 fold9 ......
[[4.6204918e-04 4.0566101e-04 9.0347091e-04 ... 1.5603713e-03
  7.3685456e-04 8.8365277e-04]
 [2.2692404e-05 2.5759250e-04 2.9870594e-04 ... 2.0287532e-04
  5.6445841e-03 9.3486282e-04]
 [2.

In [16]:
# Locate the out-of-fold predictions saved alongside the model artifacts.
# Sorting makes the choice deterministic if several files match the pattern.
oof_paths = sorted(glob.glob(f'{model_output_folder}/oof_*.npy'))
if not oof_paths:
    raise FileNotFoundError(
        f"No 'oof_*.npy' file found in {model_output_folder}")
oof_predictions = np.load(oof_paths[0])

# Score the out-of-fold predictions against the scored targets.
oof_loss = mean_logloss(oof_predictions, Y)
print(f"OOF Validation Loss: {oof_loss:.6f}")

OOF Validation Loss: 0.015663


# Submission

In [17]:
# Write the averaged ensemble predictions into every column after the first
# (the first column holds `sig_id`).
sub.iloc[:, 1:] = y_pred
# Alternative kept for reference: clip predictions to [P_MIN, P_MAX].
# sub.iloc[:, 1:] = np.clip(y_pred, P_MIN, P_MAX)

In [18]:
# Preview the submission frame after filling in the ensemble predictions.
sub

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000616,0.000670,0.001486,0.023087,0.026904,0.005376,0.002990,0.006178,0.000376,...,0.001063,0.000963,0.003477,0.001648,0.000689,0.000550,0.001247,0.002296,0.005124,0.002131
1,id_001897cda,0.000409,0.000961,0.001425,0.001379,0.001016,0.001391,0.003094,0.007824,0.023714,...,0.000577,0.001569,0.002564,0.000390,0.008280,0.000760,0.005528,0.000841,0.004731,0.002577
2,id_002429b5b,0.000481,0.000424,0.003661,0.029048,0.028302,0.003280,0.005138,0.005076,0.000527,...,0.001623,0.001026,0.005613,0.004521,0.003523,0.000483,0.002315,0.005995,0.001897,0.004294
3,id_00276f245,0.000644,0.000626,0.002182,0.018781,0.014798,0.004733,0.002363,0.004706,0.000461,...,0.000753,0.001479,0.003489,0.029556,0.010127,0.000629,0.001751,0.002487,0.001767,0.003340
4,id_0027f1083,0.001700,0.001558,0.001865,0.016128,0.022735,0.004241,0.005818,0.001415,0.000340,...,0.000788,0.000734,0.004384,0.002019,0.001006,0.000663,0.000917,0.001734,0.000353,0.000995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.000651,0.000989,0.001325,0.005086,0.005746,0.002187,0.001282,0.004126,0.000934,...,0.000650,0.004573,0.002626,0.199465,0.006235,0.000890,0.002479,0.001064,0.001935,0.001354
3978,id_ff925dd0d,0.005284,0.003284,0.000723,0.006674,0.029155,0.006869,0.005398,0.003113,0.000603,...,0.000469,0.000622,0.002790,0.002515,0.001234,0.001080,0.001667,0.001572,0.000381,0.001325
3979,id_ffb710450,0.001600,0.000940,0.000851,0.010382,0.035113,0.005727,0.004069,0.002913,0.000201,...,0.000421,0.000525,0.001822,0.002054,0.001125,0.000510,0.000613,0.001318,0.000410,0.001139
3980,id_ffbb869f2,0.001781,0.001276,0.001148,0.022132,0.033345,0.006861,0.005096,0.002018,0.000320,...,0.000399,0.000414,0.003382,0.001434,0.001137,0.000490,0.001052,0.001652,0.000440,0.002003


In [19]:
# Set all target predictions of control-vehicle rows to 0.
# `iloc` is positional, so the mask is passed as a plain boolean array (the
# documented mask type for `iloc`) rather than as a labelled Series.
# Assumes `sub` rows are in the same order as `test_features` — TODO confirm.
ctl_mask = (test_features['cp_type'] == 'ctl_vehicle').to_numpy()
sub.iloc[ctl_mask, 1:] = 0

# Save Submission
sub.to_csv('submission.csv', index=False)

In [20]:
# Preview the final submission frame (control-vehicle rows are now zeroed).
sub

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000616,0.000670,0.001486,0.023087,0.026904,0.005376,0.002990,0.006178,0.000376,...,0.001063,0.000963,0.003477,0.001648,0.000689,0.000550,0.001247,0.002296,0.005124,0.002131
1,id_001897cda,0.000409,0.000961,0.001425,0.001379,0.001016,0.001391,0.003094,0.007824,0.023714,...,0.000577,0.001569,0.002564,0.000390,0.008280,0.000760,0.005528,0.000841,0.004731,0.002577
2,id_002429b5b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_00276f245,0.000644,0.000626,0.002182,0.018781,0.014798,0.004733,0.002363,0.004706,0.000461,...,0.000753,0.001479,0.003489,0.029556,0.010127,0.000629,0.001751,0.002487,0.001767,0.003340
4,id_0027f1083,0.001700,0.001558,0.001865,0.016128,0.022735,0.004241,0.005818,0.001415,0.000340,...,0.000788,0.000734,0.004384,0.002019,0.001006,0.000663,0.000917,0.001734,0.000353,0.000995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.000651,0.000989,0.001325,0.005086,0.005746,0.002187,0.001282,0.004126,0.000934,...,0.000650,0.004573,0.002626,0.199465,0.006235,0.000890,0.002479,0.001064,0.001935,0.001354
3978,id_ff925dd0d,0.005284,0.003284,0.000723,0.006674,0.029155,0.006869,0.005398,0.003113,0.000603,...,0.000469,0.000622,0.002790,0.002515,0.001234,0.001080,0.001667,0.001572,0.000381,0.001325
3979,id_ffb710450,0.001600,0.000940,0.000851,0.010382,0.035113,0.005727,0.004069,0.002913,0.000201,...,0.000421,0.000525,0.001822,0.002054,0.001125,0.000510,0.000613,0.001318,0.000410,0.001139
3980,id_ffbb869f2,0.001781,0.001276,0.001148,0.022132,0.033345,0.006861,0.005096,0.002018,0.000320,...,0.000399,0.000414,0.003382,0.001434,0.001137,0.000490,0.001052,0.001652,0.000440,0.002003
