In [1]:
# Reference:
# https://www.kaggle.com/demetrypascal/fork-of-2heads-looper-super-puper-plate/notebook

# Selects which set of hard-coded paths the notebook uses: True picks the
# "../input/..." locations, False picks the "/workspace/Kaggle/MoA" ones.
kernel_mode = True

# Preparations

Let’s load the packages and provide some constants for our script:

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from tensorflow.keras import layers, regularizers, Sequential, Model, backend, callbacks, optimizers, metrics, losses
import tensorflow as tf
import sys
import os
import random
import json
sys.path.append('../input/iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import pickle
from pickle import dump, load
import glob

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Competition data and pre-trained model artefacts live in different places
# depending on where the notebook runs.
if kernel_mode:
    PATH = "../input/lish-moa"
    model_output_folder = "../input/2heads-looper-super-puper-markpeng"
else:
    PATH = "/workspace/Kaggle/MoA"
    model_output_folder = f"{PATH}/2heads-looper-super-puper"
os.makedirs(model_output_folder, exist_ok=True)

# One set of saved artefacts is loaded per (seed, fold) pair.
# A single seed, e.g. [23], was the commented-out alternative.
SEEDS = [23, 228, 1488, 1998, 2208, 2077, 404]
KFOLDS = 10

# Batch size handed to predict(); 128 was the commented-out alternative.
batch_size = 256

# Label-smoothing strength; it also fixes the clipping range [P_MIN, P_MAX]
# applied to predictions inside the logloss metric.
label_smoothing_alpha = 0.0005
P_MIN = label_smoothing_alpha
P_MAX = 1 - P_MIN

In [4]:
# Load the training features and locate the rows that are not ctl_vehicle.
train_features = pd.read_csv(f'{PATH}/train_features.csv')

is_treated = train_features['cp_type'] != 'ctl_vehicle'
non_ctl_idx = train_features.index[is_treated.to_numpy()].to_list()

# Keep only those rows and renumber them from 0.
tr = train_features.iloc[non_ctl_idx].reset_index(drop=True)

# The test features are kept in full; `te` is a working copy.
test_features = pd.read_csv(f'{PATH}/test_features.csv')
te = test_features.copy()

In [5]:
# Scored targets, restricted to the same rows that were kept in `tr`.
train_targets_scored = pd.read_csv(f'{PATH}/train_targets_scored.csv')
Y = train_targets_scored.drop(columns='sig_id').iloc[non_ctl_idx].values

# Non-scored targets, restricted the same way.
train_targets_nonscored = pd.read_csv(f'{PATH}/train_targets_nonscored.csv')
Y0 = train_targets_nonscored.drop(columns='sig_id').iloc[non_ctl_idx].values

# Submission template: every column after the first one starts at zero.
sub = pd.read_csv(f'{PATH}/sample_submission.csv')
sub.iloc[:, 1:] = 0

# Features from t.test

Here I load the list of the most important predictors, which was exported by a public kernel.

In [6]:
# Location of the predictor list exported by the public kernel.
if kernel_mode:
    json_file_path = '../input/t-test-pca-rfe-logistic-regression/main_predictors.json'
else:
    json_file_path = "/workspace/Kaggle/MoA/t-test-pca-rfe-logistic-regression/main_predictors.json"

# Only the 'start_predictors' entry of the JSON document is used.
with open(json_file_path, 'r') as predictors_file:
    predictors = json.load(predictors_file)['start_predictors']

In [7]:
# Inputs of the second head: only the pre-selected predictor columns, as arrays.
second_Xtrain = tr.loc[:, predictors].values
second_Xtest = te.loc[:, predictors].values

second_Xtrain.shape

(21948, 447)

# Keras model

I got the idea of **label smoothing** from this notebook: https://www.kaggle.com/kailex/moa-transfer-recipe-with-smoothing

In [8]:
def logloss(y_true, y_pred):
    """Mean binary log loss, with predictions clipped to [P_MIN, P_MAX] first."""
    clipped = tf.clip_by_value(y_pred, P_MIN, P_MAX)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

# Inference

In [9]:
# Column groups selected by name prefix ("g-" / "c-"); sig_id is left out.
numeric_features = [
    col for col in train_features.columns if col != "sig_id"
]
gene_experssion_features = [
    col for col in numeric_features if col.startswith("g-")
]
cell_viability_features = [
    col for col in numeric_features if col.startswith("c-")
]
len(gene_experssion_features), len(cell_viability_features)

(772, 100)

In [10]:
# Identifier / treatment-description columns that are removed from both frames.
meta_columns = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']

# Both frames are rebuilt from the untouched source frames rather than from
# `tr` itself, so running this cell a second time does not fail with a
# KeyError on columns that were already dropped.
tr = (train_features.iloc[non_ctl_idx]
      .reset_index(drop=True)
      .drop(columns=meta_columns))
te = test_features.drop(columns=meta_columns)

In [11]:
def preprocessor_1(test, seed, scaler=None, pca_gs=None, pca_cs=None,
                   gene_features=None, cell_features=None):
    """Apply already-fitted transformers to the first-head feature frame.

    Steps: append the per-row mean of the gene columns and of the cell
    columns, scale every column with ``scaler``, then append the outputs of
    ``pca_gs`` / ``pca_cs`` computed on the scaled gene / cell columns.

    Args:
        test: DataFrame containing at least the gene and cell columns.
        seed: Not used by this function; kept so existing calls stay valid.
        scaler: Object exposing ``transform``; required despite the default.
        pca_gs: Object exposing ``transform``, applied to the gene columns;
            required despite the default.
        pca_cs: Object exposing ``transform``, applied to the cell columns;
            required despite the default.
        gene_features: Gene column names. Defaults to the module-level
            ``gene_experssion_features`` list.
        cell_features: Cell column names. Defaults to the module-level
            ``cell_viability_features`` list.

    Returns:
        2-D array: the scaled columns (input columns, then ``g_mean`` and
        ``c_mean``) followed by the ``pca_gs`` and ``pca_cs`` outputs.

    Raises:
        ValueError: If ``scaler``, ``pca_gs`` or ``pca_cs`` is missing.
    """
    if scaler is None or pca_gs is None or pca_cs is None:
        raise ValueError(
            "preprocessor_1 needs fitted scaler, pca_gs and pca_cs objects")
    if gene_features is None:
        gene_features = gene_experssion_features
    if cell_features is None:
        cell_features = cell_viability_features

    # Row-wise means, converted to NumPy before adding an axis: indexing a
    # pandas Series with [:, np.newaxis] is not supported by pandas >= 2.0.
    g_mean = test[gene_features].mean(axis=1).to_numpy()
    c_mean = test[cell_features].mean(axis=1).to_numpy()

    column_names = test.columns.tolist() + ["g_mean", "c_mean"]
    stacked = np.concatenate(
        (test.to_numpy(), g_mean[:, np.newaxis], c_mean[:, np.newaxis]),
        axis=1)

    # Scale everything, keeping column names so the gene / cell groups can be
    # selected again for the PCA step.
    scaled = pd.DataFrame(data=scaler.transform(stacked),
                          columns=column_names)
    gene_pca = pca_gs.transform(scaled[gene_features].values)
    cell_pca = pca_cs.transform(scaled[cell_features].values)

    # Final layout: scaled columns, then gene PCA, then cell PCA.
    return np.concatenate((scaled, gene_pca, cell_pca), axis=1)


def preprocessor_2(test, scaler=None):
    """Transform the second-head features with the supplied scaler.

    Returns a tuple of the transformed data and the same scaler object that
    was passed in.
    """
    scaled = scaler.transform(test)
    return scaled, scaler


def save_pickle(obj, model_output_folder, name):
    """Pickle ``obj`` to ``<model_output_folder>/<name>.pkl``.

    Args:
        obj: Any picklable object.
        model_output_folder: Directory the file is written to.
        name: File name without the ``.pkl`` extension.
    """
    # The context manager closes (and thereby flushes) the file even when
    # pickling raises, instead of leaving the handle to the garbage collector.
    with open(f"{model_output_folder}/{name}.pkl", 'wb') as pickle_file:
        dump(obj, pickle_file, pickle.HIGHEST_PROTOCOL)


def load_pickle(model_output_folder, name):
    """Load the object pickled at ``<model_output_folder>/<name>.pkl``.

    Args:
        model_output_folder: Directory containing the file.
        name: File name without the ``.pkl`` extension.

    Returns:
        The unpickled object.
    """
    # NOTE(review): unpickling executes code embedded in the file; only point
    # this at artefacts from a trusted source.
    # The context manager closes the handle as soon as loading finishes.
    with open(f"{model_output_folder}/{name}.pkl", 'rb') as pickle_file:
        return load(pickle_file)


def mean_logloss(y_pred, y_true):
    """Return the binary log loss averaged over all elements.

    A 1e-15 offset is added inside each logarithm so that predictions of
    exactly 0 or 1 do not produce log(0).
    """
    eps = 1e-15
    negative_part = (1 - y_true) * np.log(1 - y_pred + eps)
    positive_part = y_true * np.log(y_pred + eps)
    log_likelihood = negative_part + positive_part
    return np.mean(-log_likelihood)

In [12]:
# Display the (rows, columns) of the train and test feature frames.
tr.shape, te.shape

((21948, 872), (3982, 872))

In [13]:
# Test-set predictions, averaged over every (seed, fold) model.
# The output width follows the scored target matrix instead of a literal 206.
n_models = KFOLDS * len(SEEDS)
y_pred = np.zeros((te.shape[0], Y.shape[1]))

for s in SEEDS:

    random.seed(s)
    np.random.seed(s)
    tf.random.set_seed(s)

    # Only the fold number is needed to locate the saved artefacts, so no
    # train/validation split is computed here.
    for k in range(KFOLDS):
        file_name = f"seed{s}_fold{k}"

        print(f"Inferencing on seed{s} fold{k} ......")

        # Reset Keras' global state so the models loaded in earlier folds do
        # not pile up over the 70 iterations.
        backend.clear_session()

        # Transformers saved for this (seed, fold), applied to the test set.
        scaler_1 = load_pickle(model_output_folder, f"{file_name}_scaler_1")
        pca_gs = load_pickle(model_output_folder, f"{file_name}_pca_gs")
        pca_cs = load_pickle(model_output_folder, f"{file_name}_pca_cs")
        X_test_1 = preprocessor_1(te, s, scaler_1, pca_gs, pca_cs)

        scaler_2 = load_pickle(model_output_folder, f"{file_name}_scaler_2")
        X_test_2, scaler_2 = preprocessor_2(second_Xtest, scaler_2)

        # Load final model. load_model() rebuilds the whole network from the
        # file, so the network is not constructed and compiled by hand first
        # (the previous code did that and then immediately overwrote it).
        m_nn = tf.keras.models.load_model(
            f'{model_output_folder}/{file_name}_final.h5',
            custom_objects={'logloss': logloss})

        # Generate Submission Prediction #
        fold_submit_preds = m_nn.predict([X_test_1, X_test_2],
                                         batch_size=batch_size)
        y_pred += fold_submit_preds / n_models
        print(fold_submit_preds[:5, :])

        print('\n')

Inferencing on seed23 fold0 ......
[[0.00062638 0.00139738 0.00256215 ... 0.00215133 0.00118474 0.00168681]
 [0.00035632 0.00282041 0.00062197 ... 0.00116035 0.00233351 0.00359165]
 [0.00111823 0.00116571 0.00234077 ... 0.00260893 0.00513931 0.00421351]
 [0.00030748 0.00038197 0.00049237 ... 0.00021168 0.0007948  0.00052682]
 [0.00435119 0.00289444 0.00288538 ... 0.00288126 0.0007065  0.00149353]]


Inferencing on seed23 fold1 ......
[[0.00043796 0.00087135 0.00177629 ... 0.0059837  0.00603775 0.00380701]
 [0.00099433 0.00191273 0.00115954 ... 0.00050997 0.01455158 0.00108468]
 [0.00072859 0.00028989 0.00163209 ... 0.00377872 0.00062831 0.00547929]
 [0.00150527 0.00081846 0.00254885 ... 0.00238689 0.00355374 0.00205896]
 [0.00068232 0.0011047  0.00187454 ... 0.00261347 0.00064067 0.00329006]]


Inferencing on seed23 fold2 ......
[[0.00078193 0.00042732 0.00251405 ... 0.00410442 0.00537843 0.00260135]
 [0.00041986 0.00105912 0.00221112 ... 0.00085451 0.00597084 0.00823114]
 [0.00225961 

[[0.00041637 0.00076366 0.00185131 ... 0.00074927 0.00080906 0.00062757]
 [0.00068896 0.00121188 0.00055628 ... 0.00127395 0.0001347  0.00816328]
 [0.0011636  0.00084701 0.00332346 ... 0.00686609 0.00115232 0.00409443]
 [0.00074673 0.00083443 0.00125869 ... 0.00120647 0.00032822 0.00361358]
 [0.00122167 0.00144062 0.00319215 ... 0.00267271 0.00139986 0.0015122 ]]


Inferencing on seed1488 fold1 ......
[[0.00053723 0.00158904 0.00302695 ... 0.00326372 0.00553973 0.00121735]
 [0.00055014 0.00126293 0.00050249 ... 0.00037031 0.00032316 0.00120134]
 [0.00172001 0.00094465 0.00868401 ... 0.0055094  0.00103914 0.00524251]
 [0.00197938 0.0008644  0.00155398 ... 0.00070333 0.00016051 0.00126356]
 [0.00192006 0.00174451 0.00206592 ... 0.00145532 0.00018488 0.00090535]]


Inferencing on seed1488 fold2 ......
[[0.00104297 0.00252677 0.00384524 ... 0.00262228 0.00304421 0.00041172]
 [0.00512925 0.00099094 0.00186214 ... 0.00074087 0.01512998 0.00092345]
 [0.00066793 0.00093589 0.00243816 ... 0.007

[[0.00204638 0.00113798 0.00063891 ... 0.00148389 0.00233765 0.00109445]
 [0.0019779  0.00084201 0.00062849 ... 0.00064207 0.0084047  0.00084796]
 [0.00120602 0.00119492 0.00185022 ... 0.00443862 0.00388472 0.00302889]
 [0.00219935 0.00347256 0.00602429 ... 0.00176903 0.00859255 0.00754905]
 [0.000933   0.00074738 0.00156016 ... 0.00177619 0.00116859 0.00111089]]


Inferencing on seed2208 fold1 ......
[[0.00090029 0.00042809 0.00075695 ... 0.00067307 0.00324794 0.00145146]
 [0.00031122 0.00011111 0.00053146 ... 0.00031583 0.00143036 0.00114015]
 [0.00158012 0.00060916 0.00282818 ... 0.00314002 0.01051789 0.00263162]
 [0.0010578  0.00051006 0.00168711 ... 0.00347083 0.00460128 0.00118598]
 [0.00219652 0.00173215 0.00210728 ... 0.00055012 0.00050442 0.00021246]]


Inferencing on seed2208 fold2 ......
[[0.00050057 0.00095515 0.0007046  ... 0.00070978 0.00438657 0.00101717]
 [0.00053551 0.00073265 0.00084407 ... 0.00039952 0.00197972 0.00155617]
 [0.00079254 0.00067668 0.00108984 ... 0.008

[[0.00557548 0.00195153 0.00248652 ... 0.00342366 0.00178767 0.0018387 ]
 [0.00081232 0.0019375  0.00098008 ... 0.00041475 0.00204593 0.00087728]
 [0.00061184 0.00046793 0.00204296 ... 0.00390025 0.00095449 0.00657001]
 [0.0014319  0.00282594 0.00390048 ... 0.00134808 0.00097792 0.00351321]
 [0.00083724 0.00269259 0.00250249 ... 0.00103952 0.000378   0.00402767]]


Inferencing on seed404 fold1 ......
[[0.00118746 0.00261698 0.00133018 ... 0.00185213 0.00260876 0.00411207]
 [0.00020632 0.0003831  0.00087567 ... 0.00031728 0.0105675  0.00103415]
 [0.00052676 0.0003387  0.00089782 ... 0.007139   0.00051989 0.00168848]
 [0.00036624 0.0004242  0.00162773 ... 0.00410049 0.00365159 0.00335143]
 [0.00422462 0.0033036  0.00275651 ... 0.00285165 0.00133041 0.00196757]]


Inferencing on seed404 fold2 ......
[[2.4624533e-04 6.4437889e-04 3.6888060e-03 ... 1.4494929e-03
  8.0141437e-04 9.5352222e-04]
 [2.6606789e-04 2.2537904e-03 6.5190170e-04 ... 7.0158038e-03
  7.8351488e-03 4.3963045e-03]
 [5.75

In [14]:
# Locate the out-of-fold predictions stored next to the saved models.
# Sorting makes the choice deterministic when several files match, and the
# explicit check replaces a bare IndexError when none does.
oof_files = sorted(glob.glob(f'{model_output_folder}/oof_*.npy'))
if not oof_files:
    raise FileNotFoundError(
        f"No oof_*.npy file found in {model_output_folder}")
oof_predictions = np.load(oof_files[0])

# Score the loaded predictions against the scored targets.
oof_loss = mean_logloss(oof_predictions, Y)
print(f"OOF Validation Loss: {oof_loss:.6f}")

OOF Validation Loss: 0.015886


# Submission

In [15]:
# Write the averaged predictions into every column after the first one.
sub.iloc[:, 1:] = y_pred
# Disabled alternative: clip the predictions to [P_MIN, P_MAX] before writing.
# sub.iloc[:, 1:] = np.clip(y_pred, P_MIN, P_MAX)

In [16]:
# Display the submission frame as filled with the averaged predictions.
sub

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001296,0.001144,0.001935,0.019420,0.020473,0.004704,0.003269,0.006187,0.000673,...,0.001393,0.001220,0.003299,0.001768,0.000743,0.001069,0.002160,0.002398,0.004096,0.002212
1,id_001897cda,0.000775,0.001519,0.001309,0.002160,0.001042,0.001716,0.004684,0.010775,0.022973,...,0.001309,0.002542,0.002806,0.001027,0.007790,0.001193,0.002594,0.001286,0.007403,0.003014
2,id_002429b5b,0.001044,0.000852,0.003124,0.022167,0.042731,0.003832,0.005513,0.004710,0.000989,...,0.001805,0.001317,0.004991,0.004447,0.003487,0.001143,0.002634,0.005157,0.002930,0.004254
3,id_00276f245,0.001252,0.001318,0.002660,0.016942,0.011872,0.004862,0.002630,0.004053,0.001070,...,0.001236,0.001678,0.003104,0.016909,0.011048,0.001360,0.002721,0.002255,0.002252,0.003410
4,id_0027f1083,0.001573,0.001310,0.002372,0.013725,0.017782,0.003375,0.006460,0.001758,0.000792,...,0.001204,0.000916,0.003667,0.002335,0.001016,0.001098,0.002159,0.002158,0.000675,0.001542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.001133,0.001604,0.001575,0.003932,0.004258,0.001658,0.001161,0.004470,0.001559,...,0.001198,0.005512,0.002485,0.188303,0.008189,0.001548,0.004673,0.001353,0.002685,0.001732
3978,id_ff925dd0d,0.004433,0.003341,0.001028,0.006062,0.024969,0.005391,0.004477,0.003098,0.000985,...,0.000838,0.001094,0.002212,0.002787,0.001303,0.001479,0.002179,0.001565,0.000748,0.001460
3979,id_ffb710450,0.001691,0.001066,0.001161,0.009116,0.039435,0.005469,0.003519,0.003358,0.000531,...,0.000731,0.000836,0.001756,0.002104,0.001223,0.000862,0.000872,0.001473,0.000705,0.001430
3980,id_ffbb869f2,0.001530,0.001476,0.001282,0.020892,0.029953,0.006217,0.004849,0.002568,0.000717,...,0.000683,0.000558,0.002887,0.001691,0.001306,0.000866,0.001247,0.001579,0.000872,0.002381


In [17]:
# Set ctl_vehicle to 0
# The mask is converted to a plain boolean array because .iloc is positional:
# a boolean array is its documented mask input, a label-indexed Series is not.
ctl_mask = (test_features['cp_type'] == 'ctl_vehicle').to_numpy()
# Rows of `sub` and `test_features` are matched by position, so the two
# frames must have the same number of rows.
if len(ctl_mask) != len(sub):
    raise ValueError(
        f"test_features has {len(ctl_mask)} rows but sub has {len(sub)}")
sub.iloc[ctl_mask, 1:] = 0

# Save Submission
sub.to_csv('submission_2heads-looper-super-puper.csv', index=False)
# sub.to_csv('submission.csv', index=False)

In [18]:
# Display the submission frame after the ctl_vehicle rows were set to 0.
sub

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001296,0.001144,0.001935,0.019420,0.020473,0.004704,0.003269,0.006187,0.000673,...,0.001393,0.001220,0.003299,0.001768,0.000743,0.001069,0.002160,0.002398,0.004096,0.002212
1,id_001897cda,0.000775,0.001519,0.001309,0.002160,0.001042,0.001716,0.004684,0.010775,0.022973,...,0.001309,0.002542,0.002806,0.001027,0.007790,0.001193,0.002594,0.001286,0.007403,0.003014
2,id_002429b5b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_00276f245,0.001252,0.001318,0.002660,0.016942,0.011872,0.004862,0.002630,0.004053,0.001070,...,0.001236,0.001678,0.003104,0.016909,0.011048,0.001360,0.002721,0.002255,0.002252,0.003410
4,id_0027f1083,0.001573,0.001310,0.002372,0.013725,0.017782,0.003375,0.006460,0.001758,0.000792,...,0.001204,0.000916,0.003667,0.002335,0.001016,0.001098,0.002159,0.002158,0.000675,0.001542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.001133,0.001604,0.001575,0.003932,0.004258,0.001658,0.001161,0.004470,0.001559,...,0.001198,0.005512,0.002485,0.188303,0.008189,0.001548,0.004673,0.001353,0.002685,0.001732
3978,id_ff925dd0d,0.004433,0.003341,0.001028,0.006062,0.024969,0.005391,0.004477,0.003098,0.000985,...,0.000838,0.001094,0.002212,0.002787,0.001303,0.001479,0.002179,0.001565,0.000748,0.001460
3979,id_ffb710450,0.001691,0.001066,0.001161,0.009116,0.039435,0.005469,0.003519,0.003358,0.000531,...,0.000731,0.000836,0.001756,0.002104,0.001223,0.000862,0.000872,0.001473,0.000705,0.001430
3980,id_ffbb869f2,0.001530,0.001476,0.001282,0.020892,0.029953,0.006217,0.004849,0.002568,0.000717,...,0.000683,0.000558,0.002887,0.001691,0.001306,0.000866,0.001247,0.001579,0.000872,0.002381
