- a notebook to save preprocessing model and train/save NN models
- all necessary ouputs are stored in MODEL_DIR = output/kaggle/working/model
    - put those into dataset, and load it from inference notebook

In [1]:
kernel_mode = False
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import sys
if kernel_mode:
    sys.path.append(
        '../input/iterative-stratification/iterative-stratification-master')
    sys.path.append('../input/umaplearn/umap')

%mkdir model
%mkdir interim

from scipy.sparse.csgraph import connected_components
from umap import UMAP
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
import time

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
print(torch.cuda.is_available())
import warnings
# warnings.filterwarnings('ignore')

mkdir: cannot create directory ‘model’: File exists
mkdir: cannot create directory ‘interim’: File exists
True


In [3]:
torch.__version__

'1.6.0+cu101'

In [4]:
dataset_folder = "../input/lish-moa" if kernel_mode else "/workspace/Kaggle/MoA"
model_output_folder = "../input/kibuna-nn-hs-1024-last-train-markpeng" if kernel_mode \
    else f"{dataset_folder}/kibuna-nn-hs-1024-last-train-markpeng"
BATCH_SIZE = 256
INFER_BATCH_SIZE = 512

In [5]:
NB = '25'

IS_TRAIN = True
MODEL_DIR = f"{model_output_folder}/model"
INT_DIR = f"{model_output_folder}/interim"

if IS_TRAIN:
    os.makedirs(model_output_folder, exist_ok=True)
    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(INT_DIR, exist_ok=True)

NSEEDS = 5  # 5
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 15
LEARNING_RATE = 5e-3
WEIGHT_DECAY = 1e-5
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

NFOLDS = 5  # 5

PMIN = 0.0005
PMAX = 0.9995
SMIN = 0.0
SMAX = 1.0

In [6]:
train_features = pd.read_csv(f'{dataset_folder}/train_features.csv')
train_targets_scored = pd.read_csv(f'{dataset_folder}/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(f'{dataset_folder}/train_targets_nonscored.csv')

test_features = pd.read_csv(f'{dataset_folder}/test_features.csv')
sample_submission = pd.read_csv(f'{dataset_folder}/sample_submission.csv')

In [7]:
train_targets_nonscored = train_targets_nonscored.loc[:, train_targets_nonscored.sum() != 0]
print(train_targets_nonscored.shape)

(23814, 332)


In [8]:
# for c in train_targets_scored.columns:
#     if c != "sig_id":
#         train_targets_scored[c] = np.maximum(PMIN, np.minimum(PMAX, train_targets_scored[c]))
for c in train_targets_nonscored.columns:
    if c != "sig_id":
        train_targets_nonscored[c] = np.maximum(
            PMIN, np.minimum(PMAX, train_targets_nonscored[c]))

In [9]:
print("(nsamples, nfeatures)")
print(train_features.shape)
print(train_targets_scored.shape)
print(train_targets_nonscored.shape)
print(test_features.shape)
print(sample_submission.shape)

(nsamples, nfeatures)
(23814, 876)
(23814, 207)
(23814, 332)
(3982, 876)
(3982, 207)


In [10]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [11]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(seed=1903)

In [12]:
# GENES
n_comp = 90
n_dim = 45

data = pd.concat(
    [pd.DataFrame(train_features[GENES]),
     pd.DataFrame(test_features[GENES])])

if IS_TRAIN:
    fa = FactorAnalysis(n_components=n_comp,
                        random_state=1903).fit(data[GENES])
    pd.to_pickle(fa, f'{MODEL_DIR}/{NB}_factor_analysis_g.pkl')
    umap = UMAP(n_components=n_dim, random_state=1903).fit(data[GENES])
    pd.to_pickle(umap, f'{MODEL_DIR}/{NB}_umap_g.pkl')
else:
    fa = pd.read_pickle(f'{MODEL_DIR}/{NB}_factor_analysis_g.pkl')
    umap = pd.read_pickle(f'{MODEL_DIR}/{NB}_umap_g.pkl')

data2 = (fa.transform(data[GENES]))
data3 = (umap.transform(data[GENES]))

train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]
train3 = data3[:train_features.shape[0]]
test3 = data3[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'fa_G-{i}' for i in range(n_comp)])
train3 = pd.DataFrame(train3, columns=[f'umap_G-{i}' for i in range(n_dim)])
test2 = pd.DataFrame(test2, columns=[f'fa_G-{i}' for i in range(n_comp)])
test3 = pd.DataFrame(test3, columns=[f'umap_G-{i}' for i in range(n_dim)])

train_features = pd.concat((train_features, train2, train3), axis=1)
test_features = pd.concat((test_features, test2, test3), axis=1)

#CELLS
n_comp = 50
n_dim = 25

data = pd.concat(
    [pd.DataFrame(train_features[CELLS]),
     pd.DataFrame(test_features[CELLS])])

if IS_TRAIN:
    fa = FactorAnalysis(n_components=n_comp,
                        random_state=1903).fit(data[CELLS])
    pd.to_pickle(fa, f'{MODEL_DIR}/{NB}_factor_analysis_c.pkl')
    umap = UMAP(n_components=n_dim, random_state=1903).fit(data[CELLS])
    pd.to_pickle(umap, f'{MODEL_DIR}/{NB}_umap_c.pkl')
else:
    fa = pd.read_pickle(f'{MODEL_DIR}/{NB}_factor_analysis_c.pkl')
    umap = pd.read_pickle(f'{MODEL_DIR}/{NB}_umap_c.pkl')

data2 = (fa.transform(data[CELLS]))
data3 = (umap.fit_transform(data[CELLS]))

train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]
train3 = data3[:train_features.shape[0]]
test3 = data3[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'fa_C-{i}' for i in range(n_comp)])
train3 = pd.DataFrame(train3, columns=[f'umap_C-{i}' for i in range(n_dim)])
test2 = pd.DataFrame(test2, columns=[f'fa_C-{i}' for i in range(n_comp)])
test3 = pd.DataFrame(test3, columns=[f'umap_C-{i}' for i in range(n_dim)])

train_features = pd.concat((train_features, train2, train3), axis=1)
test_features = pd.concat((test_features, test2, test3), axis=1)

# drop_cols = [f'c-{i}' for i in range(n_comp,len(CELLS))]

In [13]:
from sklearn.preprocessing import QuantileTransformer

for col in (GENES + CELLS):
    vec_len = len(train_features[col].values)
    vec_len_test = len(test_features[col].values)
    raw_vec = pd.concat([train_features, test_features
                         ])[col].values.reshape(vec_len + vec_len_test, 1)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100,
                                          random_state=123,
                                          output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer,
                     f'{MODEL_DIR}/{NB}_{col}_quantile_transformer.pkl')
    else:
        transformer = pd.read_pickle(
            f'{MODEL_DIR}/{NB}_{col}_quantile_transformer.pkl')

    train_features[col] = transformer.transform(
        train_features[col].values.reshape(vec_len, 1)).reshape(1, vec_len)[0]
    test_features[col] = transformer.transform(
        test_features[col].values.reshape(vec_len_test,
                                          1)).reshape(1, vec_len_test)[0]

In [14]:
# PCAS = [col for col in train_features.columns if col.startswith('pca_')]
# UMAPS = [col for col in train_features.columns if col.startswith('umap_')]

In [15]:
# from sklearn.preprocessing import PolynomialFeatures
# n_deg = 2

# data = pd.concat([pd.DataFrame(train_features[PCAS]), pd.DataFrame(test_features[PCAS])])
# data2 = (PolynomialFeatures(degree=n_deg, include_bias=False).fit_transform(data[PCAS]))

# # print(data2)
# # data4 = (UMAP(n_components=n_dim, n_neighbors=5, random_state=1903).fit_transform(data[GENES]))
# # data5 = (UMAP(n_components=n_dim, min_dist=0.01, random_state=1903).fit_transform(data[GENES]))

# train2 = data2[:train_features.shape[0]]
# test2 = data2[-test_features.shape[0]:]

# # print(train2.shape)
# train2 = pd.DataFrame(train2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])
# test2 = pd.DataFrame(test2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])

# # drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
# # train_features = pd.concat((train_features, train2, train3, train4, train5), axis=1)
# # test_features = pd.concat((test_features, test2, test3, test4, test5), axis=1)
# train_features = pd.concat((train_features, train2), axis=1)
# test_features = pd.concat((test_features, test2), axis=1)


# data = pd.concat([pd.DataFrame(train_features[UMAPS]), pd.DataFrame(test_features[UMAPS])])
# data2 = (PolynomialFeatures(degree=n_deg, include_bias=False).fit_transform(data[UMAPS]))

# # print(data2)
# # data4 = (UMAP(n_components=n_dim, n_neighbors=5, random_state=1903).fit_transform(data[GENES]))
# # data5 = (UMAP(n_components=n_dim, min_dist=0.01, random_state=1903).fit_transform(data[GENES]))

# train2 = data2[:train_features.shape[0]]
# test2 = data2[-test_features.shape[0]:]

# # print(train2.shape)
# train2 = pd.DataFrame(train2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])
# test2 = pd.DataFrame(test2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])

# # drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
# # train_features = pd.concat((train_features, train2, train3, train4, train5), axis=1)
# # test_features = pd.concat((test_features, test2, test3, test4, test5), axis=1)
# train_features = pd.concat((train_features, train2), axis=1)
# test_features = pd.concat((test_features, test2), axis=1)

In [16]:
print(train_features.shape)
print(test_features.shape)

(23814, 1086)
(3982, 1086)


In [17]:
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train_features.merge(train_targets_nonscored, on='sig_id')
train = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type'] != 'ctl_vehicle'].reset_index(
    drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_nonscored.columns]

In [18]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

In [19]:
print(target.shape)
print(train_features.shape)
print(test_features.shape)
print(train.shape)
print(test.shape)

(21948, 332)
(23814, 1086)
(3982, 1086)
(21948, 1416)
(3624, 1085)


In [20]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [21]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds



Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,vasopressin_receptor_antagonist,ve-cadherin_antagonist,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,xanthine_oxidase_inhibitor,xiap_inhibitor,kfold
0,id_000644bb2,24,D1,1.146806,0.902075,-0.418339,-0.961202,-0.254770,-1.021300,-1.369236,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
1,id_000779bfc,72,D1,0.128824,0.676862,0.274345,0.090495,1.208863,0.688965,0.316734,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
2,id_000a6266a,48,D1,0.790372,0.939951,1.428097,-0.121817,-0.002067,1.495091,0.238763,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
3,id_0015fd391,48,D1,-0.729866,-0.277163,-0.441200,0.766612,2.347817,-0.862761,-2.308829,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
4,id_001626bd3,72,D2,-0.444558,-0.481202,0.974729,0.977467,1.468304,-0.874772,-0.372682,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.247623,-1.231184,0.221572,-0.354096,-0.332073,0.570635,-0.150125,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21944,id_fffb1ceed,24,D2,0.217613,-0.027031,-0.237430,-0.787215,-0.677817,0.919474,0.742866,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
21945,id_fffb70c0c,24,D2,-1.914666,0.581880,-0.588706,1.303439,-1.009079,0.852202,-0.302814,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21946,id_fffcb9e7c,24,D1,0.826302,0.411235,0.433297,0.307575,1.075324,-0.024425,0.051483,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [22]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1416)
(21948, 1417)
(3624, 1085)
(21948, 332)
(3982, 207)


In [23]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return (self.features.shape[0])

    def __getitem__(self, idx):
        dct = {
            'x': torch.tensor(self.features[idx, :], dtype=torch.float),
            'y': torch.tensor(self.targets[idx, :], dtype=torch.float)
        }
        return dct


class TestDataset:
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return (self.features.shape[0])

    def __getitem__(self, idx):
        dct = {'x': torch.tensor(self.features[idx, :], dtype=torch.float)}
        return dct

In [24]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0

    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
        #         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        final_loss += loss.item()

    final_loss /= len(dataloader)

    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []

    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)

        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds


def inference_fn(model, dataloader, device):
    model.eval()
    preds = []

    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)

        preds.append(outputs.sigmoid().detach().cpu().numpy())

    preds = np.concatenate(preds)

    return preds

In [25]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.15)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features,
                                                     hidden_size))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.3)
        self.dense2 = nn.Linear(hidden_size, hidden_size)

        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.25)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.leaky_relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)

        return x

In [26]:
def process_data(data):

    data = pd.get_dummies(data, columns=['cp_time', 'cp_dose'])
    #     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
    #     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})

    # --------------------- Normalize ---------------------
    #     for col in GENES:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #     for col in CELLS:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #--------------------- Removing Skewness ---------------------
    #     for col in GENES + CELLS:
    #         if(abs(data[col].skew()) > 0.75):

    #             if(data[col].skew() < 0): # neg-skewness
    #                 data[col] = data[col].max() - data[col] + 1
    #                 data[col] = np.sqrt(data[col])

    #             else:
    #                 data[col] = np.sqrt(data[col])

    return data

In [27]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold', 'sig_id']]
len(feature_cols)

1087

In [28]:
num_features = len(feature_cols)
num_targets = len(target_cols)
hidden_size = 2048
# hidden_size=4096
# hidden_size=9192

In [29]:
def run_training(fold, seed):

    seed_everything(seed)

    train = process_data(folds)
    test_ = process_data(test)

    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index

    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=INFER_BATCH_SIZE,
                                              shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=5e-3,
                                 weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                              pct_start=0.2,
                                              div_factor=1e3,
                                              max_lr=1e-2,
                                              epochs=EPOCHS,
                                              steps_per_epoch=len(trainloader))

    loss_fn = nn.BCEWithLogitsLoss()

    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf

    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer, scheduler, loss_fn,
                              trainloader, DEVICE)
        print(
            f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}"
        )
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(
            f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}"
        )

        if valid_loss < best_loss:

            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(),
                       f"{MODEL_DIR}/{NB}-nonscored1-SEED{seed}-FOLD{fold}_.pth")

        elif (EARLY_STOP == True):

            early_step += 1
            if (early_step >= early_stopping_steps):
                break

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=INFER_BATCH_SIZE,
                                             shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.load_state_dict(
        torch.load(f"{MODEL_DIR}/{NB}-nonscored1-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)

    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions

In [30]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))

    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)

        predictions += pred_ / NFOLDS
        oof += oof_

    return oof, predictions

In [31]:
SEED = [940, 1513, 1269, 1392, 1119, 1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:

    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

print(oof.shape)
print(predictions.shape)

SEED: 940, FOLD: 0, EPOCH: 0, train_loss: 0.6458044633053351
SEED: 940, FOLD: 0, EPOCH: 0, valid_loss: 0.23897353808085123
SEED: 940, FOLD: 0, EPOCH: 1, train_loss: 0.03576180375302616
SEED: 940, FOLD: 0, EPOCH: 1, valid_loss: 0.009251730516552925
SEED: 940, FOLD: 0, EPOCH: 2, train_loss: 0.00939711653020071
SEED: 940, FOLD: 0, EPOCH: 2, valid_loss: 0.009194845540655984
SEED: 940, FOLD: 0, EPOCH: 3, train_loss: 0.00933286329002484
SEED: 940, FOLD: 0, EPOCH: 3, valid_loss: 0.009400543756783009
SEED: 940, FOLD: 0, EPOCH: 4, train_loss: 0.009148614589070929
SEED: 940, FOLD: 0, EPOCH: 4, valid_loss: 0.009084091625279851
SEED: 940, FOLD: 0, EPOCH: 5, train_loss: 0.008975854771130758
SEED: 940, FOLD: 0, EPOCH: 5, valid_loss: 0.009017287960482968
SEED: 940, FOLD: 0, EPOCH: 6, train_loss: 0.009263736951718296
SEED: 940, FOLD: 0, EPOCH: 6, valid_loss: 0.010180441559188895
SEED: 940, FOLD: 0, EPOCH: 7, train_loss: 0.009065507411740828
SEED: 940, FOLD: 0, EPOCH: 7, valid_loss: 0.00897248637759023

SEED: 940, FOLD: 4, EPOCH: 6, train_loss: 0.008952772339293058
SEED: 940, FOLD: 4, EPOCH: 6, valid_loss: 0.008799775917496946
SEED: 940, FOLD: 4, EPOCH: 7, train_loss: 0.008867254244514566
SEED: 940, FOLD: 4, EPOCH: 7, valid_loss: 0.008773985422319837
SEED: 940, FOLD: 4, EPOCH: 8, train_loss: 0.008817838045997896
SEED: 940, FOLD: 4, EPOCH: 8, valid_loss: 0.008596631698310375
SEED: 940, FOLD: 4, EPOCH: 9, train_loss: 0.008740798651200274
SEED: 940, FOLD: 4, EPOCH: 9, valid_loss: 0.008562602930598788
SEED: 940, FOLD: 4, EPOCH: 10, train_loss: 0.00861561261927304
SEED: 940, FOLD: 4, EPOCH: 10, valid_loss: 0.00852917641815212
SEED: 940, FOLD: 4, EPOCH: 11, train_loss: 0.00849586079383026
SEED: 940, FOLD: 4, EPOCH: 11, valid_loss: 0.008476049225363467
SEED: 940, FOLD: 4, EPOCH: 12, train_loss: 0.008363409399770308
SEED: 940, FOLD: 4, EPOCH: 12, valid_loss: 0.00843964951733748
SEED: 940, FOLD: 4, EPOCH: 13, train_loss: 0.008220767467350199
SEED: 940, FOLD: 4, EPOCH: 13, valid_loss: 0.0084201

SEED: 1513, FOLD: 3, EPOCH: 10, train_loss: 0.008510033444811901
SEED: 1513, FOLD: 3, EPOCH: 10, valid_loss: 0.008998410672777228
SEED: 1513, FOLD: 3, EPOCH: 11, train_loss: 0.008380469832353401
SEED: 1513, FOLD: 3, EPOCH: 11, valid_loss: 0.008958581317630079
SEED: 1513, FOLD: 3, EPOCH: 12, train_loss: 0.008260354055496662
SEED: 1513, FOLD: 3, EPOCH: 12, valid_loss: 0.008916081446740363
SEED: 1513, FOLD: 3, EPOCH: 13, train_loss: 0.008100920593015093
SEED: 1513, FOLD: 3, EPOCH: 13, valid_loss: 0.008896990161803033
SEED: 1513, FOLD: 3, EPOCH: 14, train_loss: 0.008023450394039568
SEED: 1513, FOLD: 3, EPOCH: 14, valid_loss: 0.008887458696133561
SEED: 1513, FOLD: 4, EPOCH: 0, train_loss: 0.6479944033899169
SEED: 1513, FOLD: 4, EPOCH: 0, valid_loss: 0.25152041845851475
SEED: 1513, FOLD: 4, EPOCH: 1, train_loss: 0.03594132327893074
SEED: 1513, FOLD: 4, EPOCH: 1, valid_loss: 0.009253945408595933
SEED: 1513, FOLD: 4, EPOCH: 2, train_loss: 0.009558766661886719
SEED: 1513, FOLD: 4, EPOCH: 2, val

SEED: 1269, FOLD: 2, EPOCH: 14, train_loss: 0.008013222751248142
SEED: 1269, FOLD: 2, EPOCH: 14, valid_loss: 0.00871849525719881
SEED: 1269, FOLD: 3, EPOCH: 0, train_loss: 0.6461189335239106
SEED: 1269, FOLD: 3, EPOCH: 0, valid_loss: 0.263225300444497
SEED: 1269, FOLD: 3, EPOCH: 1, train_loss: 0.03637226460420567
SEED: 1269, FOLD: 3, EPOCH: 1, valid_loss: 0.009531344597538313
SEED: 1269, FOLD: 3, EPOCH: 2, train_loss: 0.00934063554134058
SEED: 1269, FOLD: 3, EPOCH: 2, valid_loss: 0.009372986128760709
SEED: 1269, FOLD: 3, EPOCH: 3, train_loss: 0.009236673419566258
SEED: 1269, FOLD: 3, EPOCH: 3, valid_loss: 0.009518314877318012
SEED: 1269, FOLD: 3, EPOCH: 4, train_loss: 0.009156833254340767
SEED: 1269, FOLD: 3, EPOCH: 4, valid_loss: 0.009590136922068067
SEED: 1269, FOLD: 3, EPOCH: 5, train_loss: 0.009292908330056547
SEED: 1269, FOLD: 3, EPOCH: 5, valid_loss: 0.009361680907507738
SEED: 1269, FOLD: 3, EPOCH: 6, train_loss: 0.008883225367121075
SEED: 1269, FOLD: 3, EPOCH: 6, valid_loss: 0.0

SEED: 1392, FOLD: 2, EPOCH: 3, train_loss: 0.009239957532912924
SEED: 1392, FOLD: 2, EPOCH: 3, valid_loss: 0.009387786707116498
SEED: 1392, FOLD: 2, EPOCH: 4, train_loss: 0.00951617966959442
SEED: 1392, FOLD: 2, EPOCH: 4, valid_loss: 0.00919986381712887
SEED: 1392, FOLD: 2, EPOCH: 5, train_loss: 0.00897587062386067
SEED: 1392, FOLD: 2, EPOCH: 5, valid_loss: 0.00910305707818932
SEED: 1392, FOLD: 2, EPOCH: 6, train_loss: 0.008860149015874966
SEED: 1392, FOLD: 2, EPOCH: 6, valid_loss: 0.009136162905229462
SEED: 1392, FOLD: 2, EPOCH: 7, train_loss: 0.008779163472354412
SEED: 1392, FOLD: 2, EPOCH: 7, valid_loss: 0.008994622776905695
SEED: 1392, FOLD: 2, EPOCH: 8, train_loss: 0.00871269326603067
SEED: 1392, FOLD: 2, EPOCH: 8, valid_loss: 0.008904378861188889
SEED: 1392, FOLD: 2, EPOCH: 9, train_loss: 0.008648195276981678
SEED: 1392, FOLD: 2, EPOCH: 9, valid_loss: 0.008891833014786243
SEED: 1392, FOLD: 2, EPOCH: 10, train_loss: 0.00856969038537447
SEED: 1392, FOLD: 2, EPOCH: 10, valid_loss: 0

SEED: 1119, FOLD: 1, EPOCH: 7, train_loss: 0.00877867470346931
SEED: 1119, FOLD: 1, EPOCH: 7, valid_loss: 0.009343893267214298
SEED: 1119, FOLD: 1, EPOCH: 8, train_loss: 0.008747352590865416
SEED: 1119, FOLD: 1, EPOCH: 8, valid_loss: 0.008880674735539489
SEED: 1119, FOLD: 1, EPOCH: 9, train_loss: 0.008638572487710178
SEED: 1119, FOLD: 1, EPOCH: 9, valid_loss: 0.008848061060739888
SEED: 1119, FOLD: 1, EPOCH: 10, train_loss: 0.008548322066232779
SEED: 1119, FOLD: 1, EPOCH: 10, valid_loss: 0.008797477930784225
SEED: 1119, FOLD: 1, EPOCH: 11, train_loss: 0.008415683300868757
SEED: 1119, FOLD: 1, EPOCH: 11, valid_loss: 0.008750847230354944
SEED: 1119, FOLD: 1, EPOCH: 12, train_loss: 0.008268682772050734
SEED: 1119, FOLD: 1, EPOCH: 12, valid_loss: 0.008690587865809599
SEED: 1119, FOLD: 1, EPOCH: 13, train_loss: 0.008126122229125189
SEED: 1119, FOLD: 1, EPOCH: 13, valid_loss: 0.00867564096632931
SEED: 1119, FOLD: 1, EPOCH: 14, train_loss: 0.008021276626411987
SEED: 1119, FOLD: 1, EPOCH: 14, v

SEED: 1303, FOLD: 0, EPOCH: 11, train_loss: 0.008439547746725704
SEED: 1303, FOLD: 0, EPOCH: 11, valid_loss: 0.008684808181391822
SEED: 1303, FOLD: 0, EPOCH: 12, train_loss: 0.0082702806043992
SEED: 1303, FOLD: 0, EPOCH: 12, valid_loss: 0.008689545922809176
SEED: 1303, FOLD: 0, EPOCH: 13, train_loss: 0.008129751728172752
SEED: 1303, FOLD: 0, EPOCH: 13, valid_loss: 0.008669589646160603
SEED: 1303, FOLD: 0, EPOCH: 14, train_loss: 0.008025898847836947
SEED: 1303, FOLD: 0, EPOCH: 14, valid_loss: 0.00866048244966401
SEED: 1303, FOLD: 1, EPOCH: 0, train_loss: 0.6479445661323658
SEED: 1303, FOLD: 1, EPOCH: 0, valid_loss: 0.21327653527259827
SEED: 1303, FOLD: 1, EPOCH: 1, train_loss: 0.03589669615030289
SEED: 1303, FOLD: 1, EPOCH: 1, valid_loss: 0.009572753372291723
SEED: 1303, FOLD: 1, EPOCH: 2, train_loss: 0.009462039728743443
SEED: 1303, FOLD: 1, EPOCH: 2, valid_loss: 0.00924259444905652
SEED: 1303, FOLD: 1, EPOCH: 3, train_loss: 0.009233669809781122
SEED: 1303, FOLD: 1, EPOCH: 3, valid_los

elapsed time: 398.201021194458
(21948, 331)
(3624, 331)


In [32]:
train.to_pickle(f"{INT_DIR}/{NB}-train_nonscore_pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test_nonscore_pred.pkl")

In [33]:
len(target_cols)

331

In [34]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))
valid_results = train_targets_nonscored.drop(columns=target_cols).merge(
    train[['sig_id'] + target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_nonscored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]

print("CV log_loss: ", score)

CV log_loss:  0.004835599416713842


CV log_loss:  0.014761779358699672
CV log_loss:  0.014519859174255039
CV log_loss:  0.014525173864593479
CV log_loss:  0.014354930596928602 # 3 umap features
CV log_loss:  0.014353604854355429 # more umap features
CV log_loss:  0.01436484670778641 # more hidden nodes

In [35]:
EPOCHS = 25
# NFOLDS = 5

In [36]:
# sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
# sub.to_csv('submission.csv', index=False)

In [37]:
nonscored_target = [
    c for c in train[train_targets_nonscored.columns] if c != "sig_id"
]

In [38]:
nonscored_target

['abc_transporter_expression_enhancer',
 'abl_inhibitor',
 'ace_inhibitor',
 'acetylcholine_release_enhancer',
 'adenosine_kinase_inhibitor',
 'adenylyl_cyclase_inhibitor',
 'age_inhibitor',
 'alcohol_dehydrogenase_inhibitor',
 'aldehyde_dehydrogenase_activator',
 'aldose_reductase_inhibitor',
 'ampk_inhibitor',
 'androgen_biosynthesis_inhibitor',
 'angiotensin_receptor_agonist',
 'antacid',
 'anthelmintic',
 'antipruritic',
 'antirheumatic_drug',
 'antiseptic',
 'antispasmodic',
 'antithyroid_agent',
 'antitussive',
 'anxiolytic',
 'ap_inhibitor',
 'apoptosis_inhibitor',
 'arf_inhibitor',
 'aryl_hydrocarbon_receptor_agonist',
 'aryl_hydrocarbon_receptor_antagonist',
 'aspartic_protease_inhibitor',
 'atherogenesis_inhibitor',
 'atherosclerosis_formation_inhibitor',
 'atp-sensitive_potassium_channel_agonist',
 'atp-sensitive_potassium_channel_inhibitor',
 'atp_channel_blocker',
 'atp_citrase_lyase_inhibitor',
 'autophagy_inducer',
 'axl_kinase_inhibitor',
 'bacterial_atpase_inhibitor',


In [39]:
train = pd.read_pickle(f"{INT_DIR}/{NB}-train_nonscore_pred.pkl")
test = pd.read_pickle(f"{INT_DIR}/{NB}-test_nonscore_pred.pkl")

In [40]:
# use nonscored target in the given file as feature
# if comment out below, use predicted nonscored target
# train = train.drop(nonscored_target, axis=1)
# train = train.merge(train_targets_nonscored, on="sig_id")
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(train_targets_scored, on='sig_id')
# train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
# test = test[test['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_scored.columns]

In [41]:
# from sklearn.preprocessing import QuantileTransformer

for col in (nonscored_target):

    vec_len = len(train[col].values)
    vec_len_test = len(test[col].values)
    raw_vec = train[col].values.reshape(vec_len, 1)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100,
                                          random_state=0,
                                          output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer,
                     f"{MODEL_DIR}/{NB}_{col}_quantile_nonscored.pkl")
    else:
        transformer = pd.read_pickle(
            f"{MODEL_DIR}/{NB}_{col}_quantile_nonscored.pkl")

    train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test[col] = transformer.transform(test[col].values.reshape(
        vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [42]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [43]:
train

Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,24,D1,1.146806,0.902075,-0.418339,-0.961202,-0.254770,-1.021300,-1.369236,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,72,D1,0.128824,0.676862,0.274345,0.090495,1.208863,0.688965,0.316734,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,48,D1,0.790372,0.939951,1.428097,-0.121817,-0.002067,1.495091,0.238763,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,48,D1,-0.729866,-0.277163,-0.441200,0.766612,2.347817,-0.862761,-2.308829,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,72,D2,-0.444558,-0.481202,0.974729,0.977467,1.468304,-0.874772,-0.372682,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.247623,-1.231184,0.221572,-0.354096,-0.332073,0.570635,-0.150125,...,0,0,0,0,0,0,0,0,0,0
21944,id_fffb1ceed,24,D2,0.217613,-0.027031,-0.237430,-0.787215,-0.677817,0.919474,0.742866,...,0,0,0,0,0,0,0,0,0,0
21945,id_fffb70c0c,24,D2,-1.914666,0.581880,-0.588706,1.303439,-1.009079,0.852202,-0.302814,...,0,0,0,0,0,0,0,0,0,0
21946,id_fffcb9e7c,24,D1,0.826302,0.411235,0.433297,0.307575,1.075324,-0.024425,0.051483,...,0,0,0,0,0,0,0,0,0,0


In [44]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds



Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,24,D1,1.146806,0.902075,-0.418339,-0.961202,-0.254770,-1.021300,-1.369236,...,0,0,0,0,0,0,0,0,0,4
1,id_000779bfc,72,D1,0.128824,0.676862,0.274345,0.090495,1.208863,0.688965,0.316734,...,0,0,0,0,0,0,0,0,0,4
2,id_000a6266a,48,D1,0.790372,0.939951,1.428097,-0.121817,-0.002067,1.495091,0.238763,...,0,0,0,0,0,0,0,0,0,4
3,id_0015fd391,48,D1,-0.729866,-0.277163,-0.441200,0.766612,2.347817,-0.862761,-2.308829,...,0,0,0,0,0,0,0,0,0,4
4,id_001626bd3,72,D2,-0.444558,-0.481202,0.974729,0.977467,1.468304,-0.874772,-0.372682,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.247623,-1.231184,0.221572,-0.354096,-0.332073,0.570635,-0.150125,...,0,0,0,0,0,0,0,0,0,4
21944,id_fffb1ceed,24,D2,0.217613,-0.027031,-0.237430,-0.787215,-0.677817,0.919474,0.742866,...,0,0,0,0,0,0,0,0,0,0
21945,id_fffb70c0c,24,D2,-1.914666,0.581880,-0.588706,1.303439,-1.009079,0.852202,-0.302814,...,0,0,0,0,0,0,0,0,0,1
21946,id_fffcb9e7c,24,D1,0.826302,0.411235,0.433297,0.307575,1.075324,-0.024425,0.051483,...,0,0,0,0,0,0,0,0,0,1


In [45]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1622)
(21948, 1623)
(3624, 1416)
(21948, 207)
(3982, 207)


In [46]:
def process_data(data):

    data = pd.get_dummies(data, columns=['cp_time', 'cp_dose'])
    #     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
    #     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})

    # --------------------- Normalize ---------------------
    #     for col in GENES:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #     for col in CELLS:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #--------------------- Removing Skewness ---------------------
    #     for col in GENES + CELLS:
    #         if(abs(data[col].skew()) > 0.75):

    #             if(data[col].skew() < 0): # neg-skewness
    #                 data[col] = data[col].max() - data[col] + 1
    #                 data[col] = np.sqrt(data[col])

    #             else:
    #                 data[col] = np.sqrt(data[col])

    return data

In [47]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

1418

In [48]:
num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=2048
# hidden_size=4096
# hidden_size=9192

In [49]:
def run_training(fold, seed):

    seed_everything(seed)

    train = process_data(folds)
    test_ = process_data(test)

    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index

    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=5e-3,
                                 weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                              pct_start=0.2,
                                              div_factor=1e3,
                                              max_lr=1e-2,
                                              epochs=EPOCHS,
                                              steps_per_epoch=len(trainloader))

    loss_fn = nn.BCEWithLogitsLoss()
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0

    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf

    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer, scheduler, loss_fn,
                              trainloader, DEVICE)
        print(
            f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}"
        )
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(
            f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}"
        )

        if valid_loss < best_loss:

            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(),
                       f"{MODEL_DIR}/{NB}-scored1-SEED{seed}-FOLD{fold}_.pth")

        elif (EARLY_STOP == True):

            early_step += 1
            if (early_step >= early_stopping_steps):
                break

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.load_state_dict(
        torch.load(f"{MODEL_DIR}/{NB}-scored1-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)

    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions

In [50]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))

    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)

        predictions += pred_ / NFOLDS
        oof += oof_

    return oof, predictions

In [51]:
SEED = [940, 1513, 1269, 1392, 1119, 1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:

    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

SEED: 940, FOLD: 0, EPOCH: 0, train_loss: 0.71265501215838
SEED: 940 ,FOLD: 0, EPOCH: 0, valid_loss: 0.6155504816108279
SEED: 940, FOLD: 0, EPOCH: 1, train_loss: 0.20263613539113515
SEED: 940 ,FOLD: 0, EPOCH: 1, valid_loss: 0.022887404180235334
SEED: 940, FOLD: 0, EPOCH: 2, train_loss: 0.02104678291125574
SEED: 940 ,FOLD: 0, EPOCH: 2, valid_loss: 0.01929207446260585
SEED: 940, FOLD: 0, EPOCH: 3, train_loss: 0.018855544657486935
SEED: 940 ,FOLD: 0, EPOCH: 3, valid_loss: 0.018203541739947267
SEED: 940, FOLD: 0, EPOCH: 4, train_loss: 0.018072179640116898
SEED: 940 ,FOLD: 0, EPOCH: 4, valid_loss: 0.017530292169087462
SEED: 940, FOLD: 0, EPOCH: 5, train_loss: 0.017151300349961155
SEED: 940 ,FOLD: 0, EPOCH: 5, valid_loss: 0.017212128473652735
SEED: 940, FOLD: 0, EPOCH: 6, train_loss: 0.016607041254747604
SEED: 940 ,FOLD: 0, EPOCH: 6, valid_loss: 0.016719647941903934
SEED: 940, FOLD: 0, EPOCH: 7, train_loss: 0.01708855508300273
SEED: 940 ,FOLD: 0, EPOCH: 7, valid_loss: 0.020239912594358127
SE

SEED: 940, FOLD: 2, EPOCH: 15, train_loss: 0.015264661807188953
SEED: 940 ,FOLD: 2, EPOCH: 15, valid_loss: 0.016178909223526716
SEED: 940, FOLD: 2, EPOCH: 16, train_loss: 0.01497469174311213
SEED: 940 ,FOLD: 2, EPOCH: 16, valid_loss: 0.01617078736631407
SEED: 940, FOLD: 2, EPOCH: 17, train_loss: 0.014639085253187712
SEED: 940 ,FOLD: 2, EPOCH: 17, valid_loss: 0.016073988905797403
SEED: 940, FOLD: 2, EPOCH: 18, train_loss: 0.014194936586031015
SEED: 940 ,FOLD: 2, EPOCH: 18, valid_loss: 0.016157698558850422
SEED: 940, FOLD: 2, EPOCH: 19, train_loss: 0.013755107163519098
SEED: 940 ,FOLD: 2, EPOCH: 19, valid_loss: 0.016103939722395606
SEED: 940, FOLD: 2, EPOCH: 20, train_loss: 0.013032844155163004
SEED: 940 ,FOLD: 2, EPOCH: 20, valid_loss: 0.016028448577142425
SEED: 940, FOLD: 2, EPOCH: 21, train_loss: 0.012259715101749136
SEED: 940 ,FOLD: 2, EPOCH: 21, valid_loss: 0.015977731284995873
SEED: 940, FOLD: 2, EPOCH: 22, train_loss: 0.01159796012106581
SEED: 940 ,FOLD: 2, EPOCH: 22, valid_loss: 

SEED: 1513, FOLD: 0, EPOCH: 5, train_loss: 0.017186988091123276
SEED: 1513 ,FOLD: 0, EPOCH: 5, valid_loss: 0.017066122498363256
SEED: 1513, FOLD: 0, EPOCH: 6, train_loss: 0.016723733233368916
SEED: 1513 ,FOLD: 0, EPOCH: 6, valid_loss: 0.01708546741348174
SEED: 1513, FOLD: 0, EPOCH: 7, train_loss: 0.016940113319003063
SEED: 1513 ,FOLD: 0, EPOCH: 7, valid_loss: 0.016805684245708916
SEED: 1513, FOLD: 0, EPOCH: 8, train_loss: 0.016447000596942245
SEED: 1513 ,FOLD: 0, EPOCH: 8, valid_loss: 0.01663975536616312
SEED: 1513, FOLD: 0, EPOCH: 9, train_loss: 0.016341927366844124
SEED: 1513 ,FOLD: 0, EPOCH: 9, valid_loss: 0.016825660442312557
SEED: 1513, FOLD: 0, EPOCH: 10, train_loss: 0.01624331605769154
SEED: 1513 ,FOLD: 0, EPOCH: 10, valid_loss: 0.01653326680469844
SEED: 1513, FOLD: 0, EPOCH: 11, train_loss: 0.016118605729138504
SEED: 1513 ,FOLD: 0, EPOCH: 11, valid_loss: 0.01657687360420823
SEED: 1513, FOLD: 0, EPOCH: 12, train_loss: 0.01604625099487063
SEED: 1513 ,FOLD: 0, EPOCH: 12, valid_los

SEED: 1513, FOLD: 2, EPOCH: 19, train_loss: 0.013735001326363155
SEED: 1513 ,FOLD: 2, EPOCH: 19, valid_loss: 0.015980667693333492
SEED: 1513, FOLD: 2, EPOCH: 20, train_loss: 0.013080957791079645
SEED: 1513 ,FOLD: 2, EPOCH: 20, valid_loss: 0.01598525404309233
SEED: 1513, FOLD: 2, EPOCH: 21, train_loss: 0.012425638708299484
SEED: 1513 ,FOLD: 2, EPOCH: 21, valid_loss: 0.01596826133835647
SEED: 1513, FOLD: 2, EPOCH: 22, train_loss: 0.011775551120872084
SEED: 1513 ,FOLD: 2, EPOCH: 22, valid_loss: 0.015992806137849886
SEED: 1513, FOLD: 2, EPOCH: 23, train_loss: 0.01123383699281924
SEED: 1513 ,FOLD: 2, EPOCH: 23, valid_loss: 0.015959526412189007
SEED: 1513, FOLD: 2, EPOCH: 24, train_loss: 0.011031053150477617
SEED: 1513 ,FOLD: 2, EPOCH: 24, valid_loss: 0.015964934809340373
SEED: 1513, FOLD: 3, EPOCH: 0, train_loss: 0.7129119487776272
SEED: 1513 ,FOLD: 3, EPOCH: 0, valid_loss: 0.6056828167703416
SEED: 1513, FOLD: 3, EPOCH: 1, train_loss: 0.20440754729012647
SEED: 1513 ,FOLD: 3, EPOCH: 1, valid

SEED: 1269, FOLD: 0, EPOCH: 8, train_loss: 0.016437698924994987
SEED: 1269 ,FOLD: 0, EPOCH: 8, valid_loss: 0.016766903611520927
SEED: 1269, FOLD: 0, EPOCH: 9, train_loss: 0.016349687969878964
SEED: 1269 ,FOLD: 0, EPOCH: 9, valid_loss: 0.016404570183820195
SEED: 1269, FOLD: 0, EPOCH: 10, train_loss: 0.01615612114361231
SEED: 1269 ,FOLD: 0, EPOCH: 10, valid_loss: 0.016572738635457225
SEED: 1269, FOLD: 0, EPOCH: 11, train_loss: 0.016069984373946983
SEED: 1269 ,FOLD: 0, EPOCH: 11, valid_loss: 0.016550880753331713
SEED: 1269, FOLD: 0, EPOCH: 12, train_loss: 0.016043862651871597
SEED: 1269 ,FOLD: 0, EPOCH: 12, valid_loss: 0.01642914457867543
SEED: 1269, FOLD: 0, EPOCH: 13, train_loss: 0.015859826332956985
SEED: 1269 ,FOLD: 0, EPOCH: 13, valid_loss: 0.016258138749334548
SEED: 1269, FOLD: 0, EPOCH: 14, train_loss: 0.015687429008708485
SEED: 1269 ,FOLD: 0, EPOCH: 14, valid_loss: 0.016251411599417526
SEED: 1269, FOLD: 0, EPOCH: 15, train_loss: 0.015430981279823227
SEED: 1269 ,FOLD: 0, EPOCH: 15,

SEED: 1269, FOLD: 2, EPOCH: 22, train_loss: 0.01170420006889364
SEED: 1269 ,FOLD: 2, EPOCH: 22, valid_loss: 0.016002678622802097
SEED: 1269, FOLD: 2, EPOCH: 23, train_loss: 0.011177998589540737
SEED: 1269 ,FOLD: 2, EPOCH: 23, valid_loss: 0.01598661175618569
SEED: 1269, FOLD: 2, EPOCH: 24, train_loss: 0.011005040946538034
SEED: 1269 ,FOLD: 2, EPOCH: 24, valid_loss: 0.016010826350086264
SEED: 1269, FOLD: 3, EPOCH: 0, train_loss: 0.7127395151317983
SEED: 1269 ,FOLD: 3, EPOCH: 0, valid_loss: 0.6009805036915673
SEED: 1269, FOLD: 3, EPOCH: 1, train_loss: 0.20380426646358724
SEED: 1269 ,FOLD: 3, EPOCH: 1, valid_loss: 0.023143276882668335
SEED: 1269, FOLD: 3, EPOCH: 2, train_loss: 0.02086722475571045
SEED: 1269 ,FOLD: 3, EPOCH: 2, valid_loss: 0.019640291937523417
SEED: 1269, FOLD: 3, EPOCH: 3, train_loss: 0.018817759429415066
SEED: 1269 ,FOLD: 3, EPOCH: 3, valid_loss: 0.018420808741615877
SEED: 1269, FOLD: 3, EPOCH: 4, train_loss: 0.017880389318410038
SEED: 1269 ,FOLD: 3, EPOCH: 4, valid_loss:

SEED: 1392, FOLD: 0, EPOCH: 11, train_loss: 0.016097641684978767
SEED: 1392 ,FOLD: 0, EPOCH: 11, valid_loss: 0.01650571460939116
SEED: 1392, FOLD: 0, EPOCH: 12, train_loss: 0.015965741748611133
SEED: 1392 ,FOLD: 0, EPOCH: 12, valid_loss: 0.01637556916102767
SEED: 1392, FOLD: 0, EPOCH: 13, train_loss: 0.0157690169694631
SEED: 1392 ,FOLD: 0, EPOCH: 13, valid_loss: 0.016336217367400725
SEED: 1392, FOLD: 0, EPOCH: 14, train_loss: 0.015673524970053764
SEED: 1392 ,FOLD: 0, EPOCH: 14, valid_loss: 0.01615924688263072
SEED: 1392, FOLD: 0, EPOCH: 15, train_loss: 0.015420452381173769
SEED: 1392 ,FOLD: 0, EPOCH: 15, valid_loss: 0.01605805480438802
SEED: 1392, FOLD: 0, EPOCH: 16, train_loss: 0.015164129031093224
SEED: 1392 ,FOLD: 0, EPOCH: 16, valid_loss: 0.015953205784575805
SEED: 1392, FOLD: 0, EPOCH: 17, train_loss: 0.01474861672445052
SEED: 1392 ,FOLD: 0, EPOCH: 17, valid_loss: 0.015852466432584658
SEED: 1392, FOLD: 0, EPOCH: 18, train_loss: 0.014353182180312233
SEED: 1392 ,FOLD: 0, EPOCH: 18, 

SEED: 1392, FOLD: 3, EPOCH: 0, train_loss: 0.7142266631126404
SEED: 1392 ,FOLD: 3, EPOCH: 0, valid_loss: 0.6155180831750234
SEED: 1392, FOLD: 3, EPOCH: 1, train_loss: 0.20392211054222306
SEED: 1392 ,FOLD: 3, EPOCH: 1, valid_loss: 0.023242825435267553
SEED: 1392, FOLD: 3, EPOCH: 2, train_loss: 0.02114195657381113
SEED: 1392 ,FOLD: 3, EPOCH: 2, valid_loss: 0.019583909668856196
SEED: 1392, FOLD: 3, EPOCH: 3, train_loss: 0.018881930389266083
SEED: 1392 ,FOLD: 3, EPOCH: 3, valid_loss: 0.01837532724150353
SEED: 1392, FOLD: 3, EPOCH: 4, train_loss: 0.017970061145614887
SEED: 1392 ,FOLD: 3, EPOCH: 4, valid_loss: 0.01772538246586919
SEED: 1392, FOLD: 3, EPOCH: 5, train_loss: 0.017067205012384533
SEED: 1392 ,FOLD: 3, EPOCH: 5, valid_loss: 0.017374441648523014
SEED: 1392, FOLD: 3, EPOCH: 6, train_loss: 0.016558105679417866
SEED: 1392 ,FOLD: 3, EPOCH: 6, valid_loss: 0.01751897142579158
SEED: 1392, FOLD: 3, EPOCH: 7, train_loss: 0.016435607835866402
SEED: 1392 ,FOLD: 3, EPOCH: 7, valid_loss: 0.0169

SEED: 1119, FOLD: 0, EPOCH: 14, train_loss: 0.015613302264524542
SEED: 1119 ,FOLD: 0, EPOCH: 14, valid_loss: 0.016190104445235595
SEED: 1119, FOLD: 0, EPOCH: 15, train_loss: 0.015402941127726133
SEED: 1119 ,FOLD: 0, EPOCH: 15, valid_loss: 0.016127841857572395
SEED: 1119, FOLD: 0, EPOCH: 16, train_loss: 0.015165757144922796
SEED: 1119 ,FOLD: 0, EPOCH: 16, valid_loss: 0.016036606238534052
SEED: 1119, FOLD: 0, EPOCH: 17, train_loss: 0.014885168536093788
SEED: 1119 ,FOLD: 0, EPOCH: 17, valid_loss: 0.015852090023044083
SEED: 1119, FOLD: 0, EPOCH: 18, train_loss: 0.014356870717112568
SEED: 1119 ,FOLD: 0, EPOCH: 18, valid_loss: 0.015828294834742945
SEED: 1119, FOLD: 0, EPOCH: 19, train_loss: 0.013911717044918434
SEED: 1119 ,FOLD: 0, EPOCH: 19, valid_loss: 0.01584018777228064
SEED: 1119, FOLD: 0, EPOCH: 20, train_loss: 0.013312242533741653
SEED: 1119 ,FOLD: 0, EPOCH: 20, valid_loss: 0.015705842783467636
SEED: 1119, FOLD: 0, EPOCH: 21, train_loss: 0.012599494200253832
SEED: 1119 ,FOLD: 0, EPOCH

SEED: 1119, FOLD: 3, EPOCH: 3, train_loss: 0.01885689827411071
SEED: 1119 ,FOLD: 3, EPOCH: 3, valid_loss: 0.01854017335507605
SEED: 1119, FOLD: 3, EPOCH: 4, train_loss: 0.01771986385996359
SEED: 1119 ,FOLD: 3, EPOCH: 4, valid_loss: 0.01831452227714989
SEED: 1119, FOLD: 3, EPOCH: 5, train_loss: 0.017591862750334152
SEED: 1119 ,FOLD: 3, EPOCH: 5, valid_loss: 0.017233004379603598
SEED: 1119, FOLD: 3, EPOCH: 6, train_loss: 0.01659698880183092
SEED: 1119 ,FOLD: 3, EPOCH: 6, valid_loss: 0.017232329895099003
SEED: 1119, FOLD: 3, EPOCH: 7, train_loss: 0.01637173562810041
SEED: 1119 ,FOLD: 3, EPOCH: 7, valid_loss: 0.016950516185412805
SEED: 1119, FOLD: 3, EPOCH: 8, train_loss: 0.01631733463348254
SEED: 1119 ,FOLD: 3, EPOCH: 8, valid_loss: 0.01719138626423147
SEED: 1119, FOLD: 3, EPOCH: 9, train_loss: 0.016300996212099773
SEED: 1119 ,FOLD: 3, EPOCH: 9, valid_loss: 0.016709826110551756
SEED: 1119, FOLD: 3, EPOCH: 10, train_loss: 0.016129332324624924
SEED: 1119 ,FOLD: 3, EPOCH: 10, valid_loss: 0.0

SEED: 1303, FOLD: 0, EPOCH: 17, train_loss: 0.014749038505597391
SEED: 1303 ,FOLD: 0, EPOCH: 17, valid_loss: 0.01602541644954019
SEED: 1303, FOLD: 0, EPOCH: 18, train_loss: 0.014413308878631695
SEED: 1303 ,FOLD: 0, EPOCH: 18, valid_loss: 0.015861721398929756
SEED: 1303, FOLD: 0, EPOCH: 19, train_loss: 0.013858832094980322
SEED: 1303 ,FOLD: 0, EPOCH: 19, valid_loss: 0.015698907586435478
SEED: 1303, FOLD: 0, EPOCH: 20, train_loss: 0.013188735419965309
SEED: 1303 ,FOLD: 0, EPOCH: 20, valid_loss: 0.015748933888971806
SEED: 1303, FOLD: 0, EPOCH: 21, train_loss: 0.012488213543226753
SEED: 1303 ,FOLD: 0, EPOCH: 21, valid_loss: 0.015759551690684423
SEED: 1303, FOLD: 0, EPOCH: 22, train_loss: 0.011798783961305584
SEED: 1303 ,FOLD: 0, EPOCH: 22, valid_loss: 0.015700462325993512
SEED: 1303, FOLD: 0, EPOCH: 23, train_loss: 0.011298596710506557
SEED: 1303 ,FOLD: 0, EPOCH: 23, valid_loss: 0.015690927755915456
SEED: 1303, FOLD: 0, EPOCH: 24, train_loss: 0.01108844459488772
SEED: 1303 ,FOLD: 0, EPOCH:

SEED: 1303, FOLD: 3, EPOCH: 6, train_loss: 0.0172701900338997
SEED: 1303 ,FOLD: 3, EPOCH: 6, valid_loss: 0.017572164845963318
SEED: 1303, FOLD: 3, EPOCH: 7, train_loss: 0.016599498054795506
SEED: 1303 ,FOLD: 3, EPOCH: 7, valid_loss: 0.016837053880509403
SEED: 1303, FOLD: 3, EPOCH: 8, train_loss: 0.016424978745804317
SEED: 1303 ,FOLD: 3, EPOCH: 8, valid_loss: 0.016884021357529692
SEED: 1303, FOLD: 3, EPOCH: 9, train_loss: 0.01617468695115784
SEED: 1303 ,FOLD: 3, EPOCH: 9, valid_loss: 0.016603480507102277
SEED: 1303, FOLD: 3, EPOCH: 10, train_loss: 0.016151860545294872
SEED: 1303 ,FOLD: 3, EPOCH: 10, valid_loss: 0.016580511298444536
SEED: 1303, FOLD: 3, EPOCH: 11, train_loss: 0.01606974820943846
SEED: 1303 ,FOLD: 3, EPOCH: 11, valid_loss: 0.016582716773781512
SEED: 1303, FOLD: 3, EPOCH: 12, train_loss: 0.015904985219780086
SEED: 1303 ,FOLD: 3, EPOCH: 12, valid_loss: 0.016717028390202258
SEED: 1303, FOLD: 3, EPOCH: 13, train_loss: 0.015768158762459305
SEED: 1303 ,FOLD: 3, EPOCH: 13, valid

In [52]:
train.to_pickle(f"{INT_DIR}/{NB}-train-score-pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test-score-pred.pkl")

In [53]:
len(target_cols)

206

In [54]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))

valid_results = train_targets_scored.drop(columns=target_cols).merge(
    train[['sig_id'] + target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]

print("CV log_loss: ", score)

CV log_loss:  0.014299388238043196


- CV log_loss:  0.014761779358699672
- CV log_loss:  0.014519859174255039
- CV log_loss:  0.014525173864593479
- CV log_loss:  0.014354930596928602 # 3 umap features
- CV log_loss:  0.014353604854355429 # more umap features
- CV log_loss:  0.01436484670778641 # more hidden nodes
- CV log_loss:  0.014344688083211073
  - using predicted unscored targets as feature 
- CV log_loss:  0.013368097791623873
  - using given unscored targets as feature
  - bad in public lb
- CV log_loss:  0.01434373547175235
  - rankgauss predicted unscored targets
- CV log_loss:  0.014346100008158216
  - unscored targets pca/umap
- CV log_loss:  0.014328486629791769
  - NFOLDS=10, Epoch=20
- CV log_loss:  0.014299741080816082
  - NFOLDS=10, Epoch=20, 25
- CV log_loss:  0.014311301224480969
  - NFOLDS=10, Epoch=25
- CV log_loss:  0.01429269446076626
  - NFOLDS=10, Epoch=15, 25

In [55]:
# train = pd.read_pickle(f"../interim/23-train-score-pred.pkl")
# test = pd.read_pickle(f"../interim/23-test-score-pred.pkl")

In [56]:
train = pd.read_pickle(f"{INT_DIR}/{NB}-train-score-pred.pkl")
test = pd.read_pickle(f"{INT_DIR}/{NB}-test-score-pred.pkl")

In [57]:
EPOCHS = 25
# NFOLDS = 5

In [58]:
PMIN = 0.0005
PMAX = 0.9995
for c in train_targets_scored.columns:
    if c != "sig_id":
        train_targets_scored[c] = np.maximum(
            PMIN, np.minimum(PMAX, train_targets_scored[c]))

In [59]:
train_targets_scored.columns

Index(['sig_id', '5-alpha_reductase_inhibitor', '11-beta-hsd1_inhibitor',
       'acat_inhibitor', 'acetylcholine_receptor_agonist',
       'acetylcholine_receptor_antagonist', 'acetylcholinesterase_inhibitor',
       'adenosine_receptor_agonist', 'adenosine_receptor_antagonist',
       'adenylyl_cyclase_activator',
       ...
       'tropomyosin_receptor_kinase_inhibitor', 'trpv_agonist',
       'trpv_antagonist', 'tubulin_inhibitor', 'tyrosine_kinase_inhibitor',
       'ubiquitin_specific_protease_inhibitor', 'vegfr_inhibitor', 'vitamin_b',
       'vitamin_d_receptor_agonist', 'wnt_inhibitor'],
      dtype='object', length=207)

In [60]:
train = train[train_targets_scored.columns]
train.columns = [
    c + "_pred" if (c != 'sig_id' and c in train_targets_scored.columns) else c
    for c in train.columns
]

In [61]:
test = test[train_targets_scored.columns]
test.columns = [
    c + "_pred" if (c != 'sig_id' and c in train_targets_scored.columns) else c
    for c in test.columns
]

In [62]:
train

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,tropomyosin_receptor_kinase_inhibitor_pred,trpv_agonist_pred,trpv_antagonist_pred,tubulin_inhibitor_pred,tyrosine_kinase_inhibitor_pred,ubiquitin_specific_protease_inhibitor_pred,vegfr_inhibitor_pred,vitamin_b_pred,vitamin_d_receptor_agonist_pred,wnt_inhibitor_pred
0,id_000644bb2,0.000822,0.000502,0.001046,0.009744,0.038131,0.004170,0.003465,0.004213,0.000307,...,0.000406,0.000411,0.002389,0.001663,0.000883,0.000406,0.000745,0.001785,0.000324,0.002069
1,id_000779bfc,0.000524,0.000529,0.001235,0.013645,0.014442,0.004031,0.002750,0.004612,0.000311,...,0.000555,0.001166,0.001491,0.003492,0.001437,0.000278,0.001146,0.002702,0.001103,0.002312
2,id_000a6266a,0.001740,0.002226,0.001088,0.004657,0.009119,0.002140,0.001818,0.006204,0.000739,...,0.000405,0.001578,0.003114,0.004492,0.009765,0.000638,0.037448,0.003954,0.000287,0.001654
3,id_0015fd391,0.000375,0.000629,0.001531,0.007561,0.009475,0.001604,0.002851,0.002150,0.000341,...,0.000595,0.002003,0.001855,0.029406,0.002980,0.000783,0.001937,0.001951,0.000278,0.000580
4,id_001626bd3,0.000418,0.000743,0.003778,0.007727,0.010248,0.001827,0.004712,0.002950,0.000644,...,0.001129,0.001830,0.002106,0.002657,0.001869,0.000676,0.002021,0.002728,0.000578,0.001318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,0.004956,0.003686,0.000441,0.006845,0.029195,0.003761,0.001185,0.002988,0.000214,...,0.000358,0.000281,0.005247,0.000353,0.000502,0.000746,0.001878,0.000714,0.000579,0.000455
21944,id_fffb1ceed,0.001999,0.001331,0.000962,0.011255,0.034413,0.003121,0.001882,0.004319,0.000238,...,0.000474,0.000345,0.002403,0.001051,0.001516,0.000380,0.000909,0.001232,0.000498,0.000967
21945,id_fffb70c0c,0.001252,0.001061,0.002338,0.002183,0.002365,0.003345,0.001769,0.011916,0.001696,...,0.000433,0.001225,0.003078,0.000465,0.007309,0.000332,0.002859,0.001278,0.002171,0.005852
21946,id_fffcb9e7c,0.000310,0.000305,0.000328,0.001899,0.002964,0.000758,0.000700,0.000942,0.000094,...,0.000126,0.000249,0.000611,0.001738,0.001826,0.000154,0.001165,0.000361,0.000110,0.000428


In [63]:
# use nonscored target in the given file as feature
# if comment out below, use predicted nonscored target
# train = train.drop(nonscored_target, axis=1)
# train = train.merge(train_targets_nonscored, on="sig_id")
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(train_targets_scored, on='sig_id')
# train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
# test = test[test['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_scored.columns]

In [64]:
# train["cp_time"] = train_features[train_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_time"]
# train["cp_dose"] = train_features[train_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_dose"]
# test["cp_time"] = test_features[test_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_time"]
# test["cp_dose"] = test_features[test_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_dose"]

In [65]:
from sklearn.preprocessing import QuantileTransformer

scored_target_pred = [
    c + "_pred" for c in train_targets_scored.columns if c != 'sig_id'
]

for col in (scored_target_pred):

    #     transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
    vec_len = len(train[col].values)
    vec_len_test = len(test[col].values)
    raw_vec = train[col].values.reshape(vec_len, 1)
    #     transformer.fit(raw_vec)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100,
                                          random_state=0,
                                          output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer,
                     f"{MODEL_DIR}/{NB}_{col}_quantile_scored.pkl")
    else:
        transformer = pd.read_pickle(
            f"{MODEL_DIR}/{NB}_{col}_quantile_scored.pkl")

    train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test[col] = transformer.transform(test[col].values.reshape(
        vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [66]:
# train = train.drop('cp_type', axis=1)
# test = test.drop('cp_type', axis=1)

In [67]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [68]:
train

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0.537081,-0.400535,-0.069270,0.345114,1.591405,0.361417,0.807020,0.308358,0.345785,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
1,id_000779bfc,-0.076446,-0.331551,0.224693,0.775918,0.020212,0.309040,0.505060,0.435642,0.369333,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
2,id_000a6266a,1.409396,1.474718,-0.002142,-0.416476,-0.406838,-0.447629,-0.016101,0.831803,1.543713,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
3,id_0015fd391,-0.522910,-0.088334,0.611943,0.060545,-0.376046,-0.659599,0.554086,-0.560342,0.512390,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
4,id_001626bd3,-0.377659,0.155786,2.087313,0.082745,-0.315074,-0.575166,1.202246,-0.181915,1.386576,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,2.322966,1.996291,-1.447835,-0.042943,1.089451,0.211433,-0.493232,-0.165787,-0.272616,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
21944,id_fffb1ceed,1.556072,0.897324,-0.217874,0.526463,1.401514,-0.038725,0.026458,0.341287,-0.091528,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
21945,id_fffb70c0c,1.039404,0.625719,1.334725,-0.984314,-1.353316,0.046913,-0.049031,1.628580,2.185396,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
21946,id_fffcb9e7c,-0.747835,-0.966265,-1.901506,-1.095612,-1.131262,-1.116467,-0.970621,-1.177981,-1.356659,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005


In [69]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds



Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,0.537081,-0.400535,-0.069270,0.345114,1.591405,0.361417,0.807020,0.308358,0.345785,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
1,id_000779bfc,-0.076446,-0.331551,0.224693,0.775918,0.020212,0.309040,0.505060,0.435642,0.369333,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
2,id_000a6266a,1.409396,1.474718,-0.002142,-0.416476,-0.406838,-0.447629,-0.016101,0.831803,1.543713,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
3,id_0015fd391,-0.522910,-0.088334,0.611943,0.060545,-0.376046,-0.659599,0.554086,-0.560342,0.512390,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
4,id_001626bd3,-0.377659,0.155786,2.087313,0.082745,-0.315074,-0.575166,1.202246,-0.181915,1.386576,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,2.322966,1.996291,-1.447835,-0.042943,1.089451,0.211433,-0.493232,-0.165787,-0.272616,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21944,id_fffb1ceed,1.556072,0.897324,-0.217874,0.526463,1.401514,-0.038725,0.026458,0.341287,-0.091528,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21945,id_fffb70c0c,1.039404,0.625719,1.334725,-0.984314,-1.353316,0.046913,-0.049031,1.628580,2.185396,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21946,id_fffcb9e7c,-0.747835,-0.966265,-1.901506,-1.095612,-1.131262,-1.116467,-0.970621,-1.177981,-1.356659,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [70]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 413)
(21948, 414)
(3624, 207)
(21948, 207)
(3982, 207)


In [71]:
folds

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,0.537081,-0.400535,-0.069270,0.345114,1.591405,0.361417,0.807020,0.308358,0.345785,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
1,id_000779bfc,-0.076446,-0.331551,0.224693,0.775918,0.020212,0.309040,0.505060,0.435642,0.369333,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
2,id_000a6266a,1.409396,1.474718,-0.002142,-0.416476,-0.406838,-0.447629,-0.016101,0.831803,1.543713,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
3,id_0015fd391,-0.522910,-0.088334,0.611943,0.060545,-0.376046,-0.659599,0.554086,-0.560342,0.512390,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
4,id_001626bd3,-0.377659,0.155786,2.087313,0.082745,-0.315074,-0.575166,1.202246,-0.181915,1.386576,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,2.322966,1.996291,-1.447835,-0.042943,1.089451,0.211433,-0.493232,-0.165787,-0.272616,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21944,id_fffb1ceed,1.556072,0.897324,-0.217874,0.526463,1.401514,-0.038725,0.026458,0.341287,-0.091528,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21945,id_fffb70c0c,1.039404,0.625719,1.334725,-0.984314,-1.353316,0.046913,-0.049031,1.628580,2.185396,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21946,id_fffcb9e7c,-0.747835,-0.966265,-1.901506,-1.095612,-1.131262,-1.116467,-0.970621,-1.177981,-1.356659,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [72]:
def process_data(data):

    #     data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
    #     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2, 0:0, 1:1, 2:2})
    #     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1, 0:0, 1:1})

    # --------------------- Normalize ---------------------
    #     for col in GENES:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #     for col in CELLS:
    #         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))

    #--------------------- Removing Skewness ---------------------
    #     for col in GENES + CELLS:
    #         if(abs(data[col].skew()) > 0.75):

    #             if(data[col].skew() < 0): # neg-skewness
    #                 data[col] = data[col].max() - data[col] + 1
    #                 data[col] = np.sqrt(data[col])

    #             else:
    #                 data[col] = np.sqrt(data[col])

    return data

In [73]:
feature_cols = [c for c in folds.columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

206

In [74]:
feature_cols

['5-alpha_reductase_inhibitor_pred',
 '11-beta-hsd1_inhibitor_pred',
 'acat_inhibitor_pred',
 'acetylcholine_receptor_agonist_pred',
 'acetylcholine_receptor_antagonist_pred',
 'acetylcholinesterase_inhibitor_pred',
 'adenosine_receptor_agonist_pred',
 'adenosine_receptor_antagonist_pred',
 'adenylyl_cyclase_activator_pred',
 'adrenergic_receptor_agonist_pred',
 'adrenergic_receptor_antagonist_pred',
 'akt_inhibitor_pred',
 'aldehyde_dehydrogenase_inhibitor_pred',
 'alk_inhibitor_pred',
 'ampk_activator_pred',
 'analgesic_pred',
 'androgen_receptor_agonist_pred',
 'androgen_receptor_antagonist_pred',
 'anesthetic_-_local_pred',
 'angiogenesis_inhibitor_pred',
 'angiotensin_receptor_antagonist_pred',
 'anti-inflammatory_pred',
 'antiarrhythmic_pred',
 'antibiotic_pred',
 'anticonvulsant_pred',
 'antifungal_pred',
 'antihistamine_pred',
 'antimalarial_pred',
 'antioxidant_pred',
 'antiprotozoal_pred',
 'antiviral_pred',
 'apoptosis_stimulant_pred',
 'aromatase_inhibitor_pred',
 'atm_kina

In [75]:
folds

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,0.537081,-0.400535,-0.069270,0.345114,1.591405,0.361417,0.807020,0.308358,0.345785,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
1,id_000779bfc,-0.076446,-0.331551,0.224693,0.775918,0.020212,0.309040,0.505060,0.435642,0.369333,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
2,id_000a6266a,1.409396,1.474718,-0.002142,-0.416476,-0.406838,-0.447629,-0.016101,0.831803,1.543713,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
3,id_0015fd391,-0.522910,-0.088334,0.611943,0.060545,-0.376046,-0.659599,0.554086,-0.560342,0.512390,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
4,id_001626bd3,-0.377659,0.155786,2.087313,0.082745,-0.315074,-0.575166,1.202246,-0.181915,1.386576,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,2.322966,1.996291,-1.447835,-0.042943,1.089451,0.211433,-0.493232,-0.165787,-0.272616,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21944,id_fffb1ceed,1.556072,0.897324,-0.217874,0.526463,1.401514,-0.038725,0.026458,0.341287,-0.091528,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21945,id_fffb70c0c,1.039404,0.625719,1.334725,-0.984314,-1.353316,0.046913,-0.049031,1.628580,2.185396,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21946,id_fffcb9e7c,-0.747835,-0.966265,-1.901506,-1.095612,-1.131262,-1.116467,-0.970621,-1.177981,-1.356659,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [76]:
EPOCHS = 25
num_features = len(feature_cols)
num_targets = len(target_cols)
hidden_size = 1024
# hidden_size=4096
# hidden_size=9192

In [77]:
def run_training(fold, seed):

    seed_everything(seed)

    train = (folds)
    test_ = (test)

    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index

    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=5e-3,
                                 weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                              pct_start=0.2,
                                              div_factor=1e3,
                                              max_lr=1e-2,
                                              epochs=EPOCHS,
                                              steps_per_epoch=len(trainloader))

    loss_fn = nn.BCEWithLogitsLoss()
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0

    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf

    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer, scheduler, loss_fn,
                              trainloader, DEVICE)
        print(
            f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}"
        )
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(
            f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}"
        )

        if valid_loss < best_loss:

            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(),
                       f"{MODEL_DIR}/{NB}-scored2-SEED{seed}-FOLD{fold}_.pth")

        elif (EARLY_STOP == True):

            early_step += 1
            if (early_step >= early_stopping_steps):
                break

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    model.load_state_dict(
        torch.load(f"{MODEL_DIR}/{NB}-scored2-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)

    #   if not IS_TRAIN:
    # valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
    # oof[val_idx] = valid_preds

    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions

In [78]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))

    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)

        predictions += pred_ / NFOLDS
        oof += oof_

    return oof, predictions

In [79]:
SEED = [940, 1513, 1269, 1392, 1119, 1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:

    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

SEED: 940, FOLD: 0, EPOCH: 0, train_loss: 0.718145609765813
SEED: 940 ,FOLD: 0, EPOCH: 0, valid_loss: 0.6396683785650465
SEED: 940, FOLD: 0, EPOCH: 1, train_loss: 0.2655495387920435
SEED: 940 ,FOLD: 0, EPOCH: 1, valid_loss: 0.03087172212286128
SEED: 940, FOLD: 0, EPOCH: 2, train_loss: 0.024781704434882038
SEED: 940 ,FOLD: 0, EPOCH: 2, valid_loss: 0.021605933499005105
SEED: 940, FOLD: 0, EPOCH: 3, train_loss: 0.021165007455409435
SEED: 940 ,FOLD: 0, EPOCH: 3, valid_loss: 0.02030052699976497
SEED: 940, FOLD: 0, EPOCH: 4, train_loss: 0.020388307022875633
SEED: 940 ,FOLD: 0, EPOCH: 4, valid_loss: 0.019968161669870216
SEED: 940, FOLD: 0, EPOCH: 5, train_loss: 0.019876298670103584
SEED: 940 ,FOLD: 0, EPOCH: 5, valid_loss: 0.019536862874196634
SEED: 940, FOLD: 0, EPOCH: 6, train_loss: 0.01959703001531138
SEED: 940 ,FOLD: 0, EPOCH: 6, valid_loss: 0.019616645864314504
SEED: 940, FOLD: 0, EPOCH: 7, train_loss: 0.01935468875951525
SEED: 940 ,FOLD: 0, EPOCH: 7, valid_loss: 0.019411795255210664
SEE

SEED: 940, FOLD: 2, EPOCH: 15, train_loss: 0.01908902856318847
SEED: 940 ,FOLD: 2, EPOCH: 15, valid_loss: 0.0190724174802502
SEED: 940, FOLD: 2, EPOCH: 16, train_loss: 0.018878019482329273
SEED: 940 ,FOLD: 2, EPOCH: 16, valid_loss: 0.019048736948106024
SEED: 940, FOLD: 2, EPOCH: 17, train_loss: 0.01876743286308171
SEED: 940 ,FOLD: 2, EPOCH: 17, valid_loss: 0.019006037360264197
SEED: 940, FOLD: 2, EPOCH: 18, train_loss: 0.01860826667668163
SEED: 940 ,FOLD: 2, EPOCH: 18, valid_loss: 0.018901370155314606
SEED: 940, FOLD: 2, EPOCH: 19, train_loss: 0.018438921755422718
SEED: 940 ,FOLD: 2, EPOCH: 19, valid_loss: 0.018788124434649944
SEED: 940, FOLD: 2, EPOCH: 20, train_loss: 0.01829090136764706
SEED: 940 ,FOLD: 2, EPOCH: 20, valid_loss: 0.0187627664870686
SEED: 940, FOLD: 2, EPOCH: 21, train_loss: 0.018112322072619976
SEED: 940 ,FOLD: 2, EPOCH: 21, valid_loss: 0.01869153914352258
SEED: 940, FOLD: 2, EPOCH: 22, train_loss: 0.017946369811028675
SEED: 940 ,FOLD: 2, EPOCH: 22, valid_loss: 0.0186

SEED: 1513, FOLD: 0, EPOCH: 5, train_loss: 0.01993294108820998
SEED: 1513 ,FOLD: 0, EPOCH: 5, valid_loss: 0.019761933634678524
SEED: 1513, FOLD: 0, EPOCH: 6, train_loss: 0.01957793136977631
SEED: 1513 ,FOLD: 0, EPOCH: 6, valid_loss: 0.01940046726829476
SEED: 1513, FOLD: 0, EPOCH: 7, train_loss: 0.019405940493595772
SEED: 1513 ,FOLD: 0, EPOCH: 7, valid_loss: 0.019453050568699837
SEED: 1513, FOLD: 0, EPOCH: 8, train_loss: 0.019299754144056984
SEED: 1513 ,FOLD: 0, EPOCH: 8, valid_loss: 0.01950250855750508
SEED: 1513, FOLD: 0, EPOCH: 9, train_loss: 0.019296473108124042
SEED: 1513 ,FOLD: 0, EPOCH: 9, valid_loss: 0.019363499246537685
SEED: 1513, FOLD: 0, EPOCH: 10, train_loss: 0.019168155834726665
SEED: 1513 ,FOLD: 0, EPOCH: 10, valid_loss: 0.019305344257089827
SEED: 1513, FOLD: 0, EPOCH: 11, train_loss: 0.01915799424160218
SEED: 1513 ,FOLD: 0, EPOCH: 11, valid_loss: 0.019280485084487334
SEED: 1513, FOLD: 0, EPOCH: 12, train_loss: 0.01922717774151892
SEED: 1513 ,FOLD: 0, EPOCH: 12, valid_los

SEED: 1513, FOLD: 2, EPOCH: 19, train_loss: 0.018485438429575035
SEED: 1513 ,FOLD: 2, EPOCH: 19, valid_loss: 0.01881455464495553
SEED: 1513, FOLD: 2, EPOCH: 20, train_loss: 0.018311332047417545
SEED: 1513 ,FOLD: 2, EPOCH: 20, valid_loss: 0.01876267169912656
SEED: 1513, FOLD: 2, EPOCH: 21, train_loss: 0.018134920898339024
SEED: 1513 ,FOLD: 2, EPOCH: 21, valid_loss: 0.018702793038553663
SEED: 1513, FOLD: 2, EPOCH: 22, train_loss: 0.017979617433055588
SEED: 1513 ,FOLD: 2, EPOCH: 22, valid_loss: 0.01868340962876876
SEED: 1513, FOLD: 2, EPOCH: 23, train_loss: 0.01786795636450035
SEED: 1513 ,FOLD: 2, EPOCH: 23, valid_loss: 0.01865694599433078
SEED: 1513, FOLD: 2, EPOCH: 24, train_loss: 0.017796520413695904
SEED: 1513 ,FOLD: 2, EPOCH: 24, valid_loss: 0.01865293075227075
SEED: 1513, FOLD: 3, EPOCH: 0, train_loss: 0.7181667210399241
SEED: 1513 ,FOLD: 3, EPOCH: 0, valid_loss: 0.632506572537952
SEED: 1513, FOLD: 3, EPOCH: 1, train_loss: 0.2637901277537795
SEED: 1513 ,FOLD: 3, EPOCH: 1, valid_loss

SEED: 1269, FOLD: 0, EPOCH: 8, train_loss: 0.01929672677879748
SEED: 1269 ,FOLD: 0, EPOCH: 8, valid_loss: 0.019351669897635777
SEED: 1269, FOLD: 0, EPOCH: 9, train_loss: 0.019194220584155857
SEED: 1269 ,FOLD: 0, EPOCH: 9, valid_loss: 0.01921900806741582
SEED: 1269, FOLD: 0, EPOCH: 10, train_loss: 0.019187275671224663
SEED: 1269 ,FOLD: 0, EPOCH: 10, valid_loss: 0.01942023283077611
SEED: 1269, FOLD: 0, EPOCH: 11, train_loss: 0.019160444213860275
SEED: 1269 ,FOLD: 0, EPOCH: 11, valid_loss: 0.019336542735497158
SEED: 1269, FOLD: 0, EPOCH: 12, train_loss: 0.019149161389340526
SEED: 1269 ,FOLD: 0, EPOCH: 12, valid_loss: 0.019354186434712674
SEED: 1269, FOLD: 0, EPOCH: 13, train_loss: 0.019106623433206394
SEED: 1269 ,FOLD: 0, EPOCH: 13, valid_loss: 0.01918785274028778
SEED: 1269, FOLD: 0, EPOCH: 14, train_loss: 0.019026904191443886
SEED: 1269 ,FOLD: 0, EPOCH: 14, valid_loss: 0.019259562198486593
SEED: 1269, FOLD: 0, EPOCH: 15, train_loss: 0.018917792868139088
SEED: 1269 ,FOLD: 0, EPOCH: 15, v

SEED: 1269, FOLD: 2, EPOCH: 22, train_loss: 0.01793018566525501
SEED: 1269 ,FOLD: 2, EPOCH: 22, valid_loss: 0.018672290878991287
SEED: 1269, FOLD: 2, EPOCH: 23, train_loss: 0.017796212132426277
SEED: 1269 ,FOLD: 2, EPOCH: 23, valid_loss: 0.018651734727124374
SEED: 1269, FOLD: 2, EPOCH: 24, train_loss: 0.017732760558525722
SEED: 1269 ,FOLD: 2, EPOCH: 24, valid_loss: 0.018659216351807117
SEED: 1269, FOLD: 3, EPOCH: 0, train_loss: 0.7206153126730435
SEED: 1269 ,FOLD: 3, EPOCH: 0, valid_loss: 0.6407128638691373
SEED: 1269, FOLD: 3, EPOCH: 1, train_loss: 0.26238041964993963
SEED: 1269 ,FOLD: 3, EPOCH: 1, valid_loss: 0.031742003849811025
SEED: 1269, FOLD: 3, EPOCH: 2, train_loss: 0.025002557163437206
SEED: 1269 ,FOLD: 3, EPOCH: 2, valid_loss: 0.021740789835651714
SEED: 1269, FOLD: 3, EPOCH: 3, train_loss: 0.02125154103597869
SEED: 1269 ,FOLD: 3, EPOCH: 3, valid_loss: 0.02020814859618743
SEED: 1269, FOLD: 3, EPOCH: 4, train_loss: 0.02042561950351017
SEED: 1269 ,FOLD: 3, EPOCH: 4, valid_loss: 

SEED: 1392, FOLD: 0, EPOCH: 11, train_loss: 0.019185428143195484
SEED: 1392 ,FOLD: 0, EPOCH: 11, valid_loss: 0.019425188708636496
SEED: 1392, FOLD: 0, EPOCH: 12, train_loss: 0.019117515507167664
SEED: 1392 ,FOLD: 0, EPOCH: 12, valid_loss: 0.01923563289973471
SEED: 1392, FOLD: 0, EPOCH: 13, train_loss: 0.01909494359532128
SEED: 1392 ,FOLD: 0, EPOCH: 13, valid_loss: 0.01919711950338549
SEED: 1392, FOLD: 0, EPOCH: 14, train_loss: 0.01905156944648943
SEED: 1392 ,FOLD: 0, EPOCH: 14, valid_loss: 0.019208221489356622
SEED: 1392, FOLD: 0, EPOCH: 15, train_loss: 0.01895094090613766
SEED: 1392 ,FOLD: 0, EPOCH: 15, valid_loss: 0.019241492057012186
SEED: 1392, FOLD: 0, EPOCH: 16, train_loss: 0.018854141208356705
SEED: 1392 ,FOLD: 0, EPOCH: 16, valid_loss: 0.019056764638258353
SEED: 1392, FOLD: 0, EPOCH: 17, train_loss: 0.018690567787574684
SEED: 1392 ,FOLD: 0, EPOCH: 17, valid_loss: 0.019033711300128035
SEED: 1392, FOLD: 0, EPOCH: 18, train_loss: 0.018593294977925827
SEED: 1392 ,FOLD: 0, EPOCH: 18

SEED: 1392, FOLD: 3, EPOCH: 0, train_loss: 0.7197182023006937
SEED: 1392 ,FOLD: 3, EPOCH: 0, valid_loss: 0.6392995350890689
SEED: 1392, FOLD: 3, EPOCH: 1, train_loss: 0.26422847135235433
SEED: 1392 ,FOLD: 3, EPOCH: 1, valid_loss: 0.030144351120624278
SEED: 1392, FOLD: 3, EPOCH: 2, train_loss: 0.024936743566523426
SEED: 1392 ,FOLD: 3, EPOCH: 2, valid_loss: 0.02185430729554759
SEED: 1392, FOLD: 3, EPOCH: 3, train_loss: 0.02127879485487938
SEED: 1392 ,FOLD: 3, EPOCH: 3, valid_loss: 0.020196364778611395
SEED: 1392, FOLD: 3, EPOCH: 4, train_loss: 0.020526168948930244
SEED: 1392 ,FOLD: 3, EPOCH: 4, valid_loss: 0.019783337083127763
SEED: 1392, FOLD: 3, EPOCH: 5, train_loss: 0.019968282296389774
SEED: 1392 ,FOLD: 3, EPOCH: 5, valid_loss: 0.019704107609060075
SEED: 1392, FOLD: 3, EPOCH: 6, train_loss: 0.019691256320346958
SEED: 1392 ,FOLD: 3, EPOCH: 6, valid_loss: 0.01934542755285899
SEED: 1392, FOLD: 3, EPOCH: 7, train_loss: 0.01938487211431282
SEED: 1392 ,FOLD: 3, EPOCH: 7, valid_loss: 0.0193

SEED: 1119, FOLD: 0, EPOCH: 14, train_loss: 0.01902765907563161
SEED: 1119 ,FOLD: 0, EPOCH: 14, valid_loss: 0.019311419791645475
SEED: 1119, FOLD: 0, EPOCH: 15, train_loss: 0.018961713203917378
SEED: 1119 ,FOLD: 0, EPOCH: 15, valid_loss: 0.019139828470846016
SEED: 1119, FOLD: 0, EPOCH: 16, train_loss: 0.018824595337112743
SEED: 1119 ,FOLD: 0, EPOCH: 16, valid_loss: 0.019017116373611823
SEED: 1119, FOLD: 0, EPOCH: 17, train_loss: 0.01871909660057745
SEED: 1119 ,FOLD: 0, EPOCH: 17, valid_loss: 0.018945430715878803
SEED: 1119, FOLD: 0, EPOCH: 18, train_loss: 0.018545195946226948
SEED: 1119 ,FOLD: 0, EPOCH: 18, valid_loss: 0.018942948948178027
SEED: 1119, FOLD: 0, EPOCH: 19, train_loss: 0.018417938787868057
SEED: 1119 ,FOLD: 0, EPOCH: 19, valid_loss: 0.018867167230281565
SEED: 1119, FOLD: 0, EPOCH: 20, train_loss: 0.018219036684520004
SEED: 1119 ,FOLD: 0, EPOCH: 20, valid_loss: 0.018851352027720876
SEED: 1119, FOLD: 0, EPOCH: 21, train_loss: 0.018017235981381458
SEED: 1119 ,FOLD: 0, EPOCH:

SEED: 1119, FOLD: 3, EPOCH: 3, train_loss: 0.021170932205690853
SEED: 1119 ,FOLD: 3, EPOCH: 3, valid_loss: 0.020263576569656532
SEED: 1119, FOLD: 3, EPOCH: 4, train_loss: 0.020481691655257473
SEED: 1119 ,FOLD: 3, EPOCH: 4, valid_loss: 0.019788341803683177
SEED: 1119, FOLD: 3, EPOCH: 5, train_loss: 0.019911673189937206
SEED: 1119 ,FOLD: 3, EPOCH: 5, valid_loss: 0.019516131323244836
SEED: 1119, FOLD: 3, EPOCH: 6, train_loss: 0.019570102834183235
SEED: 1119 ,FOLD: 3, EPOCH: 6, valid_loss: 0.019379828435679276
SEED: 1119, FOLD: 3, EPOCH: 7, train_loss: 0.01937524053821529
SEED: 1119 ,FOLD: 3, EPOCH: 7, valid_loss: 0.01932408691694339
SEED: 1119, FOLD: 3, EPOCH: 8, train_loss: 0.01928944831741029
SEED: 1119 ,FOLD: 3, EPOCH: 8, valid_loss: 0.019319431338873174
SEED: 1119, FOLD: 3, EPOCH: 9, train_loss: 0.019225612984619278
SEED: 1119 ,FOLD: 3, EPOCH: 9, valid_loss: 0.0192488305684593
SEED: 1119, FOLD: 3, EPOCH: 10, train_loss: 0.01921041463704213
SEED: 1119 ,FOLD: 3, EPOCH: 10, valid_loss: 0

SEED: 1303, FOLD: 0, EPOCH: 17, train_loss: 0.01867948928712935
SEED: 1303 ,FOLD: 0, EPOCH: 17, valid_loss: 0.01907795812520716
SEED: 1303, FOLD: 0, EPOCH: 18, train_loss: 0.01857534445066383
SEED: 1303 ,FOLD: 0, EPOCH: 18, valid_loss: 0.018924188075794116
SEED: 1303, FOLD: 0, EPOCH: 19, train_loss: 0.018386608610550564
SEED: 1303 ,FOLD: 0, EPOCH: 19, valid_loss: 0.018868224798805185
SEED: 1303, FOLD: 0, EPOCH: 20, train_loss: 0.018225621702014538
SEED: 1303 ,FOLD: 0, EPOCH: 20, valid_loss: 0.01879824625535144
SEED: 1303, FOLD: 0, EPOCH: 21, train_loss: 0.0180242663330358
SEED: 1303 ,FOLD: 0, EPOCH: 21, valid_loss: 0.018807893618941307
SEED: 1303, FOLD: 0, EPOCH: 22, train_loss: 0.017854232314056244
SEED: 1303 ,FOLD: 0, EPOCH: 22, valid_loss: 0.018752904298404854
SEED: 1303, FOLD: 0, EPOCH: 23, train_loss: 0.01773616261240365
SEED: 1303 ,FOLD: 0, EPOCH: 23, valid_loss: 0.018739088541931577
SEED: 1303, FOLD: 0, EPOCH: 24, train_loss: 0.017676962593543358
SEED: 1303 ,FOLD: 0, EPOCH: 24, 

SEED: 1303, FOLD: 3, EPOCH: 6, train_loss: 0.01956681461761827
SEED: 1303 ,FOLD: 3, EPOCH: 6, valid_loss: 0.019432169799175527
SEED: 1303, FOLD: 3, EPOCH: 7, train_loss: 0.019343144786746605
SEED: 1303 ,FOLD: 3, EPOCH: 7, valid_loss: 0.019358665371934574
SEED: 1303, FOLD: 3, EPOCH: 8, train_loss: 0.01927231543737909
SEED: 1303 ,FOLD: 3, EPOCH: 8, valid_loss: 0.01926756908910142
SEED: 1303, FOLD: 3, EPOCH: 9, train_loss: 0.019238362466727478
SEED: 1303 ,FOLD: 3, EPOCH: 9, valid_loss: 0.01920522666639752
SEED: 1303, FOLD: 3, EPOCH: 10, train_loss: 0.01921104075576084
SEED: 1303 ,FOLD: 3, EPOCH: 10, valid_loss: 0.019247974476052657
SEED: 1303, FOLD: 3, EPOCH: 11, train_loss: 0.019205585446046745
SEED: 1303 ,FOLD: 3, EPOCH: 11, valid_loss: 0.019313448419173557
SEED: 1303, FOLD: 3, EPOCH: 12, train_loss: 0.01914144963349985
SEED: 1303 ,FOLD: 3, EPOCH: 12, valid_loss: 0.019254831079807546
SEED: 1303, FOLD: 3, EPOCH: 13, train_loss: 0.019129766826180443
SEED: 1303 ,FOLD: 3, EPOCH: 13, valid_l

In [80]:
train.to_pickle(f"{INT_DIR}/{NB}-train-score-stack-pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test-score-stack-pred.pkl")

In [81]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))
valid_results = train_targets_scored.drop(columns=target_cols).merge(
    train[['sig_id'] + target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

y_pred = np.minimum(SMAX, np.maximum(SMIN, y_pred))

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]

print("CV log_loss: ", score)

CV log_loss:  0.014318743530085496


In [82]:
# for c in test.columns:
#     if c != "sig_id":
#         test[c] = np.maximum(PMIN, np.minimum(PMAX, test[c]))

sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id'] +
                                                             target_cols],
                                                        on='sig_id',
                                                        how='left').fillna(0)
# sub.to_csv('submission.csv', index=False)
sub.to_csv('submission_2stageNN_with_ns_oldcv_0.01822.csv', index=False)

In [83]:
sub

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001149,0.001465,0.001759,0.011808,0.019730,0.004797,0.002614,0.006231,0.000631,...,0.000904,0.001267,0.002736,0.001985,0.001602,0.000928,0.002326,0.001742,0.002772,0.001710
1,id_001897cda,0.000514,0.002112,0.001632,0.004301,0.002107,0.002263,0.006319,0.012393,0.005252,...,0.001019,0.000657,0.006524,0.000494,0.009789,0.000507,0.003183,0.000933,0.001540,0.002399
2,id_002429b5b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_00276f245,0.001224,0.001149,0.001648,0.014157,0.020908,0.003767,0.003095,0.003870,0.000582,...,0.000742,0.001054,0.004266,0.003588,0.007000,0.000887,0.002099,0.002357,0.000819,0.003220
4,id_0027f1083,0.001674,0.001859,0.002772,0.014973,0.020677,0.004415,0.003962,0.003042,0.000745,...,0.001228,0.000941,0.004282,0.003141,0.002014,0.000936,0.001957,0.001982,0.000786,0.001158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.001128,0.001365,0.001902,0.003931,0.006649,0.002779,0.001911,0.003058,0.000721,...,0.000871,0.005010,0.002657,0.305395,0.006896,0.001572,0.003425,0.001523,0.000795,0.001611
3978,id_ff925dd0d,0.007152,0.002935,0.001411,0.010189,0.020856,0.006774,0.004249,0.004423,0.000803,...,0.000974,0.000985,0.003498,0.002821,0.002662,0.001072,0.005414,0.002896,0.000891,0.002178
3979,id_ffb710450,0.001556,0.001746,0.001836,0.012438,0.020764,0.006126,0.003009,0.004530,0.000657,...,0.000902,0.001329,0.002759,0.004060,0.002292,0.000868,0.002032,0.002114,0.000941,0.001380
3980,id_ffbb869f2,0.002759,0.001698,0.001443,0.018013,0.023979,0.006361,0.007311,0.003258,0.000800,...,0.000742,0.000832,0.003661,0.002239,0.001574,0.000817,0.001875,0.002406,0.001128,0.003748
