- a notebook to save preprocessing model and train/save NN models
- all necessary ouputs are stored in MODEL_DIR = output/kaggle/working/model
    - put those into dataset, and load it from inference notebook

In [1]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
sys.path.append('../input/umaplearn/umap')

%mkdir model
%mkdir interim

from scipy.sparse.csgraph import connected_components
from umap import UMAP
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
import time

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA,FactorAnalysis
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
print(torch.cuda.is_available())
import warnings
# warnings.filterwarnings('ignore')

True


In [2]:
torch.__version__

'1.6.0'

In [3]:
NB = '25'

IS_TRAIN = True
MODEL_DIR = "model" # "../model"
INT_DIR = "interim" # "../interim"

NSEEDS = 5  # 5
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 15
BATCH_SIZE = 256
LEARNING_RATE = 5e-3
WEIGHT_DECAY = 1e-5
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

NFOLDS = 5  # 5

PMIN = 0.0005
PMAX = 0.9995
SMIN = 0.0
SMAX = 1.0

In [4]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [5]:
train_targets_nonscored = train_targets_nonscored.loc[:, train_targets_nonscored.sum() != 0]
print(train_targets_nonscored.shape)

(23814, 332)


In [6]:
# for c in train_targets_scored.columns:
#     if c != "sig_id":
#         train_targets_scored[c] = np.maximum(PMIN, np.minimum(PMAX, train_targets_scored[c]))
for c in train_targets_nonscored.columns:
    if c != "sig_id":
        train_targets_nonscored[c] = np.maximum(PMIN, np.minimum(PMAX, train_targets_nonscored[c]))

In [7]:
print("(nsamples, nfeatures)")
print(train_features.shape)
print(train_targets_scored.shape)
print(train_targets_nonscored.shape)
print(test_features.shape)
print(sample_submission.shape)

(nsamples, nfeatures)
(23814, 876)
(23814, 207)
(23814, 332)
(3982, 876)
(3982, 207)


In [8]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [9]:
def seed_everything(seed=1903):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=1903)

In [10]:
# GENES
n_comp = 90
n_dim = 45

data = pd.concat([pd.DataFrame(train_features[GENES]), pd.DataFrame(test_features[GENES])])

if IS_TRAIN:
    fa = FactorAnalysis(n_components=n_comp, random_state=1903).fit(data[GENES])
    pd.to_pickle(fa, f'{MODEL_DIR}/{NB}_factor_analysis_g.pkl')
    umap = UMAP(n_components=n_dim, random_state=1903).fit(data[GENES])
    pd.to_pickle(umap, f'{MODEL_DIR}/{NB}_umap_g.pkl')
else:
    fa = pd.read_pickle(f'{MODEL_DIR}/{NB}_factor_analysis_g.pkl')
    umap = pd.read_pickle(f'{MODEL_DIR}/{NB}_umap_g.pkl')

data2 = (fa.transform(data[GENES]))
data3 = (umap.transform(data[GENES]))

train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]
train3 = data3[:train_features.shape[0]]
test3 = data3[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'fa_G-{i}' for i in range(n_comp)])
train3 = pd.DataFrame(train3, columns=[f'umap_G-{i}' for i in range(n_dim)])
test2 = pd.DataFrame(test2, columns=[f'fa_G-{i}' for i in range(n_comp)])
test3 = pd.DataFrame(test3, columns=[f'umap_G-{i}' for i in range(n_dim)])

train_features = pd.concat((train_features, train2, train3), axis=1)
test_features = pd.concat((test_features, test2, test3), axis=1)

#CELLS
n_comp = 50
n_dim = 25

data = pd.concat([pd.DataFrame(train_features[CELLS]), pd.DataFrame(test_features[CELLS])])

if IS_TRAIN:
    fa = FactorAnalysis(n_components=n_comp, random_state=1903).fit(data[CELLS])
    pd.to_pickle(fa, f'{MODEL_DIR}/{NB}_factor_analysis_c.pkl')
    umap = UMAP(n_components=n_dim, random_state=1903).fit(data[CELLS])
    pd.to_pickle(umap, f'{MODEL_DIR}/{NB}_umap_c.pkl')
else:
    fa = pd.read_pickle(f'{MODEL_DIR}/{NB}_factor_analysis_c.pkl')
    umap = pd.read_pickle(f'{MODEL_DIR}/{NB}_umap_c.pkl')
    
data2 = (fa.transform(data[CELLS]))
data3 = (umap.fit_transform(data[CELLS]))

train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]
train3 = data3[:train_features.shape[0]]
test3 = data3[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'fa_C-{i}' for i in range(n_comp)])
train3 = pd.DataFrame(train3, columns=[f'umap_C-{i}' for i in range(n_dim)])
test2 = pd.DataFrame(test2, columns=[f'fa_C-{i}' for i in range(n_comp)])
test3 = pd.DataFrame(test3, columns=[f'umap_C-{i}' for i in range(n_dim)])

train_features = pd.concat((train_features, train2, train3), axis=1)
test_features = pd.concat((test_features, test2, test3), axis=1)

# drop_cols = [f'c-{i}' for i in range(n_comp,len(CELLS))]

In [11]:
from sklearn.preprocessing import QuantileTransformer

for col in (GENES + CELLS):
    vec_len = len(train_features[col].values)
    vec_len_test = len(test_features[col].values)
    raw_vec = pd.concat([train_features, test_features])[col].values.reshape(vec_len+vec_len_test, 1)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100, random_state=123, output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer, f'{MODEL_DIR}/{NB}_{col}_quantile_transformer.pkl')
    else:
        transformer = pd.read_pickle(f'{MODEL_DIR}/{NB}_{col}_quantile_transformer.pkl')        

    train_features[col] = transformer.transform(train_features[col].values.reshape(vec_len, 1)).reshape(1, vec_len)[0]
    test_features[col] = transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [12]:
# PCAS = [col for col in train_features.columns if col.startswith('pca_')]
# UMAPS = [col for col in train_features.columns if col.startswith('umap_')]

In [13]:
# from sklearn.preprocessing import PolynomialFeatures
# n_deg = 2

# data = pd.concat([pd.DataFrame(train_features[PCAS]), pd.DataFrame(test_features[PCAS])])
# data2 = (PolynomialFeatures(degree=n_deg, include_bias=False).fit_transform(data[PCAS]))

# # print(data2)
# # data4 = (UMAP(n_components=n_dim, n_neighbors=5, random_state=1903).fit_transform(data[GENES]))
# # data5 = (UMAP(n_components=n_dim, min_dist=0.01, random_state=1903).fit_transform(data[GENES]))

# train2 = data2[:train_features.shape[0]]
# test2 = data2[-test_features.shape[0]:]

# # print(train2.shape)
# train2 = pd.DataFrame(train2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])
# test2 = pd.DataFrame(test2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])

# # drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
# # train_features = pd.concat((train_features, train2, train3, train4, train5), axis=1)
# # test_features = pd.concat((test_features, test2, test3, test4, test5), axis=1)
# train_features = pd.concat((train_features, train2), axis=1)
# test_features = pd.concat((test_features, test2), axis=1)


# data = pd.concat([pd.DataFrame(train_features[UMAPS]), pd.DataFrame(test_features[UMAPS])])
# data2 = (PolynomialFeatures(degree=n_deg, include_bias=False).fit_transform(data[UMAPS]))

# # print(data2)
# # data4 = (UMAP(n_components=n_dim, n_neighbors=5, random_state=1903).fit_transform(data[GENES]))
# # data5 = (UMAP(n_components=n_dim, min_dist=0.01, random_state=1903).fit_transform(data[GENES]))

# train2 = data2[:train_features.shape[0]]
# test2 = data2[-test_features.shape[0]:]

# # print(train2.shape)
# train2 = pd.DataFrame(train2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])
# test2 = pd.DataFrame(test2, columns=[f'poly_C-{i}' for i in range(train2.shape[1])])

# # drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
# # train_features = pd.concat((train_features, train2, train3, train4, train5), axis=1)
# # test_features = pd.concat((test_features, test2, test3, test4, test5), axis=1)
# train_features = pd.concat((train_features, train2), axis=1)
# test_features = pd.concat((test_features, test2), axis=1)

In [14]:
print(train_features.shape)
print(test_features.shape)

(23814, 1086)
(3982, 1086)


In [15]:
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train_features.merge(train_targets_nonscored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_nonscored.columns]

In [16]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

In [17]:
print(target.shape)
print(train_features.shape)
print(test_features.shape)
print(train.shape)
print(test.shape)

(21948, 332)
(23814, 1086)
(3982, 1086)
(21948, 1416)
(3624, 1085)


In [18]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [19]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds



Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,vasopressin_receptor_antagonist,ve-cadherin_antagonist,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,xanthine_oxidase_inhibitor,xiap_inhibitor,kfold
0,id_000644bb2,24,D1,1.146806,0.902075,-0.418339,-0.961202,-0.254770,-1.021300,-1.369236,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
1,id_000779bfc,72,D1,0.128824,0.676862,0.274345,0.090495,1.208863,0.688965,0.316734,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
2,id_000a6266a,48,D1,0.790372,0.939951,1.428097,-0.121817,-0.002067,1.495091,0.238763,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
3,id_0015fd391,48,D1,-0.729866,-0.277163,-0.441200,0.766612,2.347817,-0.862761,-2.308829,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
4,id_001626bd3,72,D2,-0.444558,-0.481202,0.974729,0.977467,1.468304,-0.874772,-0.372682,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.247623,-1.231184,0.221572,-0.354096,-0.332073,0.570635,-0.150125,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21944,id_fffb1ceed,24,D2,0.217613,-0.027031,-0.237430,-0.787215,-0.677817,0.919474,0.742866,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
21945,id_fffb70c0c,24,D2,-1.914666,0.581880,-0.588706,1.303439,-1.009079,0.852202,-0.302814,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21946,id_fffcb9e7c,24,D1,0.826302,0.411235,0.433297,0.307575,1.075324,-0.024425,0.051483,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [20]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1416)
(21948, 1417)
(3624, 1085)
(21948, 332)
(3982, 207)


In [21]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float),
            'y' : torch.tensor(self.targets[idx, :], dtype=torch.float)            
        }
        return dct
    
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct

In [22]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
#         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds

In [23]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.15)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.3)
        self.dense2 = nn.Linear(hidden_size, hidden_size)
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.25)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.leaky_relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x

In [24]:
def process_data(data):
    
    data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
#     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
#     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})

# --------------------- Normalize ---------------------
#     for col in GENES:
#         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))
    
#     for col in CELLS:
#         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))
    
#--------------------- Removing Skewness ---------------------
#     for col in GENES + CELLS:
#         if(abs(data[col].skew()) > 0.75):
            
#             if(data[col].skew() < 0): # neg-skewness
#                 data[col] = data[col].max() - data[col] + 1
#                 data[col] = np.sqrt(data[col])
            
#             else:
#                 data[col] = np.sqrt(data[col])
    
    return data

In [25]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

1087

In [26]:
num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=2048
# hidden_size=4096
# hidden_size=9192

In [27]:
def run_training(fold, seed):
    
    seed_everything(seed)
    
    train = process_data(folds)
    test_ = process_data(test)
    
    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index
    
    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)
    
    x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.2, div_factor=1e3, 
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    
    loss_fn = nn.BCEWithLogitsLoss()
    
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf
    
    for epoch in range(EPOCHS):
        
        train_loss = train_fn(model, optimizer,scheduler,loss_fn,trainloader, DEVICE)
        print(f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")
        
        if valid_loss < best_loss:
            
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), f"model/{NB}-nonscored1-SEED{seed}-FOLD{fold}_.pth")
        
        elif(EARLY_STOP == True):
            
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
            
    
    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    
    model.load_state_dict(torch.load(f"model/{NB}-nonscored1-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)
    
    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    return oof, predictions

In [28]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [29]:
SEED = [940, 1513, 1269,1392,1119,1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:
    
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

print(oof.shape)
print(predictions.shape)

SEED: 940, FOLD: 0, EPOCH: 0, train_loss: 0.6479452919700871
SEED: 940 ,FOLD: 0, EPOCH: 0, valid_loss: 0.24815115415387684
SEED: 940, FOLD: 0, EPOCH: 1, train_loss: 0.03601686417570581
SEED: 940 ,FOLD: 0, EPOCH: 1, valid_loss: 0.009409155800110765
SEED: 940, FOLD: 0, EPOCH: 2, train_loss: 0.00938437124579281
SEED: 940 ,FOLD: 0, EPOCH: 2, valid_loss: 0.009297710274242692
SEED: 940, FOLD: 0, EPOCH: 3, train_loss: 0.00926585112819853
SEED: 940 ,FOLD: 0, EPOCH: 3, valid_loss: 0.009222983330902126
SEED: 940, FOLD: 0, EPOCH: 4, train_loss: 0.009361455783895824
SEED: 940 ,FOLD: 0, EPOCH: 4, valid_loss: 0.009119504545297887
SEED: 940, FOLD: 0, EPOCH: 5, train_loss: 0.008960283104924189
SEED: 940 ,FOLD: 0, EPOCH: 5, valid_loss: 0.009035923569980595
SEED: 940, FOLD: 0, EPOCH: 6, train_loss: 0.008879928458212078
SEED: 940 ,FOLD: 0, EPOCH: 6, valid_loss: 0.00893180077481601
SEED: 940, FOLD: 0, EPOCH: 7, train_loss: 0.008805801126889992
SEED: 940 ,FOLD: 0, EPOCH: 7, valid_loss: 0.008888928892297877

In [30]:
train.to_pickle(f"{INT_DIR}/{NB}-train_nonscore_pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test_nonscore_pred.pkl")

In [31]:
len(target_cols)

331

In [32]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))
valid_results = train_targets_nonscored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_nonscored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)

CV log_loss:  0.004823565553260081


CV log_loss:  0.014761779358699672
CV log_loss:  0.014519859174255039
CV log_loss:  0.014525173864593479
CV log_loss:  0.014354930596928602 # 3 umap features
CV log_loss:  0.014353604854355429 # more umap features
CV log_loss:  0.01436484670778641 # more hidden nodes

In [33]:

EPOCHS = 25
# NFOLDS = 5


In [34]:
# sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
# sub.to_csv('submission.csv', index=False)

In [35]:
nonscored_target = [c for c in train[train_targets_nonscored.columns] if c != "sig_id"]

In [36]:
nonscored_target

['abc_transporter_expression_enhancer',
 'abl_inhibitor',
 'ace_inhibitor',
 'acetylcholine_release_enhancer',
 'adenosine_kinase_inhibitor',
 'adenylyl_cyclase_inhibitor',
 'age_inhibitor',
 'alcohol_dehydrogenase_inhibitor',
 'aldehyde_dehydrogenase_activator',
 'aldose_reductase_inhibitor',
 'ampk_inhibitor',
 'androgen_biosynthesis_inhibitor',
 'angiotensin_receptor_agonist',
 'antacid',
 'anthelmintic',
 'antipruritic',
 'antirheumatic_drug',
 'antiseptic',
 'antispasmodic',
 'antithyroid_agent',
 'antitussive',
 'anxiolytic',
 'ap_inhibitor',
 'apoptosis_inhibitor',
 'arf_inhibitor',
 'aryl_hydrocarbon_receptor_agonist',
 'aryl_hydrocarbon_receptor_antagonist',
 'aspartic_protease_inhibitor',
 'atherogenesis_inhibitor',
 'atherosclerosis_formation_inhibitor',
 'atp-sensitive_potassium_channel_agonist',
 'atp-sensitive_potassium_channel_inhibitor',
 'atp_channel_blocker',
 'atp_citrase_lyase_inhibitor',
 'autophagy_inducer',
 'axl_kinase_inhibitor',
 'bacterial_atpase_inhibitor',


In [37]:
train = pd.read_pickle(f"{INT_DIR}/{NB}-train_nonscore_pred.pkl")
test = pd.read_pickle(f"{INT_DIR}/{NB}-test_nonscore_pred.pkl")

In [38]:
# use nonscored target in the given file as feature
# if comment out below, use predicted nonscored target
# train = train.drop(nonscored_target, axis=1)
# train = train.merge(train_targets_nonscored, on="sig_id")
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(train_targets_scored, on='sig_id')
# train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
# test = test[test['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_scored.columns]

In [39]:
# from sklearn.preprocessing import QuantileTransformer

for col in (nonscored_target):

    vec_len = len(train[col].values)
    vec_len_test = len(test[col].values)
    raw_vec = train[col].values.reshape(vec_len, 1)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer, f"{MODEL_DIR}/{NB}_{col}_quantile_nonscored.pkl")
    else:
        transformer = pd.read_pickle(f"{MODEL_DIR}/{NB}_{col}_quantile_nonscored.pkl")

    train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test[col] = transformer.transform(test[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [40]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [41]:
train

Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,24,D1,1.146806,0.902075,-0.418339,-0.961202,-0.254770,-1.021300,-1.369236,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,72,D1,0.128824,0.676862,0.274345,0.090495,1.208863,0.688965,0.316734,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,48,D1,0.790372,0.939951,1.428097,-0.121817,-0.002067,1.495091,0.238763,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,48,D1,-0.729866,-0.277163,-0.441200,0.766612,2.347817,-0.862761,-2.308829,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,72,D2,-0.444558,-0.481202,0.974729,0.977467,1.468304,-0.874772,-0.372682,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.247623,-1.231184,0.221572,-0.354096,-0.332073,0.570635,-0.150125,...,0,0,0,0,0,0,0,0,0,0
21944,id_fffb1ceed,24,D2,0.217613,-0.027031,-0.237430,-0.787215,-0.677817,0.919474,0.742866,...,0,0,0,0,0,0,0,0,0,0
21945,id_fffb70c0c,24,D2,-1.914666,0.581880,-0.588706,1.303439,-1.009079,0.852202,-0.302814,...,0,0,0,0,0,0,0,0,0,0
21946,id_fffcb9e7c,24,D1,0.826302,0.411235,0.433297,0.307575,1.075324,-0.024425,0.051483,...,0,0,0,0,0,0,0,0,0,0


In [42]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds



Unnamed: 0,sig_id,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,24,D1,1.146806,0.902075,-0.418339,-0.961202,-0.254770,-1.021300,-1.369236,...,0,0,0,0,0,0,0,0,0,4
1,id_000779bfc,72,D1,0.128824,0.676862,0.274345,0.090495,1.208863,0.688965,0.316734,...,0,0,0,0,0,0,0,0,0,4
2,id_000a6266a,48,D1,0.790372,0.939951,1.428097,-0.121817,-0.002067,1.495091,0.238763,...,0,0,0,0,0,0,0,0,0,4
3,id_0015fd391,48,D1,-0.729866,-0.277163,-0.441200,0.766612,2.347817,-0.862761,-2.308829,...,0,0,0,0,0,0,0,0,0,4
4,id_001626bd3,72,D2,-0.444558,-0.481202,0.974729,0.977467,1.468304,-0.874772,-0.372682,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.247623,-1.231184,0.221572,-0.354096,-0.332073,0.570635,-0.150125,...,0,0,0,0,0,0,0,0,0,4
21944,id_fffb1ceed,24,D2,0.217613,-0.027031,-0.237430,-0.787215,-0.677817,0.919474,0.742866,...,0,0,0,0,0,0,0,0,0,0
21945,id_fffb70c0c,24,D2,-1.914666,0.581880,-0.588706,1.303439,-1.009079,0.852202,-0.302814,...,0,0,0,0,0,0,0,0,0,1
21946,id_fffcb9e7c,24,D1,0.826302,0.411235,0.433297,0.307575,1.075324,-0.024425,0.051483,...,0,0,0,0,0,0,0,0,0,1


In [43]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 1622)
(21948, 1623)
(3624, 1416)
(21948, 207)
(3982, 207)


In [44]:
def process_data(data):
    
    data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
#     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
#     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})

# --------------------- Normalize ---------------------
#     for col in GENES:
#         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))
    
#     for col in CELLS:
#         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))
    
#--------------------- Removing Skewness ---------------------
#     for col in GENES + CELLS:
#         if(abs(data[col].skew()) > 0.75):
            
#             if(data[col].skew() < 0): # neg-skewness
#                 data[col] = data[col].max() - data[col] + 1
#                 data[col] = np.sqrt(data[col])
            
#             else:
#                 data[col] = np.sqrt(data[col])
    
    return data

In [45]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

1418

In [46]:
num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=2048
# hidden_size=4096
# hidden_size=9192

In [47]:
def run_training(fold, seed):
    
    seed_everything(seed)
    
    train = process_data(folds)
    test_ = process_data(test)
    
    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index
    
    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)
    
    x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.2, div_factor=1e3, 
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    
    loss_fn = nn.BCEWithLogitsLoss()
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf
    
    for epoch in range(EPOCHS):
        
        train_loss = train_fn(model, optimizer,scheduler,loss_fn,trainloader, DEVICE)
        print(f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")
        
        if valid_loss < best_loss:
            
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), f"model/{NB}-scored1-SEED{seed}-FOLD{fold}_.pth")
        
        elif(EARLY_STOP == True):
            
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
            
    
    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,

    )
    
    model.load_state_dict(torch.load(f"model/{NB}-scored1-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)
    
    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    return oof, predictions

In [48]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [49]:
SEED = [940, 1513, 1269,1392,1119,1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:
    
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

SEED: 940, FOLD: 0, EPOCH: 0, train_loss: 0.7134326108987781
SEED: 940 ,FOLD: 0, EPOCH: 0, valid_loss: 0.6106544733047485
SEED: 940, FOLD: 0, EPOCH: 1, train_loss: 0.20464910828656907
SEED: 940 ,FOLD: 0, EPOCH: 1, valid_loss: 0.023062731967204146
SEED: 940, FOLD: 0, EPOCH: 2, train_loss: 0.020830361496495163
SEED: 940 ,FOLD: 0, EPOCH: 2, valid_loss: 0.019220005669113662
SEED: 940, FOLD: 0, EPOCH: 3, train_loss: 0.018650088162309883
SEED: 940 ,FOLD: 0, EPOCH: 3, valid_loss: 0.018326523765507672
SEED: 940, FOLD: 0, EPOCH: 4, train_loss: 0.01793761108664499
SEED: 940 ,FOLD: 0, EPOCH: 4, valid_loss: 0.01863717448173298
SEED: 940, FOLD: 0, EPOCH: 5, train_loss: 0.017398776554003143
SEED: 940 ,FOLD: 0, EPOCH: 5, valid_loss: 0.0172521504573524
SEED: 940, FOLD: 0, EPOCH: 6, train_loss: 0.016641485558795757
SEED: 940 ,FOLD: 0, EPOCH: 6, valid_loss: 0.016687922231439088
SEED: 940, FOLD: 0, EPOCH: 7, train_loss: 0.016411113390780014
SEED: 940 ,FOLD: 0, EPOCH: 7, valid_loss: 0.01761202183034685
SE

In [50]:
train.to_pickle(f"{INT_DIR}/{NB}-train-score-pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test-score-pred.pkl")

In [51]:
len(target_cols)

206

In [52]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))

valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)

CV log_loss:  0.014299113941268274


- CV log_loss:  0.014761779358699672
- CV log_loss:  0.014519859174255039
- CV log_loss:  0.014525173864593479
- CV log_loss:  0.014354930596928602 # 3 umap features
- CV log_loss:  0.014353604854355429 # more umap features
- CV log_loss:  0.01436484670778641 # more hidden nodes
- CV log_loss:  0.014344688083211073
  - using predicted unscored targets as feature 
- CV log_loss:  0.013368097791623873
  - using given unscored targets as feature
  - bad in public lb
- CV log_loss:  0.01434373547175235
  - rankgauss predicted unscored targets
- CV log_loss:  0.014346100008158216
  - unscored targets pca/umap
- CV log_loss:  0.014328486629791769
  - NFOLDS=10, Epoch=20
- CV log_loss:  0.014299741080816082
  - NFOLDS=10, Epoch=20, 25
- CV log_loss:  0.014311301224480969
  - NFOLDS=10, Epoch=25
- CV log_loss:  0.01429269446076626
  - NFOLDS=10, Epoch=15, 25

In [53]:
# train = pd.read_pickle(f"../interim/23-train-score-pred.pkl")
# test = pd.read_pickle(f"../interim/23-test-score-pred.pkl")

In [54]:
train = pd.read_pickle(f"{INT_DIR}/{NB}-train-score-pred.pkl")
test = pd.read_pickle(f"{INT_DIR}/{NB}-test-score-pred.pkl")

In [55]:
EPOCHS = 25
# NFOLDS = 5

In [56]:
PMIN = 0.0005
PMAX = 0.9995
for c in train_targets_scored.columns:
    if c != "sig_id":
        train_targets_scored[c] = np.maximum(PMIN, np.minimum(PMAX, train_targets_scored[c]))

In [57]:
train_targets_scored.columns

Index(['sig_id', '5-alpha_reductase_inhibitor', '11-beta-hsd1_inhibitor',
       'acat_inhibitor', 'acetylcholine_receptor_agonist',
       'acetylcholine_receptor_antagonist', 'acetylcholinesterase_inhibitor',
       'adenosine_receptor_agonist', 'adenosine_receptor_antagonist',
       'adenylyl_cyclase_activator',
       ...
       'tropomyosin_receptor_kinase_inhibitor', 'trpv_agonist',
       'trpv_antagonist', 'tubulin_inhibitor', 'tyrosine_kinase_inhibitor',
       'ubiquitin_specific_protease_inhibitor', 'vegfr_inhibitor', 'vitamin_b',
       'vitamin_d_receptor_agonist', 'wnt_inhibitor'],
      dtype='object', length=207)

In [58]:
train = train[train_targets_scored.columns]
train.columns = [c + "_pred" if (c != 'sig_id' and c in train_targets_scored.columns) else c for c in train.columns]

In [59]:
test = test[train_targets_scored.columns]
test.columns = [c + "_pred" if (c != 'sig_id' and c in train_targets_scored.columns) else c for c in test.columns]

In [60]:
train

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,tropomyosin_receptor_kinase_inhibitor_pred,trpv_agonist_pred,trpv_antagonist_pred,tubulin_inhibitor_pred,tyrosine_kinase_inhibitor_pred,ubiquitin_specific_protease_inhibitor_pred,vegfr_inhibitor_pred,vitamin_b_pred,vitamin_d_receptor_agonist_pred,wnt_inhibitor_pred
0,id_000644bb2,0.000650,0.000407,0.001056,0.014523,0.046037,0.005095,0.004229,0.003785,0.000288,...,0.000428,0.000403,0.002018,0.001835,0.000876,0.000351,0.000769,0.001819,0.000285,0.001270
1,id_000779bfc,0.000470,0.000582,0.001193,0.016051,0.015803,0.004169,0.002928,0.003964,0.000360,...,0.000623,0.001061,0.001272,0.003519,0.001297,0.000291,0.001067,0.002909,0.000723,0.001952
2,id_000a6266a,0.001245,0.002134,0.001011,0.003338,0.009875,0.002004,0.001475,0.006138,0.000762,...,0.000306,0.001553,0.002890,0.005609,0.016152,0.000584,0.044607,0.004205,0.000281,0.001548
3,id_0015fd391,0.000305,0.000633,0.001642,0.010402,0.010414,0.001913,0.002580,0.002211,0.000266,...,0.000572,0.002320,0.001868,0.038662,0.003058,0.000605,0.001781,0.001963,0.000244,0.000547
4,id_001626bd3,0.000325,0.000588,0.002882,0.011430,0.010408,0.001821,0.005062,0.002897,0.000520,...,0.000811,0.001467,0.001869,0.002159,0.001477,0.000513,0.001180,0.001949,0.000463,0.001167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,0.003864,0.002146,0.000467,0.005645,0.033893,0.003897,0.001029,0.003164,0.000197,...,0.000412,0.000283,0.003942,0.000471,0.000543,0.000675,0.004110,0.000812,0.000521,0.000377
21944,id_fffb1ceed,0.001501,0.001076,0.001139,0.011063,0.033664,0.004069,0.002337,0.004457,0.000299,...,0.000452,0.000299,0.002046,0.000932,0.001383,0.000313,0.001904,0.001251,0.000460,0.001185
21945,id_fffb70c0c,0.000865,0.000797,0.001951,0.003430,0.004149,0.003843,0.001893,0.011016,0.000831,...,0.000403,0.001359,0.002742,0.000531,0.008149,0.000254,0.002202,0.001224,0.006601,0.004503
21946,id_fffcb9e7c,0.000276,0.000295,0.000341,0.002052,0.002650,0.000875,0.000743,0.001091,0.000100,...,0.000130,0.000249,0.000701,0.001685,0.002266,0.000156,0.001131,0.000391,0.000137,0.000461


In [61]:
# use nonscored target in the given file as feature
# if comment out below, use predicted nonscored target
# train = train.drop(nonscored_target, axis=1)
# train = train.merge(train_targets_nonscored, on="sig_id")
# train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(train_targets_scored, on='sig_id')
# train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
# test = test[test['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# target = train[train_targets_scored.columns]
target = train[train_targets_scored.columns]

In [62]:
# train["cp_time"] = train_features[train_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_time"]
# train["cp_dose"] = train_features[train_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_dose"]
# test["cp_time"] = test_features[test_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_time"]
# test["cp_dose"] = test_features[test_features["cp_type"]=="trt_cp"].reset_index(drop=True)["cp_dose"]

In [63]:
from sklearn.preprocessing import QuantileTransformer

scored_target_pred = [c + "_pred" for c in train_targets_scored.columns if c != 'sig_id']

for col in (scored_target_pred):

#     transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
    vec_len = len(train[col].values)
    vec_len_test = len(test[col].values)
    raw_vec = train[col].values.reshape(vec_len, 1)
#     transformer.fit(raw_vec)
    if IS_TRAIN:
        transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
        transformer.fit(raw_vec)
        pd.to_pickle(transformer, f"{MODEL_DIR}/{NB}_{col}_quantile_scored.pkl")
    else:
        transformer = pd.read_pickle(f"{MODEL_DIR}/{NB}_{col}_quantile_scored.pkl")

    train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test[col] = transformer.transform(test[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [64]:
# train = train.drop('cp_type', axis=1)
# test = test.drop('cp_type', axis=1)

In [65]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

In [66]:
train

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0.263800,-0.686643,-0.035590,0.852882,1.956241,0.714940,1.089614,0.156629,0.208169,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
1,id_000779bfc,-0.172458,-0.211519,0.186410,0.985108,0.135584,0.378215,0.605459,0.220750,0.576051,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
2,id_000a6266a,1.077243,1.449984,-0.120273,-0.697052,-0.349476,-0.517079,-0.257762,0.825560,1.587220,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
3,id_0015fd391,-0.716649,-0.086812,0.781953,0.418828,-0.307002,-0.553324,0.433879,-0.533703,0.076889,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
4,id_001626bd3,-0.647452,-0.197038,1.767731,0.538918,-0.307476,-0.589463,1.314964,-0.210142,1.128933,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,2.161669,1.457933,-1.363605,-0.255803,1.383198,0.270271,-0.631043,-0.096340,-0.455718,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
21944,id_fffb1ceed,1.292156,0.637390,0.100840,0.497966,1.368214,0.338321,0.300445,0.383632,0.269661,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
21945,id_fffb70c0c,0.644685,0.244274,1.091844,-0.675820,-0.859166,0.248729,0.032107,1.555131,1.674891,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
21946,id_fffcb9e7c,-0.816541,-1.027311,-1.831150,-1.051518,-1.187846,-1.045939,-0.918460,-1.097907,-1.324022,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005


In [67]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds



Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,0.263800,-0.686643,-0.035590,0.852882,1.956241,0.714940,1.089614,0.156629,0.208169,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
1,id_000779bfc,-0.172458,-0.211519,0.186410,0.985108,0.135584,0.378215,0.605459,0.220750,0.576051,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
2,id_000a6266a,1.077243,1.449984,-0.120273,-0.697052,-0.349476,-0.517079,-0.257762,0.825560,1.587220,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
3,id_0015fd391,-0.716649,-0.086812,0.781953,0.418828,-0.307002,-0.553324,0.433879,-0.533703,0.076889,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
4,id_001626bd3,-0.647452,-0.197038,1.767731,0.538918,-0.307476,-0.589463,1.314964,-0.210142,1.128933,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,2.161669,1.457933,-1.363605,-0.255803,1.383198,0.270271,-0.631043,-0.096340,-0.455718,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21944,id_fffb1ceed,1.292156,0.637390,0.100840,0.497966,1.368214,0.338321,0.300445,0.383632,0.269661,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21945,id_fffb70c0c,0.644685,0.244274,1.091844,-0.675820,-0.859166,0.248729,0.032107,1.555131,1.674891,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21946,id_fffcb9e7c,-0.816541,-1.027311,-1.831150,-1.051518,-1.187846,-1.045939,-0.918460,-1.097907,-1.324022,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [68]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21948, 413)
(21948, 414)
(3624, 207)
(21948, 207)
(3982, 207)


In [69]:
folds

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,0.263800,-0.686643,-0.035590,0.852882,1.956241,0.714940,1.089614,0.156629,0.208169,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
1,id_000779bfc,-0.172458,-0.211519,0.186410,0.985108,0.135584,0.378215,0.605459,0.220750,0.576051,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
2,id_000a6266a,1.077243,1.449984,-0.120273,-0.697052,-0.349476,-0.517079,-0.257762,0.825560,1.587220,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
3,id_0015fd391,-0.716649,-0.086812,0.781953,0.418828,-0.307002,-0.553324,0.433879,-0.533703,0.076889,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
4,id_001626bd3,-0.647452,-0.197038,1.767731,0.538918,-0.307476,-0.589463,1.314964,-0.210142,1.128933,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,2.161669,1.457933,-1.363605,-0.255803,1.383198,0.270271,-0.631043,-0.096340,-0.455718,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21944,id_fffb1ceed,1.292156,0.637390,0.100840,0.497966,1.368214,0.338321,0.300445,0.383632,0.269661,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21945,id_fffb70c0c,0.644685,0.244274,1.091844,-0.675820,-0.859166,0.248729,0.032107,1.555131,1.674891,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21946,id_fffcb9e7c,-0.816541,-1.027311,-1.831150,-1.051518,-1.187846,-1.045939,-0.918460,-1.097907,-1.324022,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [70]:
def process_data(data):
    
#     data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
#     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2, 0:0, 1:1, 2:2})
#     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1, 0:0, 1:1})

# --------------------- Normalize ---------------------
#     for col in GENES:
#         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))
    
#     for col in CELLS:
#         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))
    
#--------------------- Removing Skewness ---------------------
#     for col in GENES + CELLS:
#         if(abs(data[col].skew()) > 0.75):
            
#             if(data[col].skew() < 0): # neg-skewness
#                 data[col] = data[col].max() - data[col] + 1
#                 data[col] = np.sqrt(data[col])
            
#             else:
#                 data[col] = np.sqrt(data[col])
    
    return data

In [71]:
feature_cols = [c for c in folds.columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

206

In [72]:
feature_cols

['5-alpha_reductase_inhibitor_pred',
 '11-beta-hsd1_inhibitor_pred',
 'acat_inhibitor_pred',
 'acetylcholine_receptor_agonist_pred',
 'acetylcholine_receptor_antagonist_pred',
 'acetylcholinesterase_inhibitor_pred',
 'adenosine_receptor_agonist_pred',
 'adenosine_receptor_antagonist_pred',
 'adenylyl_cyclase_activator_pred',
 'adrenergic_receptor_agonist_pred',
 'adrenergic_receptor_antagonist_pred',
 'akt_inhibitor_pred',
 'aldehyde_dehydrogenase_inhibitor_pred',
 'alk_inhibitor_pred',
 'ampk_activator_pred',
 'analgesic_pred',
 'androgen_receptor_agonist_pred',
 'androgen_receptor_antagonist_pred',
 'anesthetic_-_local_pred',
 'angiogenesis_inhibitor_pred',
 'angiotensin_receptor_antagonist_pred',
 'anti-inflammatory_pred',
 'antiarrhythmic_pred',
 'antibiotic_pred',
 'anticonvulsant_pred',
 'antifungal_pred',
 'antihistamine_pred',
 'antimalarial_pred',
 'antioxidant_pred',
 'antiprotozoal_pred',
 'antiviral_pred',
 'apoptosis_stimulant_pred',
 'aromatase_inhibitor_pred',
 'atm_kina

In [73]:
folds

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor_pred,11-beta-hsd1_inhibitor_pred,acat_inhibitor_pred,acetylcholine_receptor_agonist_pred,acetylcholine_receptor_antagonist_pred,acetylcholinesterase_inhibitor_pred,adenosine_receptor_agonist_pred,adenosine_receptor_antagonist_pred,adenylyl_cyclase_activator_pred,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,0.263800,-0.686643,-0.035590,0.852882,1.956241,0.714940,1.089614,0.156629,0.208169,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,2
1,id_000779bfc,-0.172458,-0.211519,0.186410,0.985108,0.135584,0.378215,0.605459,0.220750,0.576051,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
2,id_000a6266a,1.077243,1.449984,-0.120273,-0.697052,-0.349476,-0.517079,-0.257762,0.825560,1.587220,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
3,id_0015fd391,-0.716649,-0.086812,0.781953,0.418828,-0.307002,-0.553324,0.433879,-0.533703,0.076889,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3
4,id_001626bd3,-0.647452,-0.197038,1.767731,0.538918,-0.307476,-0.589463,1.314964,-0.210142,1.128933,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,2.161669,1.457933,-1.363605,-0.255803,1.383198,0.270271,-0.631043,-0.096340,-0.455718,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21944,id_fffb1ceed,1.292156,0.637390,0.100840,0.497966,1.368214,0.338321,0.300445,0.383632,0.269661,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0
21945,id_fffb70c0c,0.644685,0.244274,1.091844,-0.675820,-0.859166,0.248729,0.032107,1.555131,1.674891,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,4
21946,id_fffcb9e7c,-0.816541,-1.027311,-1.831150,-1.051518,-1.187846,-1.045939,-0.918460,-1.097907,-1.324022,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,3


In [74]:
EPOCHS = 25
num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=1024
# hidden_size=4096
# hidden_size=9192

In [75]:
def run_training(fold, seed):
    
    seed_everything(seed)
    
    train = (folds)
    test_ = (test)
    
    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index
    
    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)
    
    x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.2, div_factor=1e3, 
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    
    loss_fn = nn.BCEWithLogitsLoss()
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf
    
    
    for epoch in range(EPOCHS):
        
        train_loss = train_fn(model, optimizer,scheduler,loss_fn, trainloader, DEVICE)
        print(f"SEED: {seed}, FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(f"SEED: {seed} ,FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")
        
        if valid_loss < best_loss:
            
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), f"model/{NB}-scored2-SEED{seed}-FOLD{fold}_.pth")
        
        elif(EARLY_STOP == True):
            
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
            
    
    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    model.load_state_dict(torch.load(f"model/{NB}-scored2-SEED{seed}-FOLD{fold}_.pth"))
    model.to(DEVICE)
    
 #   if not IS_TRAIN:
   # valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
   # oof[val_idx] = valid_preds     
    
    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    return oof, predictions

In [76]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [77]:
SEED = [940, 1513, 1269,1392,1119,1303]  #<-- Update
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_start = time.time()

for seed in SEED:
    
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)
    print(f"elapsed time: {time.time() - time_start}")

train[target_cols] = oof
test[target_cols] = predictions

SEED: 940, FOLD: 0, EPOCH: 0, train_loss: 0.7181326176809228
SEED: 940 ,FOLD: 0, EPOCH: 0, valid_loss: 0.6357580423355103
SEED: 940, FOLD: 0, EPOCH: 1, train_loss: 0.2647488739719425
SEED: 940 ,FOLD: 0, EPOCH: 1, valid_loss: 0.03140339545077748
SEED: 940, FOLD: 0, EPOCH: 2, train_loss: 0.024741355140787968
SEED: 940 ,FOLD: 0, EPOCH: 2, valid_loss: 0.021632916397518583
SEED: 940, FOLD: 0, EPOCH: 3, train_loss: 0.021218745793769325
SEED: 940 ,FOLD: 0, EPOCH: 3, valid_loss: 0.02018266771402624
SEED: 940, FOLD: 0, EPOCH: 4, train_loss: 0.02037945794670478
SEED: 940 ,FOLD: 0, EPOCH: 4, valid_loss: 0.019939849774042766
SEED: 940, FOLD: 0, EPOCH: 5, train_loss: 0.019866417310592056
SEED: 940 ,FOLD: 0, EPOCH: 5, valid_loss: 0.019616762383116618
SEED: 940, FOLD: 0, EPOCH: 6, train_loss: 0.01954017350099225
SEED: 940 ,FOLD: 0, EPOCH: 6, valid_loss: 0.019461500458419323
SEED: 940, FOLD: 0, EPOCH: 7, train_loss: 0.01933725372604702
SEED: 940 ,FOLD: 0, EPOCH: 7, valid_loss: 0.0194080283658372
SEED:

In [78]:
train.to_pickle(f"{INT_DIR}/{NB}-train-score-stack-pred.pkl")
test.to_pickle(f"{INT_DIR}/{NB}-test-score-stack-pred.pkl")

In [79]:
train[target_cols] = np.maximum(PMIN, np.minimum(PMAX, train[target_cols]))
valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_true = y_true > 0.5
y_pred = valid_results[target_cols].values

y_pred = np.minimum(SMAX, np.maximum(SMIN, y_pred))

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)

CV log_loss:  0.014311818899129448


In [80]:
# for c in test.columns:
#     if c != "sig_id":
#         test[c] = np.maximum(PMIN, np.minimum(PMAX, test[c]))

sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission_kibuna_nn.csv', index=False)

In [81]:
sub

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001229,0.001587,0.002212,0.011637,0.019790,0.004841,0.002894,0.005125,0.000671,...,0.000972,0.001236,0.002852,0.001953,0.001773,0.000963,0.001984,0.001903,0.004005,0.001596
1,id_001897cda,0.000459,0.001623,0.001309,0.003625,0.001444,0.001768,0.003833,0.007138,0.003316,...,0.000744,0.000494,0.006207,0.000485,0.015149,0.000456,0.004326,0.000790,0.001170,0.002056
2,id_002429b5b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_00276f245,0.001371,0.001614,0.001500,0.009565,0.018549,0.005714,0.003507,0.004230,0.000601,...,0.000741,0.001247,0.004051,0.005778,0.005729,0.000969,0.002270,0.002191,0.000885,0.004552
4,id_0027f1083,0.001542,0.001852,0.002821,0.016203,0.023527,0.004723,0.003847,0.002988,0.000666,...,0.001092,0.000919,0.004059,0.002832,0.001974,0.001008,0.001774,0.001788,0.000818,0.001200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.001295,0.001802,0.001788,0.005092,0.008787,0.003685,0.002049,0.003876,0.000822,...,0.000969,0.004864,0.003405,0.232095,0.006727,0.001556,0.004299,0.001536,0.000915,0.001424
3978,id_ff925dd0d,0.007792,0.002940,0.001483,0.011931,0.019176,0.006195,0.004788,0.004155,0.001043,...,0.001095,0.001045,0.003679,0.003319,0.003556,0.001231,0.006432,0.002863,0.001017,0.002394
3979,id_ffb710450,0.001687,0.001814,0.001956,0.012075,0.022226,0.007054,0.003338,0.004425,0.000723,...,0.000958,0.001348,0.003200,0.004727,0.002361,0.000917,0.002261,0.002237,0.001003,0.001447
3980,id_ffbb869f2,0.002892,0.001610,0.001380,0.020515,0.021964,0.006312,0.008292,0.003225,0.000773,...,0.000808,0.000770,0.003498,0.002070,0.001511,0.000872,0.002085,0.002186,0.001107,0.003523
