## Process the data

#### 1.Split the data

In [1]:
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np

In [2]:
Path = './'
df = pd.read_csv(Path + 'train_targets_scored.csv')

In [3]:
df.head(2)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.loc[:,'kfold'] = -1

In [5]:
df = df.sample(frac=1).reset_index(drop=True)
targets = df.drop(['sig_id'],axis=1).values

In [6]:
mskf = MultilabelStratifiedKFold(n_splits=5,shuffle=False, random_state=None)



In [7]:
df.head(2)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_ae71951bd,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1,id_e88e993ff,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1


In [8]:
for fold, (trn_, val_) in enumerate(mskf.split(X=df,y=targets)):
    df.loc[val_,"kfold"] = fold

In [9]:
df.head(2)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_ae71951bd,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
1,id_e88e993ff,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
df['kfold'].value_counts()

4    4763
3    4763
2    4763
0    4763
1    4762
Name: kfold, dtype: int64

In [11]:
df.to_csv('train_fold.csv',index = False)

In [12]:
def add_dummies(data, column):
    ohe = pd.get_dummies(data[column])
    ohe_columns = [f"{column}_{c}" for c in ohe.columns]
    ohe.columns = ohe_columns
    data = data.drop(column, axis=1)
    data = data.join(ohe)
    return data

In [13]:
def process_data(df):
    df = add_dummies(df, 'cp_time')
    df = add_dummies(df, 'cp_type')
    df = add_dummies(df, 'cp_dose')
    
    return df

## Modeling

In [14]:
import torch
import torch.nn as nn

In [15]:
class MoaDataset():
    def __init__(self, dataset, features):
        self.dataset = dataset
        self.features = features
    def __len__(self):
        return self.dataset.shape[0]
    
    def __getitem__(self,item):
        return {
            "x" : torch.tensor(self.dataset[item,:],dtype=torch.float),
            "y" : torch.tensor(self.features[item,:],dtype=torch.float)
        }

In [16]:
#from tqdm.notebook import tqdm
from tqdm import tqdm 

In [35]:
class Engine():
    def __init__(self, model, optimizer, device):
        self.model = model
        self.optimizer = optimizer
        self.device = device
    
    def loss_fn(self, targets, outputs):
        return nn.BCEWithLogitsLoss()(outputs,targets)

    def train(self, data_loader):
        self.model.train()
        final_loss = 0 
        for data in tqdm(data_loader):
            self.optimizer.zero_grad()
            inputs = data['x'].to(self.device)
            targets = data['y'].to(self.device)
            outputs = self.model(inputs)
            loss = self.loss_fn(targets, outputs)
            
            loss.backward()
            self.optimizer.step()
            
            final_loss += loss.item()
        print('Training loss : {}'.format(final_loss))
        return final_loss / len(data_loader)
    
    def validation(self, data_loader):
        self.model.eval()
        final_loss = 0 
        for data in tqdm(data_loader):
            inputs = data['x'].to(self.device)
            targets = data['y'].to(self.device)
            
            outputs = self.model(inputs)
            loss = self.loss_fn(targets, outputs)
            final_loss += loss.item()

        print('validation loss : {}'.format(final_loss))
        return final_loss / len(data_loader)
            

In [36]:
EPOCHS = 100
DEVICE = 'cpu'

In [37]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(num_features, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.3),
            nn.PReLU(),
            
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.3),
            nn.PReLU(),
            
            nn.Linear(1024, num_targets)
            )
    def forward(self, x):
        x = self.model(x)
        return x
        

In [38]:
def run_training(fold):
    df = pd.read_csv(Path + 'train_features.csv')
    df = process_data(df)
    folds = pd.read_csv(Path + 'train_fold.csv')
    
    targets = folds.drop(['sig_id','kfold'], axis =1 ).columns
    features = df.drop(['sig_id'],axis=1).columns
    
    df = df.merge(folds, on = 'sig_id', how='left')
    
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    
    X_train = train_df[features].values
    y_train = train_df[targets].values
    
    X_valid = valid_df[features].values
    y_valid = valid_df[targets].values
    
    
    train_dataset = MoaDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(
                    train_dataset, batch_size=64, num_workers = 8)
    
    valid_dataset = MoaDataset(X_valid, y_valid)
    valid_loader = torch.utils.data.DataLoader(
                    valid_dataset, batch_size=64, num_workers = 8)
    
    model = Model(X_train.shape[1],y_train.shape[1])
    model = model.to(DEVICE)
    
    optmizer = torch.optim.Adam(model.parameters(), lr = 3e-4)
    # scheduler = 
    eng = Engine(model, optmizer, DEVICE)
    
    for epoch in (range(EPOCHS)):
        print('epoch : {}'.format(epoch))
        train_loss = eng.train(train_loader)
        valid_loss = eng.validation(valid_loader)
    
    print('for fold : {} train loss = {}, validation loss : {} '.format(fold, train_loss, valid_loss))

In [39]:
run_training(1)

  0%|          | 0/298 [00:00<?, ?it/s]

epoch : 0


100%|██████████| 298/298 [00:29<00:00, 10.22it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 17.842223745770752


100%|██████████| 75/75 [00:00<00:00, 94.02it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.6869843900203705
epoch : 1


100%|██████████| 298/298 [00:28<00:00, 10.62it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 5.165126567706466


100%|██████████| 75/75 [00:00<00:00, 85.70it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.4321262668818235
epoch : 2


100%|██████████| 298/298 [00:32<00:00,  9.21it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 4.660187835805118


100%|██████████| 75/75 [00:00<00:00, 75.21it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.3134476821869612
epoch : 3


100%|██████████| 298/298 [00:37<00:00,  7.91it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 4.324050208553672


100%|██████████| 75/75 [00:01<00:00, 58.09it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.2829719725996256
epoch : 4


100%|██████████| 298/298 [00:32<00:00,  9.17it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 4.010077241808176


100%|██████████| 75/75 [00:00<00:00, 80.09it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.255888937972486
epoch : 5


100%|██████████| 298/298 [00:35<00:00,  8.46it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 3.7322330782189965


100%|██████████| 75/75 [00:00<00:00, 94.29it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.2352578295394778
epoch : 6


100%|██████████| 298/298 [00:29<00:00, 10.27it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 3.4235507054254413


100%|██████████| 75/75 [00:00<00:00, 78.98it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.2385753197595477
epoch : 7


100%|██████████| 298/298 [00:28<00:00, 10.51it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 3.1150327613577247


100%|██████████| 75/75 [00:01<00:00, 60.68it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.2172583993524313
epoch : 8


100%|██████████| 298/298 [00:32<00:00,  9.09it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 2.8199249980971217


100%|██████████| 75/75 [00:01<00:00, 47.26it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.2288362383842468
epoch : 9


100%|██████████| 298/298 [00:30<00:00,  9.73it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 2.515459395479411


100%|██████████| 75/75 [00:00<00:00, 75.52it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.2263402957469225
epoch : 10


100%|██████████| 298/298 [00:30<00:00,  9.71it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 2.199304688256234


100%|██████████| 75/75 [00:01<00:00, 62.29it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.2537692692130804
epoch : 11


100%|██████████| 298/298 [00:29<00:00, 10.17it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 1.832940757041797


100%|██████████| 75/75 [00:00<00:00, 78.02it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.2850761720910668
epoch : 12


100%|██████████| 298/298 [00:29<00:00, 10.04it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 1.5605097154621035


100%|██████████| 75/75 [00:01<00:00, 69.12it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.308481597341597
epoch : 13


100%|██████████| 298/298 [00:27<00:00, 10.72it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 1.3640134106390178


100%|██████████| 75/75 [00:00<00:00, 100.89it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.3643560772761703
epoch : 14


100%|██████████| 298/298 [00:30<00:00,  9.83it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 1.0953534272266552


100%|██████████| 75/75 [00:00<00:00, 77.42it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.38292783126235
epoch : 15


100%|██████████| 298/298 [00:28<00:00, 10.43it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.9566308018984273


100%|██████████| 75/75 [00:00<00:00, 84.41it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.4077203692868352
epoch : 16


100%|██████████| 298/298 [00:27<00:00, 10.71it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.8073460597079247


100%|██████████| 75/75 [00:00<00:00, 94.81it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.48606470040977
epoch : 17


100%|██████████| 298/298 [00:28<00:00, 10.34it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.659999102470465


100%|██████████| 75/75 [00:00<00:00, 79.27it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.5044068899005651
epoch : 18


100%|██████████| 298/298 [00:27<00:00, 10.95it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.6376985698007047


100%|██████████| 75/75 [00:00<00:00, 83.36it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.5123567944392562
epoch : 19


100%|██████████| 298/298 [00:39<00:00,  7.59it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.6420274075353518


100%|██████████| 75/75 [00:01<00:00, 39.51it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.5974984420463443
epoch : 20


100%|██████████| 298/298 [00:37<00:00,  8.02it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.5867961940239184


100%|██████████| 75/75 [00:01<00:00, 73.59it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.5887472154572606
epoch : 21


100%|██████████| 298/298 [00:34<00:00,  8.61it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.4854714989778586


100%|██████████| 75/75 [00:01<00:00, 61.01it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.6729095354676247
epoch : 22


100%|██████████| 298/298 [00:45<00:00,  6.52it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.49503465567249805


100%|██████████| 75/75 [00:01<00:00, 70.22it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.621821311302483
epoch : 23


100%|██████████| 298/298 [00:29<00:00,  9.99it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.4723901240504347


100%|██████████| 75/75 [00:00<00:00, 75.21it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.7270211838185787
epoch : 24


100%|██████████| 298/298 [00:28<00:00, 10.40it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.4560606610320974


100%|██████████| 75/75 [00:01<00:00, 61.07it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.733665277250111
epoch : 25


100%|██████████| 298/298 [00:32<00:00,  9.18it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.4285120440181345


100%|██████████| 75/75 [00:01<00:00, 62.82it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.8058872744441032
epoch : 26


100%|██████████| 298/298 [00:43<00:00,  6.91it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.3806961399677675


100%|██████████| 75/75 [00:01<00:00, 38.51it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.8251715181395411
epoch : 27


100%|██████████| 298/298 [00:36<00:00,  8.07it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.3814956918358803


100%|██████████| 75/75 [00:00<00:00, 78.51it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.7689780350774527
epoch : 28


100%|██████████| 298/298 [00:30<00:00,  9.92it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.39017018588492647


100%|██████████| 75/75 [00:00<00:00, 93.51it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.9031521519646049
epoch : 29


100%|██████████| 298/298 [00:33<00:00,  8.86it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.3689590831636451


100%|██████████| 75/75 [00:00<00:00, 87.27it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.886815964244306
epoch : 30


100%|██████████| 298/298 [00:32<00:00,  9.22it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.33516360659268685


100%|██████████| 75/75 [00:01<00:00, 64.43it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.851321030408144
epoch : 31


100%|██████████| 298/298 [00:30<00:00,  9.81it/s]
  0%|          | 0/75 [00:00<?, ?it/s]

Training loss : 0.3086437392339576


100%|██████████| 75/75 [00:01<00:00, 69.92it/s]
  0%|          | 0/298 [00:00<?, ?it/s]

validation loss : 1.9099840074777603
epoch : 32


  2%|▏         | 5/298 [00:01<00:58,  4.99it/s]


KeyboardInterrupt: 