In [None]:
# Show which GPU (if any) is attached to this runtime before doing anything else.
!nvidia-smi

# set up

In [None]:
# Install the libraries the rest of the notebook imports.
# NOTE(review): pytorch-lightning is NOT pinned, while the code below relies on
# `training_epoch_end`, `Trainer(gpus=..., accelerator='dp', auto_lr_find=...)`.
# Presumably this targets an older 1.x release contemporary with transformers 4.5.0 —
# confirm the version originally used and pin it so the notebook stays reproducible.
!pip install transformers==4.5.0
!pip install pytorch-lightning

In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True is passed
# so the mount is redone even if one already exists.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# NOTE(review): the two shell lines below create a local `drive` directory and call
# google-drive-ocamlfuse, which this notebook never installs. They look like leftovers
# from an older FUSE-based mount — confirm whether they are still needed.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [1]:
# Make the project folder the working directory; every relative path used later
# (./dataset, ./saved_models, ./lightning_logs, ./test_result, ./label) resolves against it.
%cd '/content/drive/MyDrive/Hoc'

/content/drive/MyDrive/Hoc


# read data

In [2]:
import numpy as np
import pandas as pd
import csv
from transformers import AutoTokenizer, AutoModel , AutoConfig
import torch
from torch.utils.data import Dataset  
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torch.nn as nn


In [3]:
class HocDataset(Dataset):
    """Map-style dataset over a tab-separated file of (labels, text) rows.

    Column 0 of every row holds comma-separated label names such as ``"0_0,3_1"``
    and column 1 holds the text. Texts are tokenized lazily in ``__getitem__``;
    labels are turned into a multi-hot float vector.

    Args:
        file_path: path of the TSV file; its first row is treated as a header.
        pretrained_tokenizer_name_or_path: name or path handed to
            ``AutoTokenizer.from_pretrained``.
        has_label: when False, column 0 is ignored and no label is produced.
    """

    def __init__(self, file_path, pretrained_tokenizer_name_or_path = "bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12" , has_label =True):
        # whether the file carries labels
        self.has_label = has_label

        # label name ("<aspect>_<value>") -> column index of the multi-hot vector
        self.label_map = self.get_label_map()

        # texts and (possibly empty) per-row lists of label indices
        self.data, self.label = self.read_data(file_path)

        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name_or_path)

    def read_data(self, file_path):
        """Return ``(texts, labels)`` read from ``file_path``.

        ``texts`` is the list of column-1 strings. ``labels`` is a parallel list of
        label-index lists when ``self.has_label`` is true, otherwise an empty list.

        Raises:
            KeyError: if a label name in column 0 is not in ``self.label_map``.
        """
        X, y = [], []
        for line in self.read_tsv(file_path):
            X.append(line[1])
            if self.has_label:
                y.append([self.label_map[_y] for _y in line[0].split(',')])
        return X, y

    def read_tsv(self, file_path, quotechar=None):
        """Read a tab-separated file and return its data rows (header excluded).

        Completely blank lines are skipped: ``csv.reader`` yields ``[]`` for them,
        which would otherwise crash ``read_data`` when it indexes the row.
        """
        # newline="" is what the csv module asks for so it can handle line endings itself.
        with open(file_path, "r", newline="") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            next(reader, None)  # skip the header row
            return [line for line in reader if line]

    @classmethod
    def get_label_map(cls, num_aspects = 10, aspect_value_list = (0, 1)):
        """Build the mapping from label name to column index.

        Names are ``"<aspect>_<value>"``; indices are assigned in aspect-major
        order, e.g. with the defaults ``"0_0" -> 0, "0_1" -> 1, "1_0" -> 2, ...``.
        """
        label_map = {}
        for i in range(num_aspects):
            for value in aspect_value_list:
                # the next free index is simply the current size of the map
                label_map[str(i) + "_" + str(value)] = len(label_map)
        return label_map

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for row ``idx``.

        ``encoding`` is the tokenizer output padded/truncated to 128 tokens with the
        leading batch axis removed. ``label`` is a float32 multi-hot tensor with one
        entry per label in ``self.label_map``, or ``None`` when the dataset is unlabeled.

        NOTE(review): a ``None`` label cannot be batched by the default DataLoader
        collate function; an unlabeled dataset presumably needs a custom ``collate_fn``.
        """
        token_tensor = self.tokenizer.encode_plus(
            self.data[idx],
            max_length=128,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # return_tensors='pt' adds a batch axis of size 1; drop only that axis so the
        # DataLoader can re-batch the samples.
        for k in token_tensor:
            token_tensor[k] = token_tensor[k].squeeze(0)

        if not self.has_label:
            return token_tensor, None

        # multi-hot encode the label indices of this row
        label = [0] * len(self.label_map)
        for l in self.label[idx]:
            label[l] = 1

        return token_tensor, torch.tensor(label, dtype=torch.float32)

    def __len__(self):
        """Number of data rows read from the file."""
        return len(self.data)

# model

In [4]:

class HocClassifier(pl.LightningModule):
    """Multi-label classifier: pretrained transformer encoder plus a linear head.

    The head is ``Dropout(0.1) -> Linear(hidden_size, num_classes)`` and emits raw
    logits; training and validation use ``binary_cross_entropy_with_logits``.

    Args:
        pretrained_model_name_or_path: passed to ``AutoModel.from_pretrained``.
        pretrained_config_name_or_path: passed to ``AutoConfig.from_pretrained``.
        num_classes: number of output logits.
        learning_rate: learning rate used by the AdamW optimizer.
    """

    def __init__(self, pretrained_model_name_or_path = "bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12",
                 pretrained_config_name_or_path =  "bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12",
                 num_classes = 20 , learning_rate = 1e-5):
        super().__init__()

        # load the pretrained encoder
        pretrained_config = AutoConfig.from_pretrained(pretrained_config_name_or_path)
        self.pretrained_model = AutoModel.from_pretrained(pretrained_model_name_or_path, config=pretrained_config)

        # classification head
        self.decoder = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(pretrained_config.hidden_size, num_classes),
        )

        self.learning_rate = learning_rate

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        """Return one logit per class for every sequence in the batch."""
        # Element 1 of the encoder output is fed to the head (the pooled output for
        # BERT-style models — TODO confirm for other encoder families).
        cls = self.pretrained_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[1]
        pred = self.decoder(cls)
        return pred

    def configure_optimizers(self):
        """Optimize every parameter with AdamW at ``self.learning_rate``."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

    # train
    def training_step(self, batch, batch_idx):
        """Compute the BCE-with-logits loss of one training batch."""
        data, label = batch
        pred = self(**data)
        batch_loss = nn.functional.binary_cross_entropy_with_logits(pred, label)

        return {'loss': batch_loss}

    def training_step_end(self, output):
        """Average the per-device losses into one scalar and log it as ``train_loss``."""
        loss = output['loss'].mean()
        self.log('train_loss', loss)
        return {'loss': loss}

    def training_epoch_end(self, output):
        """Log the mean training loss of the epoch as ``avg_train_loss``."""
        avg_loss = torch.stack([x['loss'] for x in output]).mean()
        self.log('avg_train_loss', avg_loss)

    # val
    def validation_step(self, batch, batch_idx):
        """Compute the BCE-with-logits loss of one validation batch."""
        data, label = batch
        pred = self(**data)
        batch_loss = nn.functional.binary_cross_entropy_with_logits(pred, label)

        # A stray bare name used to sit here and raised NameError on the first
        # validation batch; it has been removed.
        return {'loss': batch_loss}

    def validation_epoch_end(self, output):
        """Log the mean validation loss of the epoch as ``avg_val_loss``."""
        avg_loss = torch.stack([x['loss'] for x in output]).mean()
        self.log('avg_val_loss', avg_loss)

    def make_pred(self, data):
        """Return per-class probabilities for a batch as a NumPy array.

        Args:
            data: mapping of tokenizer outputs (tensors), as yielded by the DataLoader.

        Returns:
            ``sigmoid(logits)`` moved to the CPU and converted to ``numpy.ndarray``.

        The caller's mapping is left untouched, and no autograd graph is built.
        Dropout is only disabled if the caller has put the module in eval mode.
        """
        inputs = {key: value.to(self.device) for key, value in data.items()}
        with torch.no_grad():
            pred = torch.sigmoid(self(**inputs))
        return pred.detach().cpu().numpy()

# args

In [5]:

import argparse


def _str_to_bool(value):
    """Parse a command-line boolean such as ``True``/``false``/``1``/``no``.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy. The empty string still maps to
    False, as it did with ``bool``.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# The first positional parameter of ArgumentParser is `prog`, so no list is passed here.
parser = argparse.ArgumentParser()

# train param
parser.add_argument('--batch-size', default=4, type=int)
parser.add_argument('--epoch', default=10, type=int)
parser.add_argument('--learning-rate', default=1e-5, type=float)

# quick check for pl lightning
parser.add_argument('--fast-run', default=False, type=_str_to_bool)

# train val test
parser.add_argument('--data-root', default="./dataset", type=str)
parser.add_argument('--do-train', default=True, type=_str_to_bool)
parser.add_argument('--do-eval', default=True, type=_str_to_bool)
parser.add_argument('--do-predict', default=False, type=_str_to_bool)

# pretrained model
parser.add_argument('--pretrained-model', default="bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12", type=str)
parser.add_argument('--pretrained-tokenizer', default="bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12", type=str)
parser.add_argument('--pretrained-config', default="bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12", type=str)
parser.add_argument('--numclasses', default=20, type=int)

# Inside a notebook there is no real command line: parse an empty argv so that
# every option takes its default. Pass a list of strings here to override them.
args = parser.parse_args([])


# main

In [6]:
def get_model( pretrained_model_name_or_path , pretrained_config_name_or_path  , num_classes , learning_rate = 1e-5 , checkpoint_path = None):
  """Build a HocClassifier, optionally restoring its weights from a checkpoint.

  Args:
    pretrained_model_name_or_path: encoder weights name or path.
    pretrained_config_name_or_path: encoder config name or path.
    num_classes: number of output logits of the classification head.
    learning_rate: optimizer learning rate stored on the model.
    checkpoint_path: Lightning checkpoint to restore; ``None`` builds a fresh model.

  Returns:
    A ``HocClassifier`` instance.
  """
  if checkpoint_path is None:
    return HocClassifier( pretrained_model_name_or_path ,  pretrained_config_name_or_path , num_classes , learning_rate)

  # load_from_checkpoint is called on the class itself, so no throwaway default
  # model (and its pretrained weights) is built first. The constructor arguments
  # are forwarded as keyword arguments, as before.
  return HocClassifier.load_from_checkpoint(
      checkpoint_path ,
      pretrained_model_name_or_path = pretrained_model_name_or_path ,
      pretrained_config_name_or_path = pretrained_config_name_or_path ,
      num_classes = num_classes ,
      learning_rate = learning_rate )

In [7]:
import os
def get_train_val_loader( data_root ,   batch_size , pretrained_tokenizer , only_test = False):
  """Create the DataLoaders for the splits stored under ``data_root``.

  Args:
    data_root: directory containing ``train.tsv``, ``dev.tsv`` and ``test.tsv``.
    batch_size: batch size used by every loader.
    pretrained_tokenizer: tokenizer name or path forwarded to ``HocDataset``.
    only_test: when True, build and return only the test loader.

  Returns:
    The test ``DataLoader`` if ``only_test`` is true, otherwise the tuple
    ``(train_loader, val_loader)``.
  """

  def _make_loader(split_file, shuffle=False):
    # one dataset + loader per split file; only the shuffle flag differs between splits
    dataset = HocDataset( os.path.join( data_root , split_file ) , pretrained_tokenizer )
    return DataLoader( dataset , batch_size=batch_size , shuffle=shuffle , num_workers=2 )

  if only_test:
    return _make_loader('test.tsv')

  # Training batches are reshuffled every epoch; evaluation splits keep file order
  # so that predictions stay aligned with the rows of the source file.
  train_loader = _make_loader('train.tsv', shuffle=True)
  val_loader = _make_loader('dev.tsv')

  return train_loader , val_loader

# train

In [None]:
# Training cell: runs only when args.do_train is set. It asks for a run name, builds
# the loaders, checkpoint callback, logger and Trainer, then fits the model.
if args.do_train:
  print("train params ========")
  print(args)
  print("=====================")

  # The run name decides the checkpoint and TensorBoard directories.
  model_name = input('model name = ')

  train_loader , val_loader = get_train_val_loader(
      args.data_root ,
      args.batch_size ,
      args.pretrained_tokenizer )

  # Checkpoints track the validation loss when evaluating, else the training loss.
  _monitor = "avg_val_loss" if args.do_eval else "avg_train_loss"

  # 'min' mode: a lower value of the monitored metric counts as better.
  checkpoint_callback = pl.callbacks.ModelCheckpoint(
      dirpath = f'./saved_models/{model_name}',
      filename = model_name,
      monitor = _monitor,
      mode = 'min')

  # log to tensorboard
  tb_logger = pl.loggers.TensorBoardLogger(f'./lightning_logs/{model_name}/')

  trainer = pl.Trainer(
      auto_lr_find = False ,
      fast_dev_run = args.fast_run ,
      gpus = [0] ,
      max_epochs = args.epoch ,
      accelerator = 'dp' ,
      callbacks = [checkpoint_callback] ,
      logger = tb_logger )

  model = get_model(
      args.pretrained_model ,
      args.pretrained_config ,
      args.numclasses ,
      args.learning_rate )

  # Passing None as the validation loader is the same as omitting it (train only).
  trainer.fit( model , train_loader , val_loader if args.do_eval else None )

  print(f"{model_name} done")

# test

In [8]:
# Evaluation setup: build the test loader, then restore the trained classifier from
# its checkpoint and move it to the first CUDA device.
model_path = "/content/drive/MyDrive/Hoc/saved_models/train-epo10/train-epo10.ckpt"
device = torch.device("cuda:0")

test_loader = get_train_val_loader(
    data_root = args.data_root ,
    batch_size = 8 ,
    pretrained_tokenizer = args.pretrained_tokenizer ,
    only_test = True )

model = get_model(
    pretrained_model_name_or_path = args.pretrained_model ,
    pretrained_config_name_or_path = args.pretrained_config ,
    num_classes = args.numclasses ,
    learning_rate = args.learning_rate ,
    checkpoint_path = model_path )
model = model.to(device)

In [9]:
def pred_reformat(pred, threshold=0.5):
  """Turn per-class probabilities into comma-separated label-name strings.

  Each row of ``pred`` holds consecutive pairs of scores, one pair per entry of
  ``LABELS``. A label is emitted when the second score of its pair is above
  ``threshold`` and the first score is below it.

  Args:
    pred: sequence of rows (lists or 1-D arrays) of scores, two per label.
    threshold: decision threshold applied to both scores of a pair.

  Returns:
    One string per row: the selected label names joined by ``','`` in ``LABELS``
    order (empty string when nothing is selected).

  The input is not modified. Because each label can be selected at most once per
  row, no de-duplication is needed, so the output order is deterministic.
  """
  LABELS = ['activating invasion and metastasis', 'avoiding immune destruction',
          'cellular energetics', 'enabling replicative immortality', 'evading growth suppressors',
          'genomic instability and mutation', 'inducing angiogenesis', 'resisting cell death',
          'sustaining proliferative signaling', 'tumor promoting inflammation']

  result = []
  for scores in pred:
    selected = []
    for j in range(0, len(scores), 2):
      # scores[j] is the first score of the pair, scores[j + 1] the second
      if scores[j + 1] > threshold and scores[j] < threshold:
        selected.append(LABELS[j // 2])
    result.append(','.join(selected))
  return result

In [10]:
from eval_hoc import eval_hoc

def test_result( model , test_loader , label_file  ):
  """Predict the test set, write the predictions to a TSV file and score them.

  Args:
    model: classifier exposing ``eval()`` and ``make_pred(batch)``.
    test_loader: DataLoader yielding ``(inputs, label)`` batches in the same row
      order as ``label_file``.
    label_file: tab-separated file with the reference labels. It is used both as
      the template for the prediction file and as the reference given to ``eval_hoc``.

  Side effects:
    Writes ``./test_result/temp_test.tsv`` (creating the directory if needed) and
    calls ``eval_hoc`` on the reference and prediction files.
  """
  model.eval()

  # collect one probability row per test example
  preds = []
  for data, _label in test_loader:
    preds.extend(model.make_pred(data))

  output_file = './test_result/temp_test.tsv'
  os.makedirs(os.path.dirname(output_file), exist_ok=True)

  # copy the reference table and overwrite its 'labels' column with the predictions
  ori_test = pd.read_csv( label_file  , sep='\t')
  ori_test['labels'] = pred_reformat(preds)
  ori_test.to_csv(output_file, sep='\t')

  # score against the file the caller actually passed in, not a hard-coded path
  eval_hoc(label_file, output_file)

In [11]:
# Score the restored model on the test split; the metrics are printed by eval_hoc.
test_result( model , test_loader , "/content/drive/MyDrive/Hoc/label/test.tsv" )

Precision: 83.9
Recall   : 84.3
F1       : 84.1
