In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
best_sweep = [8,0.1,0.000005,0.001,8]
#dataset_name = "preprocessed_cadec"
dataset_name = "ADE_corpus_output3"

pretrained_bert = 'michiyasunaga/BioLinkBERT-base'
#pretrained_bert = 'dmis-lab/biobert-base-cased-v1.2'
#pretrained_bert = 'cimm-kzn/endr-bert'
#pretrained_bert = 'SpanBERT/spanbert-base-cased'
#pretrained_bert = 'emilyalsentzer/Bio_ClinicalBERT'
#pretrained_bert = 'dmis-lab/biobert-v1.1'
#pretrained_bert = 'bert-large-uncased'
#pretrained_bert = 'allenai/scibert_scivocab_uncased'
#pretrained_bert = 'allenai/biomed_roberta_base' 
#pretrained_bert = 'microsoft/BiomedNLP-BiomedBERT-large-uncased-abstract'

optim_batch_size = best_sweep[0]
optim_learning_rate = best_sweep[2]
optim_epochs = best_sweep[4]
optim_dropout = best_sweep[1]
optim_weight_decay = best_sweep[3]

In [3]:
import pandas as pd
import datetime
import numpy as np
import torch
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForTokenClassification,BertConfig,AutoConfig,AutoModelForTokenClassification, AdamW, get_linear_schedule_with_warmup
from keras_preprocessing.sequence import pad_sequences
from seqeval.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report,performance_measure

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [4]:
df = pd.read_csv(f"{dataset_name}.csv")

pretrained_save = pretrained_bert.replace('/','_')

if dataset_name == "preprocessed_cadec":
    model_name = "CADEC_NER_"+pretrained_bert.split('/')[1]
else:
    model_name = "ADE_NER_"+pretrained_bert.split('/')[1]

In [6]:
def ret_list(text):
    return eval(text)
df['sentences']=df['sentences'].apply(ret_list)
df['labels']=df['labels'].apply(ret_list)

In [7]:
train,test = train_test_split(df,test_size=0.15,random_state=42)

In [8]:
len(train),len(test)

(3823, 675)

In [9]:
sentences = train['sentences'].to_list()
labels = train['labels'].to_list()

In [10]:
tokenizer = BertTokenizer.from_pretrained(pretrained_bert,do_lower_case=True)

In [11]:
TokenLength = [len(tokenizer.encode(' '.join(i),add_special_tokens=True)) for i in sentences]

In [12]:
print("Minimum length: {:,} tokens".format(min(TokenLength)))
print("Maximum length: {:,} tokens".format(max(TokenLength)))
print("Average length: {:,} tokens".format(int(np.median(TokenLength))))

Minimum length: 4 tokens
Maximum length: 103 tokens
Average length: 24 tokens


In [13]:
unique_labels = set()
for label in labels:
    for l in label:
        unique_labels.add(l)

In [14]:
data_tags=list(unique_labels)
data_tags

['I-drug', 'I-effect', 'O', 'B-drug', 'B-effect']

In [16]:
data_tags

['I-drug', 'I-effect', 'O', 'B-drug', 'B-effect']

In [17]:
tag_values = data_tags
tag_values.append("PAD")
tag2idx = {t:i for i,t in enumerate(tag_values)}

In [18]:
tag2idx

{'I-drug': 0, 'I-effect': 1, 'O': 2, 'B-drug': 3, 'B-effect': 4, 'PAD': 5}

In [19]:
def tokenize_and_allign_labels(sentences,labels):
    tokenized_sentences=[]
    alligned_labels=[]
    for word,label in zip(sentences,labels):
        tokenized_word = tokenizer.tokenize(word)
        subwords = len(tokenized_word)

        tokenized_sentences.extend(tokenized_word)
        
        if subwords>1 and label!='O':
            c=label.replace('B','I')
            alligned_labels.extend([label]+ [c]*(subwords-1))
        else:
            alligned_labels.extend([label]*subwords)

    return tokenized_sentences,alligned_labels

In [20]:
tokenized_data = [tokenize_and_allign_labels(sent,label) for sent,label in zip(sentences,labels)]

In [21]:
tokenized_texts = [sentandlabel[0] for sentandlabel in tokenized_data]
new_labels = [sentandlabel[1] for sentandlabel in tokenized_data]

In [22]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(text) for text in tokenized_texts],maxlen=130,dtype="long",
                          value=0.0,truncating="post",padding="post")
label_tags = pad_sequences([[tag2idx.get(l) for l in label] for label in new_labels],maxlen=130,value=tag2idx["PAD"],padding="post",dtype="long",truncating="post")

In [23]:
attention_masks=[[float(i!=0.0) for i in ii] for ii in input_ids]

In [25]:
train_inputs=torch.tensor(input_ids,dtype=torch.long)
train_labels=torch.tensor(label_tags,dtype=torch.long)
train_mask=torch.tensor(attention_masks,dtype=torch.long)

In [26]:
def ret_dataloader():
    batch_size = optim_batch_size
    train_dataset = TensorDataset(train_inputs,train_mask,train_labels)
    train_dataloader = DataLoader(train_dataset,sampler=RandomSampler(train_dataset),batch_size=batch_size)

    return train_dataloader

In [27]:
def ret_model():

    configuration = AutoConfig.from_pretrained(pretrained_bert)
    configuration.hidden_dropout_prob = optim_dropout
    configuration.attention_probs_dropout_prob = optim_dropout
    configuration.num_labels = len(tag2idx)
    configuration.output_attentions = False
    configuration.output_hidden_states = False

    model = AutoModelForTokenClassification.from_pretrained(pretrained_bert,config=configuration)
    return model

In [28]:
def ret_optim(model):
    print(f"learning_rate = {optim_learning_rate}")
    optimizer = AdamW(model.parameters(),
                      lr = optim_learning_rate,
                      eps = 1e-8,weight_decay = optim_weight_decay)
    return optimizer

In [29]:
def ret_scheduler(optimizer, dataloader_train):
    epochs = optim_epochs
    total_steps = len(dataloader_train) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)
    return scheduler

In [30]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [31]:
import time
def train(epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model=ret_model()
    model.to(device)
    dataloader_train = ret_dataloader() #, dataloader_validation
    optimizer = ret_optim(model)
    scheduler = ret_scheduler(optimizer, dataloader_train)
    epochs = epochs
    loss_values, val_loss = [], []
    t0= time.time()
    for epoch_i in range(epochs):
        total_loss = 0
        model.train()
        for step, batch in enumerate(dataloader_train):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels)
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        avg_train_loss = total_loss / len(dataloader_train)
        print("Average training loss: {0:.2f}".format(avg_train_loss))
        print("")
        if epoch_i+1 == epochs:
            torch.save(model.state_dict(), f'ner_{dataset_name}_{pretrained_save}_{epoch_i+1}.model')
        training_time = format_time(time.time()-t0)
    model.load_state_dict(torch.load(f'ner_{dataset_name}_{pretrained_save}_{epochs}.model', map_location=torch.device('cuda')))

    return model,training_time

In [32]:
model,training_time=train(optim_epochs)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


learning_rate = 5e-06


  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


Average training loss: 0.21

Average training loss: 0.04

Average training loss: 0.03

Average training loss: 0.03

Average training loss: 0.02

Average training loss: 0.02

Average training loss: 0.02

Average training loss: 0.02



In [33]:
version =2.0
model.push_to_hub(f"ner_{dataset_name}_{pretrained_save}_{version}",token="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")
tokenizer.push_to_hub(f"ner_{dataset_name}_{pretrained_save}_{version}",token="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")

pytorch_model.bin: 100%|██████████| 431M/431M [00:38<00:00, 11.2MB/s] 


CommitInfo(commit_url='https://huggingface.co/collij22/ner_ADE_corpus_output3_michiyasunaga_BioLinkBERT-base_2.0/commit/aa0a91346fb200d4dd88d359e1fc51dfabecfb82', commit_message='Upload tokenizer', commit_description='', oid='aa0a91346fb200d4dd88d359e1fc51dfabecfb82', pr_url=None, pr_revision=None, pr_num=None)

In [34]:
tag2idx

{'I-drug': 0, 'I-effect': 1, 'O': 2, 'B-drug': 3, 'B-effect': 4, 'PAD': 5}

In [37]:
test_sentences = test['sentences'].to_list()
test_labels = test['labels'].to_list()

In [38]:
def ret_testdataloader(test_sentences,test_labels):
    tokenized_data = [tokenize_and_allign_labels(sent,label) for sent,label in zip(test_sentences,test_labels)]
    tokenized_texts = [sentandlabel[0] for sentandlabel in tokenized_data]
    new_labels = [sentandlabel[1] for sentandlabel in tokenized_data]
    test_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(text) for text in tokenized_texts],maxlen=130,dtype="long",
                          value=0.0,truncating="post",padding="post")
    test_label_tags = pad_sequences([[tag2idx.get(l) for l in label] for label in new_labels],maxlen=130,value=tag2idx["PAD"],padding="post",dtype="long",truncating="post")
    test_attention_masks=[[float(i!=0.0) for i in ii] for ii in test_input_ids]
    test_inputs = torch.tensor(test_input_ids,dtype=torch.long)
    test_labels_id=torch.tensor(test_label_tags,dtype=torch.long)
    test_mask=torch.tensor(test_attention_masks,dtype=torch.long)
    batch_size = optim_batch_size
    test_dataset = TensorDataset(test_inputs,test_mask,test_labels_id)
    test_dataloader = DataLoader(test_dataset,sampler=RandomSampler(test_dataset),batch_size=batch_size)
    return test_dataloader


In [39]:
def get_results():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    test_dataloader = ret_testdataloader(test_sentences,test_labels)
    model.eval()
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_loss = 0
    predictions , true_labels = [], []
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        #print(predictions)
        true_labels.extend(label_ids)
        eval_loss += outputs[0].mean().item()
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    print("Test loss: {0:.2f}".format(eval_loss))
    prediction_tags = []
    validation_tags = []
    for item1,item2  in zip(predictions, true_labels):
        p1,v1= [],[]
        for i in range(len(item2)):
            if tag_values[item2[i]] != "PAD":
                p1.append(tag_values[item1[i]])
                v1.append(tag_values[item2[i]])
        prediction_tags.append(p1)
        validation_tags.append(v1)
    acc_score = accuracy_score(prediction_tags, validation_tags)
    p_score = precision_score(prediction_tags, validation_tags)
    r_score = recall_score(prediction_tags, validation_tags)
    f1_s = f1_score(prediction_tags, validation_tags)
    report = classification_report(prediction_tags, validation_tags)
    #conf = confusion_matrix(prediction_tags, validation_tags)
    conf = performance_measure(validation_tags,prediction_tags)
    print("Test accuracy_score: {0:.2f}".format(acc_score))
    print("Test precision_score: {0:.2f}".format(p_score))
    print("Test recall_score: {0:.2f}".format(r_score))
    print("Test f1_score: {0:.2f}".format(f1_s))
    print("Classification Report")
    print((report))
    return eval_loss,acc_score,p_score,r_score,f1_s,report,conf

In [41]:
eval_loss,acc_score,p_score,r_score,f1_s,report,conf=get_results()

Test loss: 0.03
Test accuracy_score: 0.95
Test precision_score: 0.89
Test recall_score: 0.81
Test f1_score: 0.85
Classification Report
              precision    recall  f1-score   support

          AD       0.00      0.00      0.00         3
        drug       0.93      0.89      0.91       818
      effect       0.85      0.75      0.80       973

   micro avg       0.89      0.81      0.85      1794
   macro avg       0.59      0.55      0.57      1794
weighted avg       0.88      0.81      0.85      1794



  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
#first time
columns=["model_name","Training_time","precision","recall","f1_Score","True Positives","False Positives","True Negatives","False Negatives","Epochs","Learning Rate","Drop Out","Batch Size","Weight Decay"]
# df = pd.DataFrame(columns=columns)
# df.to_csv("results.csv",index=False)

In [44]:
def add_row():
  global eval_loss,acc_score,p_score,r_score,f1_s,report,conf
  df=pd.read_csv("results.csv")
  data=[model_name,training_time,p_score,r_score,f1_s,conf["TP"],conf["FP"],conf["TN"],conf["FN"],optim_epochs,optim_learning_rate,optim_dropout,optim_batch_size,optim_weight_decay]
  df.loc[len(df)] = data
  df.to_csv("results.csv",index=False)

In [45]:
add_row()