In [0]:
%pip install transformers==3.5.1

In [0]:
%pip install datasets==1.4.1

In [0]:
%pip install seqeval==1.2.2

In [0]:
import time
import numpy as np
import random
import pickle
import torch
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from datasets import load_metric
from transformers import BertForTokenClassification
#One seed shared by every random number generator this notebook relies on
seed_num = 0
#Seed NumPy, Python's random module and torch (CPU, current GPU, all GPUs), in that order
for _seed_fn in (
    np.random.seed,
    random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed_num)

In [0]:
#From the example GitHub Notebook
def compute_metrics(p, id2tag, metric_obj=None):
    """Score token-level predictions and print precision/recall/F1 for ORG and GRT.

    Args:
        p: tuple ``(predictions, labels)``. ``predictions`` is indexed as
            [sample, token, class] (the argmax is taken over axis 2) and
            ``labels`` as [sample, token]. Positions whose label equals -100
            are dropped from both sequences before scoring.
        id2tag: mapping from class id to tag string.
        metric_obj: object exposing ``compute(predictions=..., references=...)``.
            When omitted, the notebook-level ``metric`` is used.

    Returns:
        The dictionary returned by ``metric_obj.compute``, so callers can log
        or compare the scores instead of only reading the printed output.
    """
    if metric_obj is None:
        #Fall back to the metric object created at notebook level
        metric_obj = metric
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens) from predictions and references alike
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([id2tag[pred_id] for pred_id, _ in kept])
        true_labels.append([id2tag[label_id] for _, label_id in kept])

    results = metric_obj.compute(predictions=true_predictions, references=true_labels)

    #Per-entity scores are read from the '_ORG' / '_GRT' entries of the result.
    #An entry can be absent (e.g. a small evaluation set); report that instead of
    #raising KeyError in the middle of a training run.
    for entity in ("ORG", "GRT"):
        scores = results.get("_" + entity)
        if scores is None:
            print("\t\t" + entity + ": not reported by the metric")
            continue
        print("\t\t" + entity + " Precision: ", scores['precision'])
        print("\t\t" + entity + " Recall: ", scores['recall'])
        print("\t\t" + entity + " F1: ", scores['f1'])
    return results

In [0]:
#Evaluate the model on the train_monitor set
def eval_on_valid(model, train_monitor_loader, id2tag, max_len=512, device=None):
    """Run the model over a monitoring loader and print its loss and metrics.

    Args:
        model: model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output has the loss at index 0 and the per-token scores at index 1.
        train_monitor_loader: iterable of batches; each batch is a dict with the
            keys 'input_ids', 'attention_mask', 'labels' and 'seq_len'.
        id2tag: mapping from class id to tag string, forwarded to ``compute_metrics``.
        max_len: length every batch is padded back to (with -100) before the
            batches are stacked. Defaults to 512.
        device: device the batch tensors are moved to. Defaults to the device of
            the model's first parameter.

    Returns:
        The sample-weighted average loss, or ``None`` when the loader yields no batch.

    The caller is expected to have put the model in eval mode; gradients are
    disabled here regardless.
    """
    if device is None:
        device = next(model.parameters()).device

    #Collect per-batch arrays and stack them once at the end; concatenating onto
    #a growing array inside the loop copies all earlier batches on every step.
    padded_preds = []
    padded_labels = []
    loss_sum = 0.0
    num_samples = 0

    with torch.no_grad():
        #Loop over minibatches
        for batch_val in train_monitor_loader:
            #Get the max length in this batch and crop based on that
            max_len_for_batch = int(batch_val['seq_len'].max())
            input_ids_val = batch_val['input_ids'][:, :max_len_for_batch].to(device)
            attention_mask_val = batch_val['attention_mask'][:, :max_len_for_batch].to(device)
            labels_val = batch_val['labels'][:, :max_len_for_batch].to(device)
            #Do a forward pass
            outputs_val = model(input_ids_val, attention_mask=attention_mask_val, labels=labels_val)
            #The returned loss is a mean over the minibatch, so weight it by the batch
            #size here and divide by the total number of samples after the loop.
            #A smaller final batch then no longer counts as much as a full one.
            this_batch_size = input_ids_val.shape[0]
            loss_sum += outputs_val[0].item() * this_batch_size
            num_samples += this_batch_size
            #Move predictions and labels to host memory
            these_preds = outputs_val[1].detach().cpu().numpy()
            these_labels = labels_val.cpu().numpy()
            #Pad back to max_len with -100 so batches of different cropped length stack
            new_preds = np.full(
                (this_batch_size, max_len, these_preds.shape[-1]), -100, dtype=these_preds.dtype
            )
            new_labels = np.full((this_batch_size, max_len), -100, dtype=these_labels.dtype)
            new_preds[:, :max_len_for_batch, :] = these_preds
            new_labels[:, :max_len_for_batch] = these_labels
            padded_preds.append(new_preds)
            padded_labels.append(new_labels)

    if num_samples == 0:
        #Nothing to average or score; avoid a division by zero
        print("\tValidation Loss: n/a (validation loader is empty)")
        return None

    avg_loss = loss_sum / num_samples
    print("\tValidation Loss: ", avg_loss)
    p = (np.concatenate(padded_preds, axis=0), np.concatenate(padded_labels, axis=0))
    print("\tValidation Results: ")
    compute_metrics(p, id2tag)
    return avg_loss

In [0]:
#Class for funding bodies dataset
class FB_Dataset(torch.utils.data.Dataset):
    """Map-style dataset over four parallel, index-aligned sequences.

    Indexing returns a dict with 'input_ids', 'attention_mask' and 'labels'
    converted to tensors, plus the stored 'seq_len' entry passed through as is.
    """

    def __init__(self, encodings, labels,at_mask,seq_lens):
        #Keep references only; conversion to tensors happens per item
        self.encodings, self.labels = encodings, labels
        self.at_mask, self.seq_lens = at_mask, seq_lens

    def __len__(self):
        #The label list defines how many samples there are
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings[idx]),
            'attention_mask': torch.tensor(self.at_mask[idx]),
            'labels': torch.tensor(self.labels[idx]),
            'seq_len': self.seq_lens[idx],
        }

In [0]:
#Load the checkpoint with a token-classification head that has 5 output labels
model = BertForTokenClassification.from_pretrained(
    '/dbfs/mnt/els-nlp-experts1/data/Gizem/bert-base-cased-tapt3/checkpoint-1000',
    num_labels=5,
)

In [0]:
#Label ids -> tag strings used when scoring predictions
id2tag = dict(enumerate(['I_GRT', 'O', 'B_GRT', 'B_ORG', 'I_ORG']))
#seqeval metric object used by compute_metrics
metric = load_metric("seqeval")
#The pickle file holds two objects written back to back: the training set first,
#then the train-monitor set, so the read order matters.
#NOTE(review): pickle.load executes arbitrary code from the file - only load trusted files.
with open("/dbfs/mnt/els-nlp-experts1/data/Gizem/bert_datasets.pkl",'rb') as f:
    train_dataset = pickle.load(f)
    train_monitor_dataset = pickle.load(f)

In [0]:
import gc

In [0]:
#Pick the device (fall back to CPU so the cell still runs without a GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Put model to device
model.to(device)

#Put model to training mode
model.train()

#Define training batch size
batch_size=8#increase this
num_epochs = 2
#Length every cropped batch is padded back to before predictions are stacked
max_seq_len = 512

#Get training sample generator
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_monitor_loader = DataLoader(train_monitor_dataset, batch_size=batch_size, shuffle=True)

#Initialize optimizer
optim = torch.optim.AdamW(model.parameters(), lr=2e-5)

#Determine how many steps each epoch will take
print("Steps per epoch: " ,len(train_loader))

scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = 50,
                                            num_training_steps = len(train_loader)*num_epochs)

minibatch_losses = []
#Every x step, print training loss
every_x_step=500


def _validate_current_model():
    """Score the model on the train-monitor loader in eval mode, then restore train mode."""
    model.eval()
    with torch.no_grad():
        #Evaluate the current model on the validation set
        eval_on_valid(model, train_monitor_loader,id2tag)
    model.train()


#Loop over epochs
for epoch in range(num_epochs):
    print("Epoch: ",epoch)
    #Accumulate training statistics. Per-batch arrays are collected in lists and
    #stacked once per epoch; concatenating onto a growing array at every step
    #re-copies all earlier predictions each time.
    train_loss = 0
    epoch_preds = []
    epoch_lbls = []
    #Loop over minibatches
    for i, batch in enumerate(train_loader):
        gc.collect()
        start = time.time()
        #reset gradients
        optim.zero_grad()
        #Get the max length in this batch and crop based on that
        max_len_for_batch = int(batch['seq_len'].max())
        #get inputs
        input_ids = batch['input_ids'][:, :max_len_for_batch].to(device)
        attention_mask = batch['attention_mask'][:, :max_len_for_batch].to(device)
        labels = batch['labels'][:, :max_len_for_batch].to(device)
        #When we call a classification model with the labels argument, the first returned element is the Cross Entropy loss between the predictions and the passed labels.
        #Calculate loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        #loss is reduced by mean (so it roughly corresponds to loss of one sample)
        #https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
        #https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForTokenClassification.forward
        loss = outputs[0]
        loss_value = loss.item()
        train_loss += loss_value
        #Second index is the predictions, store them
        these_preds = outputs[1].detach().cpu().numpy()
        these_labels = labels.cpu().numpy()
        #Pad the predictions again (with -100) so batches of different length stack
        new_preds = np.full(
            (these_preds.shape[0], max_seq_len, these_preds.shape[-1]), -100, dtype=these_preds.dtype
        )
        new_labels = np.full((these_labels.shape[0], max_seq_len), -100, dtype=these_labels.dtype)
        new_preds[:, :max_len_for_batch, :] = these_preds
        new_labels[:, :max_len_for_batch] = these_labels
        #Save the labels and predictions
        epoch_lbls.append(new_labels)
        epoch_preds.append(new_preds)
        #backpropagation
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        #update the parameters
        optim.step()
        scheduler.step()
        end = time.time()
        minibatch_losses.append(loss_value)
        #Drop the references to this step's tensors before releasing cached GPU memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()
        #Every x step, print validation scores
        if (i+1)%every_x_step==0:
            #Print training loss for this minibatch
            print("\tStep ",i+1,"/",len(train_loader))
            print("\t\tBatch padding: ",max_len_for_batch)
            print("\t\tMinibatch training loss: ",loss_value)
            print("\t\tTime for this minibatch: ",end-start)
            #Check val loss
            _validate_current_model()
            #Save Model
            #with open("/dbfs/mnt/els-nlp-experts1/data/Gizem/bert_epoch_"+str(epoch)+"_step_"+str(i)+'.pt','wb') as f:
            #    torch.save(model, f)

    #Stack the per-batch arrays once for the epoch-level report
    train_preds = np.concatenate(epoch_preds, axis=0)
    train_lbls = np.concatenate(epoch_lbls, axis=0)
    print("\tApproximate Training loss for this epoch: ",train_loss/len(train_loader))
    print("\tApproximate Training results: ")
    compute_metrics((train_preds, train_lbls),id2tag)
    #Check val loss
    _validate_current_model()
    #NOTE(review): torch.save(model, ...) pickles the whole module object, so loading it
    #later needs the same class definitions to be importable.
    with open("/dbfs/mnt/els-nlp-experts1/data/Gizem/bert_epoch_"+str(epoch)+'.pt','wb') as f:
        torch.save(model, f)


model.eval()