In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification,BertModel, AdamW, get_cosine_schedule_with_warmup
import ast
from seqeval.metrics import classification_report
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from google.colab import drive
import datasets
from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns
from torch.utils.tensorboard import SummaryWriter

In [None]:
# Optional Colab setup, left disabled:
# %load_ext tensorboard
# drive.mount('/content/drive')

# Read the annotated corpus from the mounted Drive folder.
data=pd.read_csv("/content/drive/MyDrive/google_upload/revised_tags_2.csv")
# data=data[["tokens","ner_tokens"]].drop_duplicates()

# Both columns are parsed with ast.literal_eval so each cell becomes a Python object
# (indexed per word later on by the dataset class).
for list_column in ("tokens","ner_tokens"):
  data[list_column]=data[list_column].apply(ast.literal_eval)

# Tag inventory: the position in this list is the integer label id.
id2label=dict(enumerate([
    'O',
    'B-method',
    'I-method',
    'B-material',
    'I-material',
    'B-product',
    'I-product',
    'B-Faradaicefficiency',
    'I-Faradaicefficiency',
]))

# Reverse lookup: tag string -> integer id.
label2id={tag:tag_id for tag_id,tag in id2label.items()}


# Dataset Loader

max_len=110  # every encoded sequence is padded/truncated to this many sub-tokens
model_name="allenai/scibert_scivocab_uncased"
tokenizer=BertTokenizerFast.from_pretrained(model_name)

class dataset(Dataset):
  """Torch dataset that tokenizes pre-split sentences and aligns word-level NER labels.

  Args:
    df: DataFrame with a "tokens" column (list of words per row) and a
      "ner_tokens" column (list of integer label ids, one per word).
    tokenizer: a fast tokenizer; it is called with ``is_split_into_words=True``
      and its result must expose ``word_ids()``.
    max_len: length every encoded sequence is padded/truncated to.
  """
  def __init__(self,df,tokenizer,max_len):
    self.df=df
    self.len=len(df)
    self.max_len=max_len
    self.tokens=df["tokens"]
    self.ner_tokens=df["ner_tokens"]
    self.tokenizer=tokenizer

  def __getitem__(self,index):
    """Return the encoded row ``index`` as a dict of tensors, including "labels".

    Only the first sub-token of each word carries that word's label; special
    tokens, padding and continuation sub-tokens are set to -100.
    """
    tokens=self.tokens.iloc[index]
    ner_tokens=self.ner_tokens.iloc[index]
    tokenized_data=self.tokenizer(tokens,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding="max_length",
                                  truncation=True,
                                  max_length=self.max_len)
    # Align through the tokenizer's own word indices rather than by counting
    # offsets that start at 0: a word that produces no sub-tokens would
    # otherwise shift every following label by one position.
    word_ids=tokenized_data.word_ids()
    encoded_labels=np.full(len(word_ids),-100,dtype=int)
    previous_word_id=None
    for idx,word_id in enumerate(word_ids):
      if word_id is not None and word_id!=previous_word_id:
        encoded_labels[idx]=ner_tokens[word_id]
      previous_word_id=word_id
    items={k:torch.as_tensor(v) for k,v in tokenized_data.items()}
    items["labels"]=torch.as_tensor(encoded_labels)
    return items

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return self.len

from sklearn.model_selection import train_test_split

# train_size of the rows go to training; the held-out remainder is then cut
# in half into a test part and a validation part. Both splits use seed 42.
train_size = 0.8
X, y = data[["tokens"]], data[["ner_tokens"]]
X_train, X_vt, y_train, y_vt = train_test_split(
    X, y, train_size=train_size, random_state=42
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_vt, y_vt, train_size=0.5, random_state=42
)
def rejoint(x,y):
  """Return ``x`` and ``y`` concatenated side by side (column-wise) as one DataFrame."""
  return pd.concat((x,y),axis="columns")
from sklearn.utils.class_weight import compute_class_weight

# Put features and labels of every split back into a single frame ...
train_dataset, test_dataset, val_dataset = (
    rejoint(features, targets)
    for features, targets in ((X_train, y_train), (X_test, y_test), (X_valid, y_valid))
)
# ... and wrap each frame in the tokenizing torch dataset.
train_data, val_data, test_data = (
    dataset(split_frame, tokenizer, max_len)
    for split_frame in (train_dataset, val_dataset, test_dataset)
)


# Balanced class weights from the label frequencies.
# NOTE(review): the frequencies are counted over train + validation + test together.
# NOTE(review): one weight is produced per label id that actually occurs; the loss
# presumably needs one weight per id in id2label - confirm every id is present.
full_data=pd.concat([train_dataset,val_dataset,test_dataset])

all_tokens=[tag_id for sentence_tags in full_data["ner_tokens"] for tag_id in sentence_tags]
unique_labels=np.unique(all_tokens)
class_weights=torch.tensor(
    compute_class_weight(class_weight="balanced",classes=unique_labels,y=all_tokens),
    dtype=torch.float,
)


In [None]:

# BERT Base
## Parameters
TRAIN_BATCH_SIZE=24
TEST_BATCH_SIZE=12
VALID_BATCH_SIZE=12

## Training Control
EPOCHS=4
LEARNING_RATE=0.8e-04
MAX_GRAD_NORM=5   # gradient-clipping threshold
WARMUP=0.1        # fraction of the training steps used for learning-rate warm-up

## DataLoader settings
# Only the training loader shuffles: evaluation does not benefit from a random
# order, and a fixed order keeps validation/test runs repeatable.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
val_params =  {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
training_loader=DataLoader(train_data,**train_params)
test_loader=DataLoader(test_data,**test_params)
val_loader=DataLoader(val_data,**val_params)


In [None]:
class BNER(nn.Module):
  """BERT encoder followed by dropout and a linear per-token classification head.

  Args:
    model_name: name or path passed to ``BertModel.from_pretrained``.
    num_labels: number of output classes of the linear head.
    device: device the encoder and the head are placed on.
    class_weights: optional per-class weight tensor for the cross-entropy loss.
  """
  def __init__(self,model_name,num_labels,device,class_weights=None):
    super().__init__()
    self.device=device
    self.bert=BertModel.from_pretrained(model_name,config=BertConfig.from_pretrained(model_name,num_labels=num_labels)).to(device)
    self.num_labels=num_labels
    self.dropout=nn.Dropout(0.3)
    # Size the head from the encoder config instead of hard-coding 768.
    hidden_size=self.bert.config.hidden_size
    self.classifier=nn.Linear(hidden_size,num_labels,device=device)
    # `is None` instead of `== None`: the argument may be a tensor.
    if class_weights is None:
      self.loss_func=nn.CrossEntropyLoss(ignore_index=-100)
    else:
      self.loss_func=nn.CrossEntropyLoss(ignore_index=-100,weight=class_weights)

  def forward(self,input_ids,attention_mask=None,labels=None):
    """Run the encoder and the classification head.

    Returns:
      dict with "logits" of shape (batch_size * seq_len, num_labels) and, when
      ``labels`` is given, "loss" (cross-entropy ignoring positions labelled -100).
      The dict is returned whether or not labels are supplied.
    """
    hidden_state=self.bert(input_ids,attention_mask=attention_mask)[0]
    sequence_outputs=self.dropout(hidden_state)
    # Flatten batch and sequence dimensions before the linear head.
    logits=self.classifier(sequence_outputs.reshape(-1,sequence_outputs.size(-1)))
    output={"logits":logits}
    if labels is not None:
      output["loss"]=self.loss_func(logits,labels.reshape(-1))
    return output
## Training Requirements
# Step totals are epochs x batches for each loader.
TRAIN_STEPS, VAL_STEPS, TEST_STEPS = (
    EPOCHS*len(loader) for loader in (training_loader, val_loader, test_loader)
)
num_warmup_steps=int(WARMUP*TRAIN_STEPS)

device='cuda' if torch.cuda.is_available() else 'cpu'
class_weights=class_weights.to(device)
num_labels=len(label2id)
model=BNER(
    model_name=model_name,
    num_labels=num_labels,
    device=device,
    class_weights=class_weights,
).to(device)
optimizer=torch.optim.AdamW(model.parameters(),lr=LEARNING_RATE)
scheduler=get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=TRAIN_STEPS,
)

# Validating model function before training
# Push one training example (as a batch of one) through the model and show its loss.

inputs = train_data[3]
input_ids, attention_mask, labels = (
    inputs[field].unsqueeze(0).to(device)
    for field in ("input_ids", "attention_mask", "labels")
)
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
outputs["loss"]



In [None]:

### Training and Evaluation Functions

from datasets import load_metric
from sklearn.metrics import accuracy_score
# Scorer used by get_metrics() via metric.compute(predictions=..., references=...).
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm the installed version still has it.
metric = load_metric("seqeval")
def eval(model,eval_loader,return_output=True):
    """Evaluate ``model`` on ``eval_loader`` and print the mean loss and token accuracy.

    Args:
        model: module whose call returns a dict with "loss" and "logits" and that
            exposes ``num_labels``.
        eval_loader: iterable of batches with 'input_ids', 'attention_mask', 'labels'.
        return_output: when true, return the gold and predicted tag sequences.

    Returns:
        ``(labels, predictions)`` when ``return_output`` is true, otherwise ``None``.
        Both are lists with one list of tag strings per sentence, restricted to the
        positions whose label is not -100. Keeping sentences separate (instead of
        one flat list per batch) stops sequence-level scorers from joining an
        entity at the end of one sentence with the start of the next.

    Notes:
        Accuracy is the fraction of correct predictions over all active positions
        (not a mean of per-batch accuracies, which over-weights small batches).
        An empty loader reports 0 for both numbers instead of dividing by zero.
    """
    model.eval()
    eval_loss = 0.0
    nb_eval_steps = 0
    correct_tokens, active_tokens = 0, 0
    eval_preds, eval_labels = [], []
    with torch.no_grad():
        for batch in eval_loader:
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels) #Runs our model and outputs loss and logits
            eval_loss += outputs["loss"].item()
            nb_eval_steps += 1
            # logits arrive flattened; restore (batch_size, seq_len) after the argmax
            flat_logits = outputs["logits"].view(-1, model.num_labels)
            batch_predictions = torch.argmax(flat_logits, dim=1).view(labels.shape)
            # only positions with a real label take part in the scoring
            active_mask = labels != -100
            correct_tokens += ((batch_predictions == labels) & active_mask).sum().item()
            active_tokens += active_mask.sum().item()
            for sentence_labels, sentence_preds, sentence_mask in zip(labels, batch_predictions, active_mask):
                if sentence_mask.any():
                    eval_labels.append(sentence_labels[sentence_mask].tolist())
                    eval_preds.append(sentence_preds[sentence_mask].tolist())
    labels = [[id2label[label_id] for label_id in sentence] for sentence in eval_labels]
    predictions = [[id2label[label_id] for label_id in sentence] for sentence in eval_preds]
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    eval_accuracy = correct_tokens / max(active_tokens, 1)
    print(f"Validation Loss: {eval_loss} Validation Accuracy: {eval_accuracy}")
    if return_output:
      return labels,predictions
def get_metrics(lab,pred,return_df=False,hide_display=False):
  """Score predictions against references with the module-level ``metric``.

  Args:
    lab: reference tag sequences.
    pred: predicted tag sequences.
    return_df: when true, return the two result tables.
    hide_display: when true, skip rendering the tables.

  Returns:
    ``{"overall_performance": DataFrame, "entity_performance": DataFrame}`` when
    ``return_df`` is true, otherwise ``None``.
  """
  results=metric.compute(predictions=pred,references=lab)

  # The four aggregate scores form a single "Total" row ...
  overall_keys=("overall_precision","overall_recall","overall_f1","overall_accuracy")
  flattened_results={key:results[key] for key in overall_keys}
  # ... and every other entry of the result goes into the entity-level table.
  clasif={key:value for key,value in results.items() if key not in flattened_results}

  entity_scores=pd.DataFrame(clasif)
  overall=pd.DataFrame(flattened_results,index=["Total"])

  if not hide_display:
    display(entity_scores,overall)
  if not return_df:
    return None
  return {"overall_performance":overall,"entity_performance":entity_scores}




def train(epoch,log_steps=50,writer=None):
  """Run one training epoch over the module-level ``training_loader``.

  Uses the module-level ``model``, ``optimizer``, ``scheduler``, ``device`` and
  ``MAX_GRAD_NORM``.

  Args:
    epoch: zero-based epoch index (used for messages and TensorBoard step numbers).
    log_steps: print the running mean loss every ``log_steps`` batches.
    writer: optional TensorBoard writer; receives 'Gradient Norm' per batch and
      'Epoch Loss' once per epoch.
  """
  prog_bar=tqdm(total=len(training_loader),desc=f"Epoch: {epoch+1} training_steps",unit="steps")
  tr_loss=0
  nb_tr_steps=0
  model.train()

  try:
    for idx, batch in enumerate(training_loader):
      ids=batch["input_ids"].to(device,dtype=torch.long)
      attention=batch["attention_mask"].to(device,dtype=torch.long)
      labels=batch["labels"].to(device,dtype=torch.long)
      loss=model(input_ids=ids,attention_mask=attention,labels=labels)["loss"]
      tr_loss+=loss.item()
      nb_tr_steps +=1
      if idx % log_steps == 0:
        loss_step=tr_loss/nb_tr_steps
        print(f"Training Loss at {nb_tr_steps} training steps:{loss_step}")

      optimizer.zero_grad()
      loss.backward()

      # clip_grad_norm_ returns the total gradient norm measured before clipping,
      # so the value to log comes for free instead of a second pass over the parameters.
      total_norm=float(torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),max_norm=MAX_GRAD_NORM))
      if writer is not None:
        writer.add_scalar('Gradient Norm', total_norm, epoch * len(training_loader) + idx)

      optimizer.step()
      scheduler.step()
      prog_bar.update(1)
  finally:
    # close the bar even when a batch fails, so no stale widget is left behind
    prog_bar.close()

  # an empty loader reports 0 instead of raising ZeroDivisionError
  epoch_loss=tr_loss/max(nb_tr_steps,1)
  print(f"Training loss at {epoch+1} : {epoch_loss}")
  if writer is not None:
    writer.add_scalar('Epoch Loss', epoch_loss, epoch)

def plot_confusion_matrix(labels,predictions):
  """Plot a row-normalized confusion matrix over all tags in ``label2id``.

  Args:
    labels: list of gold tag-string sequences.
    predictions: list of predicted tag-string sequences, aligned with ``labels``.
  """
  true_ids=[label2id[tag] for sentence in labels for tag in sentence]
  predicted_ids=[label2id[tag] for sentence in predictions for tag in sentence]

  class_names=list(label2id.keys())
  class_ids=list(label2id.values())

  # Pin the class list: without it a tag missing from both inputs shrinks the
  # matrix and the tick labels below no longer match its rows/columns.
  cm=confusion_matrix(true_ids,predicted_ids,labels=class_ids)

  # Rows with no gold examples stay at 0 instead of turning into NaN (0/0).
  row_totals=cm.sum(axis=1,keepdims=True)
  cm_normalized=cm.astype('float')/np.maximum(row_totals,1)

  fig, ax = plt.subplots()
  sns.heatmap(cm_normalized,annot=True,fmt='.2f',cmap='crest',xticklabels=class_names,yticklabels=class_names,ax=ax)
  ax.set_title('Normalized Confusion Matrix')
  ax.set_ylabel('True label')
  ax.set_xlabel('Predicted label')
  ax.set_xticklabels(ax.get_xticklabels(), rotation=45,fontdict={"fontsize":7})
  ax.set_yticklabels(ax.get_yticklabels(),fontdict={"fontsize":7})
  plt.show()



In [None]:

writer = SummaryWriter(log_dir="logs-BERT")

# Train for EPOCHS epochs, checking the validation split after each one.
try:
  for i in range(EPOCHS):
    print(f"##### TRAINING EPOCH : {i+1} #####")
    train(i,log_steps=20,writer=writer)
    eval(model,val_loader,return_output=False)
finally:
  # Close the writer even if training fails, so the logged scalars are written
  # out to logs-BERT before TensorBoard reads them.
  writer.close()

# Final report on the held-out test split.
labels,predictions=eval(model,test_loader)
get_metrics(labels,predictions)
plot_confusion_matrix(labels,predictions)


In [None]:
%reload_ext tensorboard
# Open TensorBoard on the logs-BERT directory (this cell does not remove the log folder).
%tensorboard --logdir=logs-BERT