## Installing the required libraries

In [None]:
!pip3 install transformers pandas numpy datasets --quiet

## IMporting all required libraries

In [45]:
import torch
from torch.utils.data import DataLoader,Dataset
import pandas as pd
from transformers import BertTokenizer
from tqdm.auto import tqdm
import pandas as pd
from collections import defaultdict
from itertools import permutations
import torch
from transformers import AdamW
from transformers import get_scheduler
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import BertModel
from tqdm.auto import tqdm
from datasets import load_metric
import itertools


## Tokenizer

We are preparing the tokenizer before working on the dataloader, we are using a max length of 256 because its an optimimal length for this dataset, Other than that I am using the tokenizer for bert-base-uncased.
I am also adding the extra tokens, those will be used to define the start and end of first and second entities.

In [46]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',padding='max_length',max_length=256,return_offsets_mapping=True)
tokenizer.add_tokens(['[e1]', '[/e1]', '[e2]', '[/e2]'])
print(tokenizer('[e1]A[/e1] and [e2]B[/e2] had a fight'))



{'input_ids': [101, 30522, 1037, 30523, 1998, 30524, 1038, 30525, 2018, 1037, 2954, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


## Class labels

I am reading the class file and placing it into a dictionary, and I am adding the Other class which will the relation for entity pairs not provided in the input data.

In [47]:
cl2int ={}
relation = 'nyt29/relations.txt'

with open(relation,'r') as rfile:
  for c,row in enumerate(rfile.readlines()):
    cl2int[row.replace("\n","")]=c

cl2int["other"]=len(cl2int)
print(cl2int)
num_labels = len(cl2int)
print(num_labels)

{'/location/administrative_division/country': 0, '/location/country/capital': 1, '/location/country/administrative_divisions': 2, '/location/neighborhood/neighborhood_of': 3, '/location/location/contains': 4, '/people/person/nationality': 5, '/people/person/place_lived': 6, '/people/deceased_person/place_of_death': 7, '/business/person/company': 8, '/location/us_state/capital': 9, '/people/person/place_of_birth': 10, '/people/person/children': 11, '/business/company/founders': 12, '/business/company/place_founded': 13, '/sports/sports_team/location': 14, '/people/person/ethnicity': 15, '/people/ethnicity/geographic_distribution': 16, '/people/person/religion': 17, '/business/company/major_shareholders': 18, '/location/province/capital': 19, '/location/br_state/capital': 20, '/business/company/advisors': 21, '/film/film_location/featured_in_films': 22, '/film/film/featured_film_locations': 23, '/location/us_county/county_seat': 24, '/time/event/locations': 25, '/people/deceased_person/p

## Dataloader

This is my dataloader for this dataset, it takes input of the sentences, the tuple file and the pointer file. it also takes a boolean variable negative to decide whether all entity pairs are to considered (negative sampling). Next for each senteance we place the entity markers in the sentence and then tokenize that sentence. for the labels I am using one hot encoding and a vector of length 30 with 1s for eeach positive class. Finally we include the positions of the entity start markers in the input which will be useful for the model to retreive positions.

In [48]:


class REloader(Dataset):

  def __init__(self,sentences,tuples,pointer,negative):
    self.negative = negative
    self.sent_rows = self.get_rows(sentences)
    self.tup_rows = self.get_rows(tuples)
    self.point_rows = self.get_rows(pointer)
    self.em_data = defaultdict(list)
    self.tokenized_data = []
    self.prepare_data()
    print(len(self.em_data))
    self.tokenize()
    

  def get_rows(self,file_path):
    output =[]
    with open(file_path,'r') as cfile:
      for row in cfile.readlines():
        # print(len(row))
        output.append(row)
    return output
  
  def prepare_data(self):
    id=0
    for c,r in enumerate(self.point_rows):

      rel_tuples = r.split(" | ")
      ents=set()
      rels=defaultdict(list)
      for rel in rel_tuples:
        e1_s,e1_e,e2_s,e2_e,rel = rel.split(" ")
        ents.add((e1_s,e1_e))
        ents.add((e2_s,e2_e))
        rel = rel.replace("\n","")
        rels[((e1_s,e1_e),(e2_s,e2_e))].append(rel)

      for (e1_start,e1_end),(e2_start,e2_end) in permutations(list(ents), 2):
        sent = self.sent_rows[c].lower().split(" ")
        sent[int(e1_start)]='[e1]'+sent[int(e1_start)]
        sent[int(e1_end)]=sent[int(e1_end)]+'[/e1]'
        sent[int(e2_start)]='[e2]'+sent[int(e2_start)]
        sent[int(e2_end)]=sent[int(e2_end)]+'[/e2]'
        format_sentance =' '.join(sent) 

        if int(e1_start)<int(e2_start):
          entities=[int(e1_start)+1,int(e2_start)+3]
        else:
          entities = [int(e1_start)+3,int(e2_start)+1]
        # if ((e1_start,e1_end),(e2_start,e2_end)) in rels:
        
        rel_label = rels[((e1_start,e1_end),(e2_start,e2_end))]
        

        if len(rel_label)<=0:
            if self.negative:
                rel_label.append("other")
            else:
                continue
        
        self.em_data[(format_sentance,*entities)].extend([cl2int[i] for i in rel_label])
     

  def tokenize(self):
    progress = tqdm(range(len(self.em_data)))
    for k,v in self.em_data.items():

      sent  = tokenizer(k[0],padding = 'max_length',truncation=True)
      e_1,e_2 = None,None
      for i,t in enumerate(tokenizer.convert_ids_to_tokens(sent['input_ids'])):
        if t =='[e1]':
          e_1 = i
        if t=='[e2]':
          e_2 =i
      assert e_1 is not None and e_2 is not None and e_1!=e_2

      entities = [e_1,e_2]
      labels = torch.zeros(num_labels)
      labels[v]=1

      output ={**sent,'entities':entities,'labels':labels}
      output ={k:torch.tensor(v) for k,v in output.items()}

      # output = {**{k:torch.tensor(v) for k,v in tokenizer(row[0],padding='max_length',truncation=True).items()},'labels':row[1]}
      self.tokenized_data.append(output)
      progress.update(1)
    self.len = len(self.tokenized_data)
    
  
  def __len__(self):
    return self.len

  def __getitem__(self,idx):
    return self.tokenized_data[idx]


# train_loader = REloader('nyt29/test.sent','nyt29/test.tup','nyt29/test.pointer',True)
# print(len(train_loader))



In [None]:
# tokenizer.convert_ids_to_tokens(train_loader[0]['input_ids'])

Creating the train_loader and train_dataloader with negative sampling True and Batch size 8

In [39]:
train_loader = REloader('nyt29/train.sent','nyt29/train.tup','nyt29/train.pointer',True)
train_dataloader = DataLoader(train_loader,shuffle=True,batch_size=8)

146588


  0%|          | 0/146588 [00:00<?, ?it/s]



## Model 

This is the code for the model, we are taking input of the new resized vocubulary because we have added extra special tokens. Other than that We are intializing our bert model, a dense layer and a dropout layer and BCE loss that also performs sigmoid. In the forward pass I am getting the encoded representation and then slicing that using the entity marker positions and concatentaing them to classify that tensor.

In [12]:


class BertForRE(nn.Module):
  def __init__(self,name,resize_len,num_labels):
    super(BertForRE, self).__init__()
    # self.num_labels = num_labels
    self.model = BertModel.from_pretrained(name)
    self.model.resize_token_embeddings(resize_len)
    self.dense = nn.Linear(self.model.config.hidden_size*2,num_labels)
    self.dropout = nn.Dropout(self.model.config.hidden_dropout_prob)
    self.loss_fn = nn.BCEWithLogitsLoss(reduction='mean')

  def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        entities=None):
      
      outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
      
      # print(outputs)
      seq_outputs = outputs[0]
      # print(seq_outputs.shape)
      # print(seq_outputs[:,entities[:,1],:].shape)
      # print(entities)
      batch_size = seq_outputs.shape[0]
      relation = torch.cat((seq_outputs[range(batch_size),entities[:,0],:],seq_outputs[range(batch_size),entities[:,1],:]),1)
      # print(relation.shape)
      # relation = relation.squeeze(1)
      relation = self.dropout(relation)
      logits = self.dense(relation)
      # print(logits.shape)
      loss = self.loss_fn(logits,labels.float())
      return loss,logits


A test run to check if the model inputs and outputs are working as expected.

In [13]:
test_bert = BertForRE('bert-base-uncased',len(tokenizer),len(cl2int.keys()))
# test_bert = BertModel.from_pretrained('bert-base-uncased')
# test_bert.resize_token_embeddings(len(tokenizer))
inputs = tokenizer("a1 zoning laws slow growth the 20-year building boom on [e2]staten island[/e2] , long [e1]new york city[/e1] 's fastest-growing borough , is decelerating dramatically , thanks to new zoning rules .",truncation=True)
print(len(inputs['input_ids']))
# inputs['labels']=[0]
print(inputs.keys())
inputs['entities'] = [14,10]
inputs['labels'] = [0]*30
inputs={k:torch.unsqueeze(torch.tensor(v),0) for k,v in inputs.items()}
out  = test_bert(**inputs)
print(out)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


44
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
(tensor(0.6897, grad_fn=<BinaryCrossEntropyWithLogitsBackward>), tensor([[ 0.0308,  0.3767, -0.2581, -0.1187,  0.5892,  0.3806,  0.0480,  0.6230,
         -0.8996, -0.8679, -0.1273, -0.3949, -0.0510,  0.3322, -0.5333,  0.3768,
         -0.0597,  0.3504, -0.3920, -0.4329,  0.3610, -0.4087,  0.0178, -0.4883,
         -0.1641,  0.3167, -0.1875, -0.1689,  0.0999,  0.2953]],
       grad_fn=<AddmmBackward>))


## training parameters

we are defining our optimizewr, scheduler and batch size

In [61]:

optimizer = AdamW(test_bert.parameters(),lr=5e-5)
num_epochs = 1
batch_size = 8
num_training_steps= num_epochs*(len(train_dataloader))
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps

)
print(num_training_steps,num_epochs,batch_size)

18324 1 8


Shifting our model to GPU

In [62]:

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
test_bert.to(device)
print(device)

cuda


## Training Loop 

This is where i am training the model, i am printing the accuracies and losses. it should be kept in mind that accuracies are only representative and actual f1 scores will be calculated below.
knowing that accuracy_1 is the accuracy of matching the 1s in the lables and accuracy_2 is matching both the 1s and 0s. in the output of training most of hte output got cut out due to length so it only shows 4300~ steps.

In [None]:



# test_bert.train()
# metric = load_metric('f1')
step=0
running_corrects_1 = 0
running_corrects_2 = 0
actual_corrects_1 = 0
logs = {}
losses = []
accs =[]
accs_2 =[]
progress_bar = tqdm(range(num_training_steps))


for _ in range(num_epochs):
  test_bert.train()
  for batch in train_dataloader:
      step+=1
      # print(step," of ",num_training_steps)
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = test_bert(**batch)
      loss,logits = outputs
      loss.backward()
      # print(loss)
      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      logits = torch.sigmoid(logits)
      logits[logits>0.5]=1
      running_corrects_1 += torch.sum(logits == batch['labels'])
      actual_corrects_1 += torch.sum(batch['labels'])
      accuracy_1 = running_corrects_1.item()/actual_corrects_1.item()
      
      logits[logits<=0.5]=0
      running_corrects_2 += torch.sum(logits == batch['labels'])
      accuracy_2 = running_corrects_2.item()/(step*batch_size*num_labels)
      # actual_corrects_1 += torch.sum(batch['labels'])
      
      logs['step']=step
      logs['log loss'] = loss.item()
      logs['accuracy'] = accuracy_1
      logs['accuracy_2']=accuracy_2

      losses.append(loss.item())
      accs.append(accuracy_1)
      accs_2.append(accuracy_2)
      print(logs)
      progress_bar.update(1)
      if step%1000==0:
        torch.save(test_bert.state_dict(),f'bertForRE/saved_3_{step}.pt')
  # bert_my.eval()
  # print("performing eval...>")
  # for ebatch in eval_dataloaderx:
  #     batch = {k: v.to(device) for k, v in ebatch.items()}
  #     with torch.no_grad():
  #         outputs = bert_my(**batch)

  #     _,logits = outputs
  #     predictions = torch.argmax(logits, dim=-1)
  #     metric.add_batch(predictions=predictions, references=batch["labels"])
      

  # print(metric.compute())
print("training complete")
torch.save(test_bert.state_dict(),'bertForRE/saved_3.pt')

  0%|          | 0/18324 [00:00<?, ?it/s]

{'step': 1, 'log loss': 0.6799113154411316, 'accuracy': 0.25, 'accuracy_2': 0.5916666666666667}
{'step': 2, 'log loss': 0.6128765940666199, 'accuracy': 0.4117647058823529, 'accuracy_2': 0.675}
{'step': 3, 'log loss': 0.5788697004318237, 'accuracy': 0.28, 'accuracy_2': 0.7152777777777778}
{'step': 4, 'log loss': 0.5065397024154663, 'accuracy': 0.24242424242424243, 'accuracy_2': 0.7677083333333333}
{'step': 5, 'log loss': 0.44227609038352966, 'accuracy': 0.19047619047619047, 'accuracy_2': 0.8033333333333333}
{'step': 6, 'log loss': 0.3879002332687378, 'accuracy': 0.2, 'accuracy_2': 0.8319444444444445}
{'step': 7, 'log loss': 0.3433133661746979, 'accuracy': 0.20689655172413793, 'accuracy_2': 0.8523809523809524}
{'step': 8, 'log loss': 0.3206348717212677, 'accuracy': 0.21212121212121213, 'accuracy_2': 0.8677083333333333}
{'step': 9, 'log loss': 0.3006373941898346, 'accuracy': 0.2, 'accuracy_2': 0.8777777777777778}
{'step': 10, 'log loss': 0.2554560601711273, 'accuracy': 0.21686746987951808

## Evaluation Blocks

We are loading the Test and Validation splits, then running the model on them and calculating the f1 score. While evaluation negative sampling has been disabled ( all entity pairs are not considered ) the f1 score will be calculated on the provided data only. Also while calculating the score we are using only the first 29 classes that is the last class "Other" will not be considered.

In [9]:
test_loader = REloader('nyt29/test.sent','nyt29/test.tup','nyt29/test.pointer',False)
test_dataloader = DataLoader(test_loader,shuffle=True,batch_size=8)

9412


  0%|          | 0/9412 [00:00<?, ?it/s]



In [54]:
val_loader = REloader('nyt29/dev.sent','nyt29/dev.tup','nyt29/dev.pointer',False)
val_dataloader = DataLoader(val_loader,shuffle=True,batch_size=8)

8138


  0%|          | 0/8138 [00:00<?, ?it/s]



In [58]:
train_loader_2 = REloader('nyt29/train.sent','nyt29/train.tup','nyt29/train.pointer',False)
train_dataloader_2 = DataLoader(train_loader_2,shuffle=True,batch_size=8)

73392


  0%|          | 0/73392 [00:00<?, ?it/s]



Loading the model from memory 

In [50]:
import torch
import numpy as np 
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
my_trained = BertForRE('bert-base-uncased',len(tokenizer),len(cl2int.keys()))
my_trained.load_state_dict(torch.load('bertForRE/best_model.pt'))
my_trained.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForRE(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30526, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (d

In [49]:
def evaluate(X,Y):
    X = [i.cpu().detach().numpy() for i in X]
    X = np.concatenate(X)
    Y = [i.cpu().detach().numpy() for i in Y]
    Y = np.concatenate(Y)
    f1 = f1_score(Y, X, labels=list(range(29)),average='micro')
    return f1

In [51]:
from datasets import load_metric
my_trained.eval()

progress_bar = tqdm(range(len(test_dataloader)))
print("Running Model on TEST ...>")
step = 0 
X=[]
Y=[]
for ebatch in test_dataloader:
    batch = {k: v.to(device) for k, v in ebatch.items()}
    with torch.no_grad():
        outputs = my_trained(**batch)
    step+=1
    _,logits = outputs
    logits = torch.sigmoid(logits)
    
    # label = batch['label']
    logits[logits>0.5]=1
    logits[logits<=0.5]=0
        
    X.append(logits)
    Y.append(batch['labels'])
    progress_bar.update(1)


  0%|          | 0/612 [00:00<?, ?it/s]

Running Model on TEST ...>


In [52]:
print(f"f1 score recieved on test {evaluate(X,Y)}")

f1 score recieved on test 0.902086383601757


In [55]:
my_trained.eval()

progress_bar = tqdm(range(len(val_dataloader)))
print("performing eval on VAL split...>")
step = 0 
X_val=[]
Y_val=[]
for ebatch in val_dataloader:
    batch = {k: v.to(device) for k, v in ebatch.items()}
    with torch.no_grad():
        outputs = my_trained(**batch)
    step+=1
    _,logits = outputs
    logits = torch.sigmoid(logits)
    
    # label = batch['label']
    logits[logits>0.5]=1
    logits[logits<=0.5]=0
        
    X_val.append(logits)
    Y_val.append(batch['labels'])
    progress_bar.update(1)


  0%|          | 0/1018 [00:00<?, ?it/s]

performing eval on VAL split...>


In [57]:
print(f"f1 score recieved on validation {evaluate(X_val,Y_val)}")

f1 score recieved on validation 0.9641420526376279


In [59]:
my_trained.eval()

progress_bar = tqdm(range(len(train_dataloader_2)))
print("performing eval on TRAIN split...>")
step = 0 
X_train=[]
Y_train=[]
for ebatch in train_dataloader_2:
    batch = {k: v.to(device) for k, v in ebatch.items()}
    with torch.no_grad():
        outputs = my_trained(**batch)
    step+=1
    _,logits = outputs
    logits = torch.sigmoid(logits)
    
    # label = batch['label']
    logits[logits>0.5]=1
    logits[logits<=0.5]=0
        
    X_train.append(logits)
    Y_train.append(batch['labels'])
    progress_bar.update(1)
# print("results on train split")

  0%|          | 0/9174 [00:00<?, ?it/s]

performing eval on TRAIN split...>


In [60]:
print(f"f1 score recieved on train {evaluate(X_train,Y_train)}")

f1 score recieved on train 0.975470498474059
