<a href="https://colab.research.google.com/github/ankitaiisc/EventFactuality/blob/master/Bert_Subjective_Annotations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Colab-only: mount the user's Google Drive at the relative path 'gdrive'
# (the recorded output below reports "Mounted at gdrive").
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


In [5]:
%cd /content/gdrive/My\ Drive/Colab\ Notebooks/EventFactuality/Neural_Modelling

/content/gdrive/My Drive/Colab Notebooks/EventFactuality/Neural_Modelling


In [None]:
!pip install transformers

In [8]:
from transformers import BertModel
import time
import torch
import argparse
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split

from typing import List, Mapping, Optional
Outputs = Mapping[str, List[torch.Tensor]]

In [None]:
def cross_entropy_with_probs(
    input: torch.Tensor,
    target: torch.Tensor,
    weight: Optional[torch.Tensor] = None,
    reduction: str = "mean",
) -> torch.Tensor:
    """Calculate cross-entropy loss when targets are probabilities (floats), not ints.

    The loss for one data point is ``-sum_c weight[c] * target[c] * log_softmax(input)[c]``.
    For one-hot targets this equals ``F.cross_entropy(input, target.argmax(1))``, so the
    function can be used as a drop-in replacement when target labels are changed
    from a 1D tensor of ints to a 2D tensor of probabilities.

    Parameters
    ----------
    input
        A [num_points, num_classes] tensor of logits
    target
        A [num_points, num_classes] tensor of probabilistic target labels. It is cast
        to the dtype of ``input`` and moved to its device before use.
    weight
        An optional [num_classes] array of weights to multiply the loss by per class
    reduction
        One of "none", "mean", "sum", indicating whether to return one loss per data
        point, the mean loss, or the sum of losses

    Returns
    -------
    torch.Tensor
        The calculated loss

    Raises
    ------
    ValueError
        If an invalid reduction keyword is submitted, if ``input`` is not 2D, or if
        ``target`` does not have the same shape as ``input``
    """
    # Validate cheap arguments first so a typo does not cost a full forward computation.
    if reduction not in ("none", "mean", "sum"):
        raise ValueError("Keyword 'reduction' must be one of ['none', 'mean', 'sum']")
    if input.dim() != 2:
        raise ValueError(
            "input must be a [num_points, num_classes] tensor, got shape {}".format(tuple(input.shape))
        )
    if target.shape != input.shape:
        raise ValueError(
            "target shape {} does not match input shape {}".format(tuple(target.shape), tuple(input.shape))
        )

    # F.cross_entropy(input, y, reduction="none") is -log_softmax(input)[:, y], so all
    # per-class losses can be obtained from a single log_softmax call.
    per_class_loss = -F.log_softmax(input, dim=1)
    if weight is not None:
        class_weights = torch.as_tensor(
            weight, dtype=per_class_loss.dtype, device=per_class_loss.device
        )
        per_class_loss = per_class_loss * class_weights

    # Align the targets with the logits so that CPU / integer targets also work.
    target = target.to(device=per_class_loss.device, dtype=per_class_loss.dtype)
    losses = (target * per_class_loss).sum(dim=1)

    if reduction == "none":
        return losses
    if reduction == "mean":
        return losses.mean()
    return losses.sum()

In [30]:
class FactualityDataset(Dataset):
    """Tab-separated factuality annotations, tokenized for BERT on access.

    Each row must provide the columns 'sentence_dialouge', 'label',
    'source_index_dialouge', 'event_index_dialouge' and the four probability
    columns 'positive', 'negative', 'uncommitted', 'not_applicable'.
    Rows containing any missing value are dropped when the file is loaded.
    """

    #Column order of the probabilistic label vector returned by __getitem__
    PROB_COLUMNS = ['positive', 'negative', 'uncommitted', 'not_applicable']

    def __init__(self, filename, maxlen):
        """Load `filename` (TSV) and prepare the tokenizer.

        filename : path of the tab-separated annotation file
        maxlen   : fixed length (in tokens, including [CLS]/[SEP]) of every returned sequence
        """
        #Store the contents of the file in a pandas dataframe, without incomplete rows
        self.df = pd.read_csv(filename, delimiter = '\t').dropna().reset_index(drop=True)

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return one example as the tuple
        (sentence, '@'-joined tokens, source_idx, event_idx, token ids, attention mask, label, prob_labels).
        """
        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'sentence_dialouge']
        label = self.df.loc[index, 'label']
        #pandas may load an integer column as float; cast so the tensors can be used as indices
        source_idx = torch.tensor(int(self.df.loc[index, 'source_index_dialouge']))
        event_idx = torch.tensor(int(self.df.loc[index, 'event_index_dialouge']))

        prob_labels = list(self.df.loc[index, self.PROB_COLUMNS])

        #Preprocessing the text to be suitable for BERT
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']
        if len(tokens) > self.maxlen:
            #Pruning the list to the specified max length, keeping a closing [SEP]
            tokens = tokens[:self.maxlen-1] + ['[SEP]']
        num_real_tokens = len(tokens)
        tokens = tokens + ['[PAD]'] * (self.maxlen - num_real_tokens) #Padding sentences

        #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        #Attention mask: 1s for real tokens and 0s for padding. It is built from the
        #token count rather than by comparing ids with 0, so it does not depend on
        #the numeric id the vocabulary assigns to [PAD].
        attn_mask = torch.zeros(len(tokens), dtype=torch.long)
        attn_mask[:num_real_tokens] = 1

        return sentence, "@".join(tokens), source_idx, event_idx, tokens_ids_tensor, attn_mask, label, torch.tensor(prob_labels)

In [10]:
import torch
import torch.nn as nn
from transformers import BertModel

class Classifier(nn.Module):
    """BERT encoder followed by a two-layer head over the (source, event) word pair.

    The word-level embeddings of the source and of the event are concatenated
    (2 x 768 = 1536 features) and mapped to 4 class logits.

    NOTE(review): cls_layer_1 and cls_layer_2 are applied back to back with no
    non-linearity in between; this is kept as in the original model.
    """

    def __init__(self, freeze_bert = True):
        super(Classifier, self).__init__()
        #Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased', \
                                                    output_hidden_states = True)

        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        #Classification layer
        self.cls_layer_1 = nn.Linear(1536, 768)
        self.cls_layer_2 = nn.Linear(768, 4)

    def get_word_embeddings(self, tokenized_text, tokenized_embedding):
        '''
        Average the sub-token embeddings to get word embeddings.

        Inputs:
            -tokenized_text : list of T token strings
            -tokenized_embedding : tensor whose first dimension indexes the same T tokens
        Returns a tensor with one row per word; a word is a token together with
        the WordPiece continuation tokens (prefix '##') that directly follow it.
        '''
        num_tokens = len(tokenized_text)
        if num_tokens > len(tokenized_embedding):
            raise ValueError(
                "got {} tokens but only {} token embeddings".format(num_tokens, len(tokenized_embedding))
            )

        word_embeddings = []
        start = 0
        while start < num_tokens:
            end = start + 1
            #Only tokens starting with '##' continue a word; a bare '#' character
            #is a token of its own and must not be merged into the previous word.
            while end < num_tokens and tokenized_text[end].startswith('##'):
                end += 1
            word_embeddings.append(tokenized_embedding[start:end].mean(dim=0))
            start = end
        return torch.stack(word_embeddings, dim=0)

    def forward(self, seq, tokens, s_idx, e_idx, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks
                          to be used to avoid contibution of PAD tokens
            -s_idx : tensor of shape [B] containing index of source in seq
            -e_idx : tensor of shape [B] containing index of event in seq
            -tokens: string containing textual tokens obtained from bert tokenizer.
                     format of string t1@t2@t3....@tn
        Returns:
            -logits : Tensor of shape [B, 4]

        NOTE(review): s_idx / e_idx are used to index the word-level embeddings
        (after sub-token merging), not the raw token positions -- confirm that the
        stored indices were computed accordingly.
        '''

        #Feeding the input to BERT model to obtain contextualized representations.
        #The output is indexed by position instead of tuple-unpacked so that both a
        #plain tuple and a transformers ModelOutput object are handled
        #(position 2 holds the per-layer hidden states when output_hidden_states=True).
        outputs = self.bert_layer(seq, attention_mask = attn_masks)
        hidden_states = outputs[2]

        #Obtaining token embeddings from last layer
        token_embeddings = hidden_states[-1]

        se_embeddings = []
        for b, (joined_tokens, emb) in enumerate(zip(tokens, token_embeddings)):
            #NOTE(review): a token that is itself '@' would be split apart here
            word_emb = self.get_word_embeddings(joined_tokens.split('@'), emb)
            source_emb = word_emb[s_idx[b]]
            event_emb = word_emb[e_idx[b]]
            se_embeddings.append(torch.cat((source_emb, event_emb)))
        se_embeddings = torch.stack(se_embeddings, dim=0)

        h1 = self.cls_layer_1(se_embeddings)
        logits = self.cls_layer_2(h1)

        return logits

In [11]:
def get_accuracy_from_logits(logits, labels):
    """Return (accuracy, predictions) for a batch.

    The predicted class of each row is the position of its largest softmax
    probability; accuracy is the fraction of rows whose prediction equals the
    (squeezed) label.
    """
    class_probs = F.softmax(logits, dim=1)
    predicted = class_probs.max(1)[1]
    is_correct = torch.eq(predicted, labels.squeeze())
    return is_correct.float().mean(), predicted

def evaluate(net, criterion, dataloader, args):
    """Run `net` over `dataloader` without gradients and collect metrics.

    Inputs:
        -net : model called as net(seq, tokens, s_idx, e_idx, attn_masks)
        -criterion : accepted for signature compatibility; not used, the loss is
                     computed with cross_entropy_with_probs on the probabilistic labels
        -dataloader : yields (sent, tokens, s_idx, e_idx, seq, attn_masks, labels, p_labels)
        -args : namespace providing `gpu`, the CUDA device index
    Returns:
        (mean accuracy over batches, mean loss over batches, and four lists with one
         entry per batch: predictions, true labels, probabilistic labels, sentences)
    Raises:
        ValueError if the dataloader yields no batches.

    Note: the network is left in eval mode when this function returns.
    """
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0
    preds = []
    true_labels = []
    prob_labels = []
    sentences = []
    with torch.no_grad():
        for sent, tokens, s_idx, e_idx, seq, attn_masks, labels, p_labels in dataloader:
            #p_labels must live on the same device as the logits for the loss below
            seq, attn_masks, labels, p_labels = seq.cuda(args.gpu), attn_masks.cuda(args.gpu), labels.cuda(args.gpu), p_labels.cuda(args.gpu)
            s_idx, e_idx = s_idx.cuda(args.gpu), e_idx.cuda(args.gpu)

            logits = net(seq, tokens, s_idx, e_idx, attn_masks)

            mean_loss += cross_entropy_with_probs(logits.squeeze(-1), p_labels).item()
            acc, pred = get_accuracy_from_logits(logits, labels)
            mean_acc += acc
            preds.append(pred.detach().cpu().numpy())
            true_labels.append(labels.detach().cpu().numpy())
            prob_labels.append(p_labels.detach().cpu().numpy())
            sentences.append(sent)
            count += 1

    if count == 0:
        raise ValueError("evaluate() received a dataloader that yielded no batches")

    #Averages are taken over batches, so a smaller final batch weighs as much as a full one
    return mean_acc / count, mean_loss / count, preds, true_labels, prob_labels, sentences

In [37]:
def train(net, criterion, opti, train_loader, val_loader, args):
    """Train `net` for `args.max_eps` epochs on `train_loader`.

    Inputs:
        -net : model called as net(seq, tokens, s_idx, e_idx, attn_masks)
        -criterion : accepted for signature compatibility; not used, the loss is
                     computed with cross_entropy_with_probs on the probabilistic labels
        -opti : optimizer stepping the parameters of `net`
        -train_loader : yields (sent, tokens, s_idx, e_idx, seq, attn_masks, labels, p_labels)
        -val_loader : accepted for signature compatibility; not used
        -args : namespace providing `gpu`, `max_eps` and `print_every`
    """
    #evaluate() leaves the network in eval mode; switch back so that
    #training-time behaviour such as dropout is active.
    net.train()

    for ep in range(args.max_eps):
        for it, (sent, tokens, s_idx, e_idx, seq, attn_masks, labels, p_labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()

            #Converting these to cuda tensors
            seq, attn_masks, labels, p_labels = seq.cuda(args.gpu), attn_masks.cuda(args.gpu), labels.cuda(args.gpu), p_labels.cuda(args.gpu)
            s_idx, e_idx = s_idx.cuda(args.gpu), e_idx.cuda(args.gpu)

            #Obtaining the logits from the model
            logits = net(seq, tokens, s_idx, e_idx, attn_masks)

            #Computing loss against the probabilistic (soft) labels
            loss = cross_entropy_with_probs(logits.squeeze(-1), p_labels)

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            #Periodic progress report on the current batch only
            if (it + 1) % args.print_every == 0:
                acc, pred = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))

In [13]:
#Hyper-parameters exposed as command-line style options. An empty argument
#list is parsed, so inside the notebook every option takes its default value.
parser = argparse.ArgumentParser('')
parser.add_argument('-gpu', default=0, type=int)            #CUDA device index
parser.add_argument('-freeze_bert', action='store_true')    #flag; False unless given
parser.add_argument('-maxlen', default=128, type=int)       #tokens per sequence
parser.add_argument('-batch_size', default=32, type=int)
parser.add_argument('-lr', default=2e-5, type=float)        #learning rate
parser.add_argument('-print_every', default=100, type=int)  #iterations between progress reports
parser.add_argument('-max_eps', default=5, type=int)        #number of training epochs
args = parser.parse_args([])

In [14]:
#Build the classifier, move it to the selected GPU and report how long that took
print("Building model! (This might take time if you are running this for first time)")
build_started = time.time()
net = Classifier(freeze_bert = args.freeze_bert)
net.cuda(device = args.gpu) #Enable gpu support for the model
print("Done in {} seconds".format(time.time() - build_started))

Building model! (This might take time if you are running this for first time)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


Done in 23.48561143875122 seconds


In [36]:
#Loss object and Adam optimizer over all parameters of the network
print("Creating criterion and optimizer objects")
setup_started = time.time()
opti = optim.Adam(params = net.parameters(), lr = args.lr)
criterion = nn.CrossEntropyLoss()
print("Done in {} seconds".format(time.time() - setup_started))

Creating criterion and optimizer objects
Done in 0.0022614002227783203 seconds


In [31]:
#Creating dataloaders
print("Creating train and val dataloaders")
st = time.time()
train_set = FactualityDataset(filename = './FactualityData/processed_data/subjective_annotations/train_data_0.7_with_idx_dialouge.csv', maxlen = args.maxlen)
val_set = FactualityDataset(filename = './FactualityData/processed_data/subjective_annotations/val_data_0.7_with_idx_dialouge.csv', maxlen = args.maxlen)
#Reshuffle the training examples at every epoch so that batches are not always
#composed of the same rows in file order; validation keeps the file order so
#that collected predictions line up with the rows of the validation file.
train_loader = DataLoader(train_set, batch_size = args.batch_size, shuffle = True, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = args.batch_size, num_workers = 5)
print("Done in {} seconds".format(time.time() - st))

Creating train and val dataloaders
Done in 0.8139009475708008 seconds


In [None]:
#Run the full training loop and report the wall-clock time it took
print("Let the training begin")
train_started = time.time()
train(
    net=net,
    criterion=criterion,
    opti=opti,
    train_loader=train_loader,
    val_loader=val_loader,
    args=args,
)
print("Done in {} seconds".format(time.time() - train_started))

Let the training begin
Iteration 100 of epoch 1 complete. Loss : 0.21243607997894287 Accuracy : 0.9375
Iteration 100 of epoch 2 complete. Loss : 0.2064167559146881 Accuracy : 0.96875
Iteration 100 of epoch 3 complete. Loss : 0.15748175978660583 Accuracy : 1.0


In [None]:
# Evaluate on the validation loader. preds, true_labels, prob_labels and sentences
# come back as lists with one entry per batch.
mean_acc, mean_loss, preds, true_labels, prob_labels, sentences = evaluate(net, criterion, val_loader, args)

In [None]:
#Flatten the per-batch lists into one array per quantity.
#The names are rebound in place, so this cell is meant to be run once per evaluation.
preds, true_labels, prob_labels, sentences = map(
    np.concatenate, (preds, true_labels, prob_labels, sentences)
)

In [None]:
#Positions whose gold label is class 1, and, within that subset, the positions
#where the prediction is anything other than class 1
neg_ids = np.nonzero(true_labels == 1)[0]
mis_clf = np.nonzero(preds[neg_ids] != 1)[0]

In [None]:
# Number of gold class-1 examples that were predicted as some other class
len(sentences[neg_ids][mis_clf])

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, classification_report, recall_score, f1_score

In [None]:
# Confusion matrix of gold labels (first argument) against predictions (second argument)
confusion_matrix(true_labels, preds)

In [None]:
# Reference class frequencies with class names; the percentages are hard-coded
# in the string below and are not computed in the visible cells.
print('Class Distribution in Orignal Dataset')
print(' Class 0: , Percentage = 17% Pos ,\n Class 1: , Percentage = 3% Neg,\n Class 2: , Percentage = 7% Uncommited, \n Class 3: , Percentage = 72% NA,')

In [None]:
# Text report of per-class metrics for the validation predictions
print(classification_report(true_labels, preds))

In [None]:
# Micro-averaged precision over all classes.
# NOTE(review): for single-label multiclass data with all labels included,
# micro-averaged precision, recall and F1 should all coincide with accuracy -- confirm
# whether a macro average would be more informative given the class imbalance.
precision_score(true_labels, preds, average='micro')

In [None]:
# Micro-averaged recall over all classes
recall_score(true_labels, preds, average='micro')

In [None]:
# Micro-averaged F1 over all classes
f1_score(true_labels, preds, average='micro')

In [None]:
# Reference class frequencies (percentages only); the values are hard-coded
# in the string below and are not computed in the visible cells.
print('Class Distribution in Orignal Dataset')
print(' Class 0: , Percentage = 17%,\n Class 1: , Percentage = 3%,\n Class 2: , Percentage = 7%, \n Class 3: , Percentage = 72%,')

#############################