In [None]:
# Mount Google Drive (Colab only) so the /content/drive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# BERT

In [None]:
import torch
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Seed every random number generator used in this notebook so runs are repeatable.
SEED = 1111
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # explicit for all GPUs; silently ignored without CUDA
# Use deterministic cuDNN kernels and switch off the autotuner, which may
# otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
!pip install transformers



In [None]:
from transformers import BertTokenizer
# Load the pretrained tokenizer of the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Cell output: the length of the tokenizer (its vocabulary size).
len(tokenizer)

30522

In [None]:
# Special-token strings exposed by the tokenizer.
cls_token, sep_token, pad_token, unk_token = (
    tokenizer.cls_token,
    tokenizer.sep_token,
    tokenizer.pad_token,
    tokenizer.unk_token,
)
print(cls_token, sep_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [None]:
# Vocabulary ids of the special tokens, as reported by the tokenizer.
cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx = (
    tokenizer.cls_token_id,
    tokenizer.sep_token_id,
    tokenizer.pad_token_id,
    tokenizer.unk_token_id,
)
print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [None]:
# Upper bound on the number of tokens split_and_cut keeps per example
# (presumably BERT's maximum sequence length — confirm).
max_input_length = 512

def tokenize_bert(sentence):
    """Tokenize ``sentence`` with the module-level ``tokenizer`` and return the result."""
    return tokenizer.tokenize(sentence)
def split_and_cut(sentence):
    """Split the stripped ``sentence`` on single spaces, keeping at most ``max_input_length`` pieces."""
    return sentence.strip().split(" ")[:max_input_length]

def trim_sentence(sent, max_words=256):
    """Keep only the first ``max_words`` whitespace-separated words of ``sent``.

    Args:
        sent: Sentence to trim. Non-string values (e.g. ``NaN`` from a missing
            cell) are returned unchanged.
        max_words: Maximum number of words to keep; defaults to 256.

    Returns:
        The kept words joined by single spaces (runs of whitespace are
        collapsed), or ``sent`` itself when it is not a string.
    """
    # Explicit type check instead of a bare ``except`` so that unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(sent, str):
        return sent
    return " ".join(sent.split()[:max_words])

In [None]:
def get_sent1_token_type(sent):
    """Return a list of ``0``s, one per element of ``sent``.

    Used for the token type ids of the first sentence. Values that have no
    length (e.g. ``None`` or ``NaN``) yield an empty list.
    """
    try:
        return [0] * len(sent)
    except TypeError:
        # len() is undefined for this value; catch only that case rather than
        # using a bare ``except``.
        return []
#Get list of 1s
def get_sent2_token_type(sent):
    """Return a list of ``1``s, one per element of ``sent``.

    Used for the token type ids of the second sentence; this notebook also
    uses it to build the all-ones attention mask. Values that have no length
    (e.g. ``None`` or ``NaN``) yield an empty list.
    """
    try:
        return [1] * len(sent)
    except TypeError:
        # len() is undefined for this value; catch only that case rather than
        # using a bare ``except``.
        return []
#combine from lists
def combine_seq(seq):
    """Join the string items of ``seq`` into one space-separated string."""
    separator = " "
    return separator.join(seq)
#combines from lists of int
def combine_mask(mask):
    """Convert each item of ``mask`` with ``str`` and join the results with single spaces."""
    return " ".join(map(str, mask))

## **Data Preparation**

In [None]:
#convert jsonl from DINO to csv/tsv
#df = pd.read_json('/content/drive/MyDrive/NLP_Dissertation/DINO/output/nc-nli-1-combined/sts-nc-nli-comb-dataset-pp.jsonl', lines=True)
##df.head()
#df = df.rename({"text_a":"sentence1","text_b":"sentence2"}, axis='columns')
#df['gold_label'] = np.where(df['label'] > 0.5, 'entailment', (np.where(df['label'] == 0.5, 'neutral', 'contradiction')))
##df = df.sort_values('index')
#df.to_csv('/content/drive/MyDrive/NLP_Dissertation/BERT_NLI_Data_Selection/Dataset/nc-nli-1/nc-nli-1.csv', index=False)

In [None]:
#split the dataset
##df_train, df_dev, df_test = np.split(df.sample(frac=1, random_state=27), [int(.7*len(df)), int(.85*len(df))])

In [None]:
# Load the nc-nli-1 dataset CSV from Drive.
df = pd.read_csv("/content/drive/MyDrive/NLP_Dissertation/BERT_NLI_Data_Selection/Dataset/nc-nli-1/nc-nli-1.csv")
# Print the columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3906 entries, 0 to 3905
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sentence1   3906 non-null   object 
 1   sentence2   3906 non-null   object 
 2   label       3906 non-null   float64
 3   index       3906 non-null   int64  
 4   gold_label  3906 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 152.7+ KB


In [None]:
# Number of examples per gold label.
df['gold_label'].value_counts()

entailment       1329
neutral          1316
contradiction    1261
Name: gold_label, dtype: int64

In [None]:
# Hold out 15% of the data as the test set, stratified by gold label.
df_train_set, df_test = train_test_split(df, test_size=0.15, random_state=27, stratify=df['gold_label'])
# Split 15% of the remaining rows off as the dev set, again stratified by gold label.
df_train, df_dev = train_test_split(df_train_set, test_size=0.15, random_state=27, stratify=df_train_set['gold_label'])

In [None]:
# Keep only the necessary columns. ``.copy()`` makes each split an independent
# frame, so the column assignments in later cells do not write onto a slice of
# ``df`` (which triggers pandas' SettingWithCopyWarning).
df_train = df_train[['gold_label','sentence1','sentence2']].copy()
df_dev = df_dev[['gold_label','sentence1','sentence2']].copy()
df_test = df_test[['gold_label','sentence1','sentence2']].copy()

In [None]:
def prepare_bert_inputs(frame):
    """Turn a frame of raw sentence pairs into space-joined BERT input columns.

    The same steps were previously copy-pasted for the train, dev and test
    frames; they now live in one place.

    Args:
        frame: DataFrame with ``gold_label``, ``sentence1`` and ``sentence2``
            columns. It is not modified.

    Returns:
        A new DataFrame (same index) with the columns ``gold_label``,
        ``sequence`` (tokens of "[CLS] sentence1 [SEP] sentence2 [SEP]"),
        ``attention_mask`` (one ``1`` per token) and ``token_type`` (``0`` per
        first-segment token, ``1`` per second-segment token). The last three
        are space-joined strings so they can be written to CSV and split again
        by the torchtext fields.
    """
    # Trim each sentence up to the maximum length.
    sent1 = frame['sentence1'].apply(trim_sentence)
    sent2 = frame['sentence2'].apply(trim_sentence)

    # Add the [CLS] and [SEP] markers, then apply the BERT tokenizer.
    sent1_tokens = ('[CLS] ' + sent1 + ' [SEP] ').apply(tokenize_bert)
    sent2_tokens = (sent2 + ' [SEP]').apply(tokenize_bert)

    # Token type ids for each segment.
    sent1_types = sent1_tokens.apply(get_sent1_token_type)
    sent2_types = sent2_tokens.apply(get_sent2_token_type)

    # Adding two Series of lists concatenates the lists row by row.
    sequence = sent1_tokens + sent2_tokens
    # get_sent2_token_type yields one 1 per token, i.e. the attention mask.
    attention_mask = sequence.apply(get_sent2_token_type)
    token_type = sent1_types + sent2_types

    # Store everything as space-joined strings to be easily fed into torchtext Fields.
    return pd.DataFrame({
        'gold_label': frame['gold_label'],
        'sequence': sequence.apply(combine_seq),
        'attention_mask': attention_mask.apply(combine_mask),
        'token_type': token_type.apply(combine_mask),
    })


df_train = prepare_bert_inputs(df_train)
df_dev = prepare_bert_inputs(df_dev)
df_test = prepare_bert_inputs(df_test)

In [None]:
# Keep only the rows whose gold label is one of the three NLI classes.
nli_labels = ['entailment','contradiction','neutral']
df_train = df_train.loc[df_train['gold_label'].isin(nli_labels)]
df_dev = df_dev.loc[df_dev['gold_label'].isin(nli_labels)]
df_test = df_test.loc[df_test['gold_label'].isin(nli_labels)]

**NC-NLI-1**

In [None]:
# Write the three preprocessed splits to Drive (without the index column);
# these files are read back by TabularDataset.splits below.
df_train.to_csv('/content/drive/MyDrive/NLP_Dissertation/BERT_NLI_Data_Selection/Dataset/nc-nli-1/nli_1.0_train.csv', index=False)
df_dev.to_csv('/content/drive/MyDrive/NLP_Dissertation/BERT_NLI_Data_Selection/Dataset/nc-nli-1/nli_1.0_dev.csv', index=False)
df_test.to_csv('/content/drive/MyDrive/NLP_Dissertation/BERT_NLI_Data_Selection/Dataset/nc-nli-1/nli_1.0_test.csv', index=False)

In [None]:
def convert_to_int(tok_ids):
    """Return a new list holding ``int(x)`` for every item ``x`` of ``tok_ids``."""
    return list(map(int, tok_ids))

In [None]:
from torchtext.legacy import data
# Field for the token sequence: the CSV column is split on spaces (and cut to
# max_input_length) by split_and_cut, then mapped to ids by the BERT tokenizer;
# no torchtext vocabulary is built. Padding/unknown use the tokenizer's ids.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)
# Field for the label.
LABEL = data.LabelField()
# Field for the attention mask: space-separated "1"s converted to ints.
# NOTE(review): padding reuses pad_token_idx, which printed as 0 above — so
# padded positions are presumably masked out; confirm this is intended.
ATTENTION = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = convert_to_int,
                  pad_token = pad_token_idx)
# Field for the token type ids: space-separated "0"/"1" converted to ints.
# NOTE(review): padded positions get token type 1; presumably harmless because
# those positions carry attention mask 0 — confirm.
TTYPE = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = convert_to_int,
                  pad_token = 1)

# Order matches the CSV columns written above: gold_label, sequence, attention_mask, token_type.
fields = [('label', LABEL), ('sequence', TEXT), ('attention_mask', ATTENTION), ('token_type', TTYPE)]

**NC-NLI-1**

In [None]:
# Read the three CSV splits written earlier, applying `fields` column by column
# and skipping each file's header row.
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = '/content/drive/MyDrive/NLP_Dissertation/BERT_NLI_Data_Selection/Dataset/nc-nli-1',
                                        train = 'nli_1.0_train.csv',
                                        validation = 'nli_1.0_dev.csv',
                                        test = 'nli_1.0_test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)

In [None]:
# Number of training examples; used later to compute the warmup step count.
train_data_len = len(train_data)
# Build the label vocabulary from the training split only.
LABEL.build_vocab(train_data)

BATCH_SIZE = 16
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split, all with the same batch size; sort_key is the
# sequence length (presumably so batches group similar lengths — confirm).
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.sequence),
    sort_within_batch = False, 
    device = device)

In [None]:
from transformers import BertModel
# Load the pretrained 'bert-base-uncased' encoder that BERTNLIModel wraps.
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import torch.nn as nn
class BERTNLIModel(nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        bert_model: Encoder module; ``config.to_dict()['hidden_size']`` gives
            the input width of the classifier.
        hidden_dim: Accepted for interface compatibility; not used.
        output_dim: Number of logits produced per example.
    """

    def __init__(self, bert_model, hidden_dim, output_dim):
        super().__init__()
        self.bert = bert_model
        encoder_width = bert_model.config.to_dict()['hidden_size']
        self.out = nn.Linear(encoder_width, output_dim)

    def forward(self, sequence, attn_mask, token_type):
        """Run the encoder and classify element ``[1]`` of its output."""
        encoder_outputs = self.bert(
            input_ids=sequence,
            attention_mask=attn_mask,
            token_type_ids=token_type,
        )
        # Element 1 is presumably BERT's pooled [CLS] representation — confirm.
        pooled = encoder_outputs[1]
        return self.out(pooled)

In [None]:
# HIDDEN_DIM is passed to the constructor, which does not use it.
HIDDEN_DIM = 512
# One output logit per entry of the label vocabulary.
OUTPUT_DIM = len(LABEL.vocab)
# Build the classifier around the pretrained encoder and move it to the device.
model = BERTNLIModel(bert_model,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                        ).to(device)

In [None]:
from transformers import AdamW, get_constant_schedule_with_warmup
import torch.optim as optim
# AdamW over all model parameters: learning rate 2e-5, eps 1e-6, bias correction disabled.
optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-6,correct_bias=False)
def get_scheduler(optimizer, warmup_steps):
    """Build a constant-with-warmup learning-rate schedule for ``optimizer``.

    ``warmup_steps`` is forwarded as ``num_warmup_steps``.
    """
    return get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
 
# Cross-entropy loss, used by both train() and evaluate(); moved to the selected device.
criterion = nn.CrossEntropyLoss().to(device)

def categorical_accuracy(preds, y):
    """Return the fraction of rows of ``preds`` whose arg-max along dim 1 equals ``y``.

    The result is a 0-dim float tensor.
    """
    predicted_classes = preds.argmax(dim=1)
    hits = predicted_classes.eq(y).float()
    return hits.sum() / len(y)

# Maximum gradient norm passed to clip_grad_norm_ inside train().
max_grad_norm = 1

def train(model, iterator, optimizer, criterion, scheduler):
    """Run one training epoch and return the mean batch loss and accuracy.

    Args:
        model: Called as ``model(sequence, attn_mask, token_type)``; returns logits.
        iterator: Sized iterable of batches exposing ``sequence``,
            ``attention_mask``, ``token_type`` and ``label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss function called as ``criterion(predictions, label)``.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.

    Returns:
        ``(mean_loss, mean_accuracy)``, averaged over the number of batches.

    Notes:
        Reads the module-level ``mp`` flag and ``max_grad_norm``. When ``mp``
        is true the mixed-precision branch also needs a module-level ``amp``
        object (e.g. NVIDIA apex's ``amp``), which this notebook never imports.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()  # clear gradients first
        torch.cuda.empty_cache()  # releases all unoccupied cached memory

        sequence = batch.sequence
        attn_mask = batch.attention_mask
        token_type = batch.token_type
        label = batch.label

        predictions = model(sequence, attn_mask, token_type)

        loss = criterion(predictions, label)

        acc = categorical_accuracy(predictions, label)

        if mp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
        else:
            loss.backward()
            # Clip on the regular path as well: previously max_grad_norm was
            # only applied in the mixed-precision branch, so with mp = False
            # the gradients were never clipped.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without tracking gradients.

    Puts the model in eval mode and returns ``(mean_loss, mean_accuracy)``,
    both averaged over the number of batches.
    """
    model.eval()

    batch_losses = []
    batch_accs = []

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.sequence, batch.attention_mask, batch.token_type)
            batch_losses.append(criterion(predictions, batch.label).item())
            batch_accs.append(categorical_accuracy(predictions, batch.label).item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
import time
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns ``(minutes, seconds)`` as ints; fractional seconds are truncated.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    return whole_minutes, int(elapsed - (whole_minutes * 60))

import math
# Training configuration.
N_EPOCHS = 3
mp = False  # mixed-precision flag read by train(); the AMP branch stays disabled
warmup_percent = 0.2
# Optimizer steps over the whole run. Every epoch takes
# ceil(n_examples / batch_size) steps because the final, partial batch still
# triggers a step; applying ceil to the product instead (as before)
# undercounts by up to N_EPOCHS - 1 steps.
steps_per_epoch = math.ceil(train_data_len / BATCH_SIZE)
total_steps = N_EPOCHS * steps_per_epoch
warmup_steps = int(total_steps*warmup_percent)
scheduler = get_scheduler(optimizer, warmup_steps)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, scheduler)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves, so 'bert-nli.pt'
    # always holds the best epoch seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert-nli.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 46s
	Train Loss: 1.040 | Train Acc: 44.44%
	 Val. Loss: 0.989 |  Val. Acc: 50.20%
Epoch: 02 | Epoch Time: 0m 47s
	Train Loss: 0.789 | Train Acc: 64.01%
	 Val. Loss: 0.886 |  Val. Acc: 59.18%
Epoch: 03 | Epoch Time: 0m 48s
	Train Loss: 0.456 | Train Acc: 81.71%
	 Val. Loss: 1.181 |  Val. Acc: 55.47%


In [None]:
# Restore the checkpoint saved at the best validation loss, then score it on the test split.
model.load_state_dict(torch.load('bert-nli.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.924 |  Test Acc: 55.37%


In [None]:
def predict_inference(premise, hypothesis, model, device):
    """Predict the NLI label for a single premise/hypothesis pair.

    Args:
        premise: First sentence (string).
        hypothesis: Second sentence (string).
        model: Called as ``model(input_ids, attention_mask, token_type_ids)``;
            returns logits of shape ``(1, n_classes)``.
        device: Device the input tensors are moved to.

    Returns:
        The label string ``LABEL.vocab.itos[argmax]``.

    Notes:
        The input is built the same way as the training data
        ("[CLS] premise [SEP] hypothesis [SEP]") and cut to
        ``max_input_length`` tokens, the same cut ``split_and_cut`` applies
        during training, so over-long pairs no longer exceed the model's
        maximum sequence length.
    """
    model.eval()

    prem_t = tokenize_bert('[CLS] ' + premise + ' [SEP]')
    hypo_t = tokenize_bert(hypothesis + ' [SEP]')

    indexes = tokenizer.convert_tokens_to_ids(prem_t + hypo_t)
    indexes_type = get_sent1_token_type(prem_t) + get_sent2_token_type(hypo_t)

    # Apply the training-time length limit to the ids and the token types.
    indexes = indexes[:max_input_length]
    indexes_type = indexes_type[:max_input_length]
    attn_mask = [1] * len(indexes)  # a single unpadded example: attend to every token

    indexes = torch.LongTensor(indexes).unsqueeze(0).to(device)
    indexes_type = torch.LongTensor(indexes_type).unsqueeze(0).to(device)
    attn_mask = torch.LongTensor(attn_mask).unsqueeze(0).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = model(indexes, attn_mask, indexes_type)

    return LABEL.vocab.itos[prediction.argmax(dim=-1).item()]

# **evaluation on nli_probe**

In [None]:
# Load the nli_probe evaluation set (Excel file) from Drive.
df_test_pred = pd.read_excel('/content/drive/MyDrive/NLP_Dissertation/Datasets/nli_probe.xlsx')

In [None]:
# Predict a label for every sentence pair of the evaluation frame, in row order.
pred = [
    predict_inference(df_test_pred['sentence1'][i], df_test_pred['sentence2'][i], model, device)
    for i in range(0, len(df_test_pred))
]

In [None]:
# Accuracy of the predictions against the 'label' column.
acc = accuracy_score(df_test_pred['label'],pred)
acc

0.7716417910447761

In [None]:
# Pass the label order explicitly so the matrix is always 3x3 and its
# rows/columns line up with the names below, even when a class is missing from
# both the gold labels and the predictions. Rows are gold labels, columns are
# predicted labels.
class_names = ['contradiction', 'entailment', 'neutral']
cm = pd.DataFrame(confusion_matrix(df_test_pred['label'], pred, labels=class_names),
                  columns=class_names,
                  index=class_names)
print(cm)

               contradiction  entailment  neutral
contradiction              0           0        0
entailment                 2         517      151
neutral                    0           0        0


In [None]:
#for i,j in zip(list(df_test_pred['gold_label']), pred):
#  print(i,j)

# Collect every example whose predicted label differs from its gold label.
gold_labels = list(df_test_pred['label'])
pred_data = [
    [df_test_pred['sentence1'][index], df_test_pred['sentence2'][index], df_test_pred['label'][index], pred[index]]
    for index, (first, second) in enumerate(zip(gold_labels, pred))
    if first != second
]

df_mis = pd.DataFrame(pred_data, columns=['sent1', 'sent2', 'gold_lbl', 'pred_lbl'])

In [None]:
# Display the misclassified nli_probe examples.
df_mis

Unnamed: 0,sent1,sent2,gold_lbl,pred_lbl
0,"So... be brave, be ready to change your views....","So... be brave, be ready to change your views....",entailment,neutral
1,We also have a MacLarens baby buggy and pottie...,We also have a MacLarens pram and potties and ...,entailment,neutral
2,"However, Whitehall thinks that I am a thorough...","However, Whitehall thinks that I am a thorough...",entailment,neutral
3,Prince came onboard to have a large benign tum...,Prince came onboard to have a large noncancero...,entailment,neutral
4,You may occasionally enjoy a good spirited deb...,You may occasionally enjoy a good spirited deb...,entailment,neutral
...,...,...,...,...
148,"daily , but other information e.g. on the term...","daily , but other information e.g. on the term...",entailment,neutral
149,Arts Employment How to Apply Entry only by app...,Arts Employment How to Apply Entry only by app...,entailment,neutral
150,"documents A : Plans , Decision Notices ( compl...","documents A : Plans , Decision Notices ( compl...",entailment,neutral
151,feel as if this baby is more important than th...,feel as if this baby is more important than th...,entailment,neutral


# **evaluation on nli_pet**

In [None]:
# Load the nli_pet evaluation set (CSV file) from Drive; this replaces the nli_probe frame.
df_test_pred = pd.read_csv('/content/drive/MyDrive/NLP_Dissertation/Datasets/nli_pet.csv')

In [None]:
# Predict a label for every sentence pair of the evaluation frame, in row order.
pred = [
    predict_inference(df_test_pred['sentence1'][i], df_test_pred['sentence2'][i], model, device)
    for i in range(0, len(df_test_pred))
]

In [None]:
# Accuracy of the predictions against the 'gold_label' column.
acc = accuracy_score(df_test_pred['gold_label'],pred)
acc

0.43134328358208956

In [None]:
# Pass the label order explicitly so the matrix is always 3x3 and its
# rows/columns line up with the names below, even when a class is missing from
# both the gold labels and the predictions. Rows are gold labels, columns are
# predicted labels.
class_names = ['contradiction', 'entailment', 'neutral']
cm = pd.DataFrame(confusion_matrix(df_test_pred['gold_label'], pred, labels=class_names),
                  columns=class_names,
                  index=class_names)
print(cm)

               contradiction  entailment  neutral
contradiction              2         249       21
entailment                 0         279        8
neutral                    1         102        8


In [None]:
#for i,j in zip(list(df_test_pred['gold_label']), pred):
#  print(i,j)

# Collect every example whose predicted label differs from its gold label.
gold_labels = list(df_test_pred['gold_label'])
pred_data = [
    [df_test_pred['sentence1'][index], df_test_pred['sentence2'][index], df_test_pred['gold_label'][index], pred[index]]
    for index, (first, second) in enumerate(zip(gold_labels, pred))
    if first != second
]

df_mis = pd.DataFrame(pred_data, columns=['sent1', 'sent2', 'gold_lbl', 'pred_lbl'])

In [None]:
# Display the misclassified nli_pet examples.
df_mis

Unnamed: 0,sent1,sent2,gold_lbl,pred_lbl
0,Things that happened almost a year ago is anci...,Ancient history is always literally a history ...,contradiction,entailment
1,The days of triple-digit growth now seem like ...,Ancient history is always literally a history ...,contradiction,entailment
2,"So... be brave, be ready to change your views....",Ancient history is always literally a history ...,contradiction,entailment
3,One month later still another bad hat came al...,A bad hat is always literally a hat that is bad.,contradiction,entailment
4,"The jury heard the evidence presented, that he...",A bad hat is always literally a hat that is bad.,contradiction,entailment
...,...,...,...,...
376,This is seen as an acid test of the Government...,An acid test is always literally a test that i...,contradiction,entailment
377,are never implemented . Redistribution of cent...,An acid test is always literally a test that i...,contradiction,entailment
378,moment for him and I felt he came out of it ve...,An acid test is always literally a test that i...,contradiction,entailment
379,"such markers ' whilst being ungodly even , unc...",An acid test is always literally a test that i...,contradiction,entailment


# References

1. https://github.com/codemunic/Natural-Language-Inference

2. Devlin, J., Chang, M.-W., Lee, K., and Toutanova, K. BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018).

3. https://github.com/bentrevett/pytorch-sentiment-analysis