In [1]:
import pandas as pd
import os, time
import shutil
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
tf.get_logger().setLevel('ERROR')
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import BertForSequenceClassification, BertTokenizerFast, LongformerForSequenceClassification

# specify GPU
device = torch.device("cuda")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv("./final_train.csv")
train_label_df = pd.read_csv("./final_train_labels.csv")
train_df = train_df[['query', 'p1', 'p2', 'p3']]
train_label_df = train_label_df[['label']]

val_df = pd.read_csv("./final_val_text.csv")
val_label_df = pd.read_csv("./final_val_labels.csv")
val_df = val_df[['query', 'p1', 'p2', 'p3']]
val_label_df = val_label_df[['label']]

test_df = pd.read_csv("./final_test_text.csv")
test_label_df = pd.read_csv("./final_test_labels.csv")
test_df = test_df[['query', 'p1', 'p2', 'p3']]
test_label_df = test_label_df[['label']]


df_train = pd.concat([train_df, train_label_df], axis=1)
df_val = pd.concat([val_df, val_label_df], axis=1)
df_test = pd.concat([test_df, test_label_df], axis=1)


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import os
from transformers import BertTokenizer

class base_case_test_dataset(Dataset):
    def __init__(self, test_df):
        self.rejected = 0
        self.count = 0
        print("Loading Datasets")
        self.label_dict = {0: 0, 1: 1}
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.test_data = self.load_data(self.test_df)
        print("Datasets loaded")
        
    def load_data(self, df):
        def create_ids_and_mask(premise, label, passage=None):
            if (passage):
                outputs = self.tokenizer([[premise, passage]], add_special_tokens=True, max_length = 512, pad_to_max_length=True, truncation=True,)
                attention_mask_ids = outputs['attention_mask'][0]
                token_ids = outputs['input_ids'][0]
                return token_ids, attention_mask_ids, label
            else:
                outputs = self.tokenizer([premise], add_special_tokens=True, max_length = 512, pad_to_max_length=True, truncation=True,)
                attention_mask_ids = outputs['attention_mask'][0]
                token_ids = outputs['input_ids'][0]
                return token_ids, attention_mask_ids, label
        
        MAX_LEN = 512 - 3
        final_token_ids = []
        final_mask_ids = []
        final_y = []
    
        premise_list = df['query'].to_list()
        hypothesis1_list = df['p1'].to_list()
        hypothesis2_list = df['p2'].to_list()
        hypothesis3_list = df['p3'].to_list()
        label_list = df['label'].to_list()
        
        for (premise, p1, p2, p3, label) in zip(premise_list, hypothesis1_list, hypothesis2_list, hypothesis3_list, label_list):
            len1 = len(premise) + len(p1)
            len2 = len(premise)+ len(p2)
            len3 = len(premise)+ len(p3)
            if (len1 <= MAX_LEN and len2 <= MAX_LEN and len3 <= MAX_LEN):
                outputs = self.tokenizer([premise], add_special_tokens=True, max_length = 512, pad_to_max_length=True, truncation=True,)
                attention_mask_ids = outputs['attention_mask'][0]
                token_ids = outputs['input_ids'][0]
                final_token_ids.append(token_ids)
                final_mask_ids.append(attention_mask_ids)
                final_y.append(label)
                
            else:
                self.rejected += 1
            
        final_token_ids = torch.tensor(final_token_ids)
        final_mask_ids = torch.tensor(final_mask_ids)
        final_y = torch.tensor(final_y)

        dataset = TensorDataset(final_token_ids, final_mask_ids, final_y)
        print("Rejected:",self.rejected)
        print("Dataset size:", df.shape[0]-self.rejected)
        return dataset
        
    def get_data_loaders(self, bs, shuffle=True):
        print("Generating dataloader")
        return DataLoader(
          self.test_data,
          shuffle=shuffle,
          batch_size=bs,
        )

full_test_df = pd.concat([test_df, val_df], axis=0)
full_test_label = pd.concat([test_label_df, val_label_df], axis=0)
df_test = pd.concat([full_test_df, full_test_label], axis=1)

base_case_test_ds = base_case_test_dataset(df_test)

Loading Datasets
Rejected: 819
Dataset size: 6590
Datasets loaded


In [12]:
test_dataset = base_case_test_ds.get_data_loaders(bs=1)

Generating dataloader


In [40]:
def predict(model, loader, path=None):
    torch.cuda.empty_cache()
    if path: model.load_state_dict(torch.load(path))
    torch.cuda.empty_cache()
    
    all_preds = []
    all_labels = []
    model.eval()
    
    with torch.no_grad():
        for step,batch in enumerate(loader):
            sent_id, mask, label = [r.to(device) for r in batch]
            logits = model(input_ids=sent_id, attention_mask=mask, labels=label)
            preds = logits[1].max(1).indices
            preds = preds.detach().cpu().numpy()
            
            batch_labels = label.detach().cpu().numpy()
            all_labels.append(batch_labels[0])
            all_preds.append(preds[0])
            
    return all_preds, all_labels
    
def generate_result(all_preds, test_y):
    preds_t = torch.tensor(all_preds, dtype=int)
    new_y = torch.tensor(test_y, dtype=int)
    
    correct = (new_y == preds_t)
    accuracy = correct.sum() / correct.size()[0]
    
    print(accuracy)
    print(classification_report(test_y, preds_t))

In [41]:
model.load_state_dict(torch.load('./bert_single_query_new.pt'))
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [49]:
# Seed
import random
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

In [48]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer

# data set generator for single query
class TrainingDataBert(Dataset):
    def __init__(self, train_df, val_df, tokenizer):
        print("Loading Datasets")
        self.train_df = train_df
        self.val_df = val_df

        self.rejected = 0
        self.tokenizer = tokenizer

        self.train_data = None
        self.val_data = None
        self.count = 0
        self.init_data()
        print("Datasets loaded")
        
    def init_data(self):
        print("-"*8+" Creating data  "+"-"*8)
        self.train_data = self.load_data(self.train_df)
        print("-"*8+" Train data created "+"-"*8+"\n")
        self.val_data = self.load_data(self.val_df)
        print("-"*8+" Val data created "+"-"*8+"\n")
        
    def load_data(self, df):
        def create_ids_and_mask(premise, hypothesis, label):
            # outputs = self.tokenizer([hypothesis, premise], add_special_tokens=True, max_length = 512, pad_to_max_length=True, truncation=True,)
            outputs = self.tokenizer([premise], add_special_tokens=True, max_length = 512, pad_to_max_length=True, truncation=True,)
            attention_mask_ids = outputs['attention_mask'][0]
            token_ids = outputs['input_ids'][0]
            
            final_token_ids.append(token_ids)
            final_mask_ids.append(attention_mask_ids)
            final_y.append(label)
            
        MAX_LEN = 512 - 3
        final_token_ids = []
        final_mask_ids = []
        final_y = []
    
        premise_list = df['query'].to_list()
        hypothesis1_list = df['p1'].to_list()
        hypothesis2_list = df['p2'].to_list()
        hypothesis3_list = df['p3'].to_list()
        label_list = df['label'].to_list()
        
        for (premise, p1, p2, p3, label) in zip(premise_list, hypothesis1_list, hypothesis2_list, hypothesis3_list, label_list):
            len1 = len(premise) + len(p1)
            len2 = len(premise)+ len(p2)
            len3 = len(premise)+ len(p3)
            if (len1 <= MAX_LEN and len2 <= MAX_LEN and len3 <= MAX_LEN):
                create_ids_and_mask(premise=p1, hypothesis=premise, label=label)
                self.count += 1
            else:
                self.rejected += 1
    
        final_token_ids = torch.tensor(final_token_ids)
        final_mask_ids = torch.tensor(final_mask_ids)
        final_y = torch.tensor(final_y)

        dataset = TensorDataset(final_token_ids, final_mask_ids, final_y)
        print("Rejected:",self.rejected)
        print("Dataset size:", df.shape[0]-self.rejected)
        return dataset
    
    def get_data_loaders(self, bs, shuffle=True):
        print("Generating dataloader")
        train_loader = DataLoader(
          self.train_data,
          shuffle=shuffle,
          batch_size=bs,
        )

        val_loader = DataLoader(
          self.val_data,
          shuffle=shuffle,
          batch_size=bs,
        )
        print("Generated dataloaders")
        return train_loader, val_loader

dataset = TrainingDataBert(df_test, df_val, tokenizer)
test_dataset, val_dataset = dataset.get_data_loaders(bs=1)

Loading Datasets
-------- Creating data  --------




Rejected: 819
Dataset size: 6590
-------- Train data created --------

Rejected: 1217
Dataset size: 2487
-------- Val data created --------

Datasets loaded
Generating dataloader
Generated dataloaders


In [46]:
all_preds, all_labels = predict(model, test_dataset)
generate_result(all_preds,all_labels)

tensor(0.7942)
              precision    recall  f1-score   support

           0       0.84      0.83      0.83      4106
           1       0.72      0.74      0.73      2484

    accuracy                           0.79      6590
   macro avg       0.78      0.78      0.78      6590
weighted avg       0.80      0.79      0.79      6590



In [47]:
!nvidia-smi

Thu May 26 19:48:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 511.79       Driver Version: 511.79       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   40C    P8    11W / 310W |   3066MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [50]:
model.load_state_dict(torch.load('./bert_paired_query_new.pt'))
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [51]:
all_preds, all_labels = predict(model, test_dataset)
generate_result(all_preds,all_labels)

KeyboardInterrupt: 