### Training notebook: fine-tuning BERT for sequence classification on paired inputs

In [None]:
import pandas as pd
import os, time
import shutil
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
tf.get_logger().setLevel('ERROR')
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import BertForSequenceClassification, BertTokenizerFast, LongformerForSequenceClassification

# Select the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU so the notebook still runs on machines without a GPU
# (a hard-coded "cuda" device only fails later, at the first `.to(device)`).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def _read_columns(csv_path, columns):
    """Read ``csv_path`` with pandas and keep only ``columns``, in that order."""
    return pd.read_csv(csv_path)[columns]


# Text columns (query plus p1..p3) and the label column live in separate CSV
# files for every split; each file is read and trimmed to the columns used.
train_df = _read_columns("./final_train.csv", ['query', 'p1', 'p2', 'p3'])
train_label_df = _read_columns("./final_train_labels.csv", ['label'])

val_df = _read_columns("./final_val_text.csv", ['query', 'p1', 'p2', 'p3'])
val_label_df = _read_columns("./final_val_labels.csv", ['label'])

test_df = _read_columns("./final_test_text.csv", ['query', 'p1', 'p2', 'p3'])
test_label_df = _read_columns("./final_test_labels.csv", ['label'])


# Attach the label column to its text columns side by side (column-wise
# concatenation), producing one frame per split.
df_train = pd.concat([train_df, train_label_df], axis=1)
df_val = pd.concat([val_df, val_label_df], axis=1)
df_test = pd.concat([test_df, test_label_df], axis=1)


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Tokenizer and two-label classifier are both loaded from the same
# 'bert-base-uncased' pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [None]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer

class TrainingDataBert(Dataset):
    """Tokenize the train/validation frames once and serve them as loaders.

    Both frames must provide the columns ``query``, ``p1``, ``p2``, ``p3`` and
    ``label``. Every accepted row is encoded as the sentence pair
    ``(query, p1)`` and stored, together with its attention mask and label, in
    a ``TensorDataset`` (``train_data`` / ``val_data``).

    NOTE(review): ``p2`` and ``p3`` only take part in the length filter; they
    are never encoded. Confirm that this is intended.
    """

    # Per-row length budget used by the filter in ``load_data``.
    # NOTE(review): 512 - 3 presumably reserves room for BERT's [CLS] and two
    # [SEP] tokens, but the filter compares *character* counts against it,
    # not token counts. Confirm before changing.
    MAX_LEN = 512 - 3

    def __init__(self, train_df, val_df, tokenizer):
        """Store the inputs and eagerly build both tensor datasets.

        Args:
            train_df: frame holding the training rows.
            val_df: frame holding the validation rows.
            tokenizer: callable invoked as ``tokenizer(text, text_pair, ...)``
                that returns a mapping with ``input_ids`` and
                ``attention_mask`` entries.
        """
        print("Loading Datasets")
        self.train_df = train_df
        self.val_df = val_df

        # Running totals across every ``load_data`` call (train + val).
        self.rejected = 0
        self.tokenizer = tokenizer

        self.train_data = None
        self.val_data = None
        self.count = 0
        self.init_data()
        print("Datasets loaded")

    def init_data(self):
        """Build ``train_data`` and ``val_data`` from the stored frames."""
        print("-"*8+" Creating data  "+"-"*8)
        self.train_data = self.load_data(self.train_df)
        print("-"*8+" Train data created "+"-"*8+"\n")
        self.val_data = self.load_data(self.val_df)
        print("-"*8+" Val data created "+"-"*8+"\n")

    def _encode_pair(self, premise, hypothesis):
        """Encode one (premise, hypothesis) pair as a single padded sequence.

        Returns:
            ``(input_ids, attention_mask)`` exactly as produced by the
            tokenizer for this pair.
        """
        # The two texts are passed as `text` and `text_pair` so that both end
        # up in one sequence. Passing them as a list would encode two
        # independent sentences, and picking element 0 would silently drop
        # the hypothesis.
        outputs = self.tokenizer(
            premise,
            hypothesis,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',  # replaces the deprecated pad_to_max_length
            truncation=True,
        )
        # TODO(review): token_type_ids are not kept, so the model sees every
        # token as segment 0. Adding them would change the dataset's tuple
        # layout, which the training loop unpacks as three tensors.
        return outputs['input_ids'], outputs['attention_mask']

    def load_data(self, df):
        """Filter and tokenize ``df`` and return it as a ``TensorDataset``.

        A row is rejected when ``len(query) + len(p)`` exceeds ``MAX_LEN`` for
        any of ``p1``, ``p2``, ``p3``. Accepted rows contribute
        ``(input_ids, attention_mask, label)``.

        Side effects: adds the number of rejected rows to ``self.rejected``
        and the number of accepted rows to ``self.count``, and prints the
        per-split statistics.
        """
        final_token_ids = []
        final_mask_ids = []
        final_y = []
        # Counted per call so the printed statistics describe this split
        # only, not the running total over earlier splits.
        rejected_here = 0

        rows = zip(
            df['query'].to_list(),
            df['p1'].to_list(),
            df['p2'].to_list(),
            df['p3'].to_list(),
            df['label'].to_list(),
        )
        for premise, p1, p2, p3, label in rows:
            # Same test as checking all three sums separately against MAX_LEN.
            longest = len(premise) + max(len(p1), len(p2), len(p3))
            if longest > self.MAX_LEN:
                rejected_here += 1
                continue

            token_ids, attention_mask_ids = self._encode_pair(premise, p1)
            final_token_ids.append(token_ids)
            final_mask_ids.append(attention_mask_ids)
            final_y.append(label)
            self.count += 1

        self.rejected += rejected_here

        # An explicit integer dtype keeps the id/mask tensors integral even
        # when every row of the split was rejected.
        dataset = TensorDataset(
            torch.tensor(final_token_ids, dtype=torch.long),
            torch.tensor(final_mask_ids, dtype=torch.long),
            torch.tensor(final_y),
        )
        print("Rejected:", rejected_here)
        print("Dataset size:", len(final_y))
        return dataset

    def get_data_loaders(self, bs, shuffle=True):
        """Wrap both datasets in ``DataLoader`` objects.

        Args:
            bs: batch size used by both loaders.
            shuffle: passed to both loaders (validation included).

        Returns:
            ``(train_loader, val_loader)``.
        """
        print("Generating dataloader")
        train_loader = DataLoader(
            self.train_data,
            shuffle=shuffle,
            batch_size=bs,
        )

        val_loader = DataLoader(
            self.val_data,
            shuffle=shuffle,
            batch_size=bs,
        )
        print("Generated dataloaders")
        return train_loader, val_loader

# Tokenize both splits up front, then wrap them in shuffled mini-batch loaders
# of 7 rows each. Note that `train_dataset` / `val_dataset` are DataLoaders.
dataset = TrainingDataBert(train_df=df_train, val_df=df_val, tokenizer=tokenizer)
train_dataset, val_dataset = dataset.get_data_loaders(bs=7, shuffle=True)

In [None]:
from torch.optim import Adam
import time

# Number of passes over the training loader; train() reads this global.
num_epochs = 10

# Put the classifier on the selected device, then build the optimizer over
# its parameters.
model = model.to(device)
optimizer = Adam(params=model.parameters(), lr=5e-5)

def multi_acc(y_pred, y_test):
    """Return the fraction of rows whose arg-max over dim 1 equals the label.

    Args:
        y_pred: tensor of per-class scores; the arg-max is taken over dim 1.
        y_test: tensor of class indices, one per row of ``y_pred``.

    Returns:
        A 0-dim float tensor: matches divided by ``y_test.size(0)``.
    """
    # log_softmax is monotonic within a row, so taking the arg-max of the raw
    # scores picks the same class without the extra pass over the batch.
    predicted = y_pred.argmax(dim=1)
    acc = (predicted == y_test).sum().float() / float(y_test.size(0))
    return acc


def train(model, train_loader, val_loader, optimizer, epochs=None):
    """Fine-tune ``model`` and checkpoint the weights with the best val loss.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``. Whenever the mean validation loss
    improves, the model's ``state_dict`` is written to ``'bert_paired.pt'``.

    Args:
        model: called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            its output must expose ``.loss`` and ``.logits``.
        train_loader: iterable of ``(token_ids, attention_mask, labels)`` batches.
        val_loader: iterable with the same batch layout, used for evaluation.
        optimizer: optimizer built over ``model``'s parameters.
        epochs: number of epochs; when ``None`` the module-level
            ``num_epochs`` is used.

    Relies on the module-level ``device`` and ``multi_acc``.
    """
    n_epochs = num_epochs if epochs is None else epochs
    print("-"*8+" start training "+"*"*8)
    best_valid_loss = float('inf')
    n_train_batches = len(train_loader)
    n_val_batches = len(val_loader)

    for epoch in range(n_epochs):
        start = time.time()
        model.train()
        total_train_loss = 0
        total_train_acc = 0

        print("Training phase:", epoch)
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            # Kept from the original loop: releases cached GPU blocks between
            # batches (costs throughput; presumably added to dodge OOM).
            torch.cuda.empty_cache()

            if batch_idx % 100 == 0 and batch_idx != 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(batch_idx, n_train_batches))
                # batch_idx batches (0 .. batch_idx-1) have been accumulated.
                print("Avg Loss:", str(total_train_loss/batch_idx), " - ", "Avg Acc:", str(total_train_acc/batch_idx))

            text_ids, mask_ids, y = [r.to(device) for r in batch]

            # Read the fields by name instead of unpacking `.values()`, which
            # breaks as soon as the output carries any additional entry.
            outputs = model(input_ids=text_ids, attention_mask=mask_ids, labels=y)
            train_loss = outputs.loss
            prediction = outputs.logits

            train_acc = multi_acc(prediction, y)
            train_loss.backward()
            optimizer.step()

            total_train_loss += train_loss.item()
            total_train_acc += train_acc.item()

        # An empty loader yields NaN instead of a ZeroDivisionError.
        train_acc = total_train_acc / n_train_batches if n_train_batches else float('nan')
        train_loss = total_train_loss / n_train_batches if n_train_batches else float('nan')

        print("Evaluation phase:", epoch)
        model.eval()
        total_val_acc = 0
        total_val_loss = 0
        with torch.no_grad():
            for batch_idx, batch in enumerate(val_loader):
                torch.cuda.empty_cache()
                if batch_idx % 100 == 0 and batch_idx != 0:
                    print('  Batch {:>5,}  of  {:>5,}.'.format(batch_idx, n_val_batches))

                text_ids, mask_ids, y = [r.to(device) for r in batch]

                outputs = model(input_ids=text_ids, attention_mask=mask_ids, labels=y)
                loss = outputs.loss
                prediction = outputs.logits

                acc = multi_acc(prediction, y)

                total_val_loss += loss.item()
                total_val_acc += acc.item()

        val_acc = total_val_acc / n_val_batches if n_val_batches else float('nan')
        val_loss = total_val_loss / n_val_batches if n_val_batches else float('nan')
        end = time.time()

        # NaN never compares as smaller, so an empty validation loader does
        # not overwrite the checkpoint.
        if val_loss < best_valid_loss:
            best_valid_loss = val_loss
            torch.save(model.state_dict(), 'bert_paired.pt')

        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
        # Elapsed wall-clock seconds for the epoch (end minus start).
        print("Time taken:", str(end - start))

In [None]:
# Start fine-tuning; `train_dataset` / `val_dataset` are the DataLoaders
# produced by get_data_loaders above.
train(
    model=model,
    train_loader=train_dataset,
    val_loader=val_dataset,
    optimizer=optimizer,
)