# E2 = Experiment 2 on T5
- Changed Question Length to 150 (from 250)
- Changed Answer Length to 500 (from 32)
- Changed Batch to 4 (from 2)
- Changing Training Approach.

# Install Necessary Libraries

In [1]:
# !pip install -Uq transformers
# !pip install -Uq evaluate
# !pip install -Uq SentencePiece

In [2]:
# !pip install accelerate>=0.20.1
# !pip install transformers[torch]
# # You need to restart the kernel after this step

# Load Configuration & Library

In [3]:
# Mount Google Drive in this Colab runtime; the config file and model folder
# used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import configparser

# Private settings file (API keys and data folders).
# A local Windows run pointed at r'H:\\My Drive\\config\\hbqa.txt' instead;
# the Colab path below is the one in effect.
apipath = r'/content/drive/MyDrive/config/hbqa-colab.txt'

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. Fail loudly here instead of with a confusing
# KeyError on 'global' a few lines further down.
if not config.read(apipath):
    raise FileNotFoundError(f"Config file not found or unreadable: {apipath}")

# All settings are read from the [global] section.
secret_key = config['global']['OPENAI_KEY']
datapath = config['global']['DATA_FOLDER']      # concatenated with file names below, so it must end with a path separator
corpuspath = config['global']['CORPUS_FOLDER']
PINECONE_API_KEY = config['global']['PINECONE_KEY']
PINECONE_ENV = config['global']['PINECONE_ENV']

# Pinecone is a vector database, used to store vectors so the vector space can be searched quickly.
# https://app.pinecone.io
# Get the PINECONE_API_KEY from app.pinecone.io
# Find your PINECONE_ENVIRONMENT next to the API key in the Pinecone console

# Folder where the fine-tuned E2 model and tokenizer are saved and reloaded.
modelpath  = "/content/drive/MyDrive/HBQA/T5QA_E2"

In [5]:
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
import torch.nn.functional as F


import transformers
import evaluate  # Bleu
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

In [6]:
# # Detect and initialize TPU
# tpu_available = tf.config.experimental.list_logical_devices("TPU")
# if tpu_available:
#     print("TPU available")
# else:
#     print("No TPU available")

In [7]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(DEVICE)

cpu


In [8]:
# sample code
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# TOKENIZER = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-ReformerForQuestionAnswering")
# MODEL = AutoModelForQuestionAnswering.from_pretrained("hf-internal-testing/tiny-random-ReformerForQuestionAnswering")
# MODEL.to(DEVICE)

# Loading Dataset

In [9]:
import pandas as pd

In [10]:
# Load the HBQA question/answer table (one row per question, with its source chunk).
# `datapath` is joined by plain string concatenation, so it must end with a path separator.
df = pd.read_csv(datapath+"06-HBQA_Manual_with_Chunk.csv")

In [11]:
# Sanity check: table dimensions and the first few rows.
print(df.shape)
df.head(4)

(1104, 9)


Unnamed: 0,Ques_Id,Chunk_Id,Section_Id,Question,Ref_Answer,Chunk,Reference,WordsInQues,WordsInAns
0,10000,10193,Book01_046,Why did the Muni wander over the earth and wee...,The Muni wandered over the earth and wept loud...,"Sauti continued, 'The Muni, having said so unt...","""'O ye, being directed by my ancestors, I am r...",14,25
1,10001,10193,Book01_046,Who did the Muni ask for a bride and why?,The Muni asked for a bride from the creatures ...,"Sauti continued, 'The Muni, having said so unt...","""My ancestors, afflicted with grief, have dire...",10,26
2,10002,10193,Book01_046,Who informed Vasuki about the Muni's desire fo...,The snakes that had been set upon Jaratkaru's ...,"Sauti continued, 'The Muni, having said so unt...","""Then those snakes that had been set upon Jara...",10,20
3,10003,10193,Book01_046,How did Vasuki respond when he heard about the...,"Vasuki took a maiden, who was decked with orna...","Sauti continued, 'The Muni, having said so unt...","""And the king of the snakes, hearing their wor...",14,22


In [12]:
# Rough length estimate: take the longest question / answer in characters and
# divide by 4 (the notebook treats the result as a token count).
longest_question_chars = max(map(len, df.Question))
longest_answer_chars = max(map(len, df.Ref_Answer))

Question_Len = int(longest_question_chars / 4)
Answer_Len = int(longest_answer_chars / 4)
Question_Len, Answer_Len  # in Tokens

(34, 222)

In [13]:
# Sequence-length and batch settings for this experiment.
# Q_LEN is the max_length applied to the tokenized (question, context) PAIR, so
# the context shares this budget with the question and is truncated to fit.
Q_LEN =  150 #256   # Question Length
# T_LEN is the max_length the reference answers are padded/truncated to.
T_LEN =  500 #32  # Target Length
# Number of examples per batch.
BATCH_SIZE = 4
# DEVICE = "cuda:0"

# Load Base Model for Finetuning

In [14]:
class QA_Dataset(Dataset):
  """Torch dataset that turns question/context/answer rows into T5 features.

  Args:
      tokenizer: callable tokenizer returning "input_ids" and "attention_mask";
          its `pad_token_id` is used to mask padding in the labels.
      dataframe: pandas DataFrame with "Question", "Chunk" and "Ref_Answer"
          columns. Any index is accepted (e.g. a shuffled train/validation
          split); rows are addressed by position.
      q_len: max token length of the encoded (question, context) pair.
      t_len: max token length of the encoded answer.
  """

  def __init__(self, tokenizer, dataframe, q_len, t_len):
      self.tokenizer = tokenizer
      self.q_len = q_len
      self.t_len = t_len
      # A DataLoader/Trainer asks for items 0..len-1. Dropping the incoming
      # index labels keeps those positions valid even when `dataframe` is a
      # subset whose labels are shuffled or non-contiguous.
      self.data = dataframe.reset_index(drop=True)
      self.questions = self.data["Question"]
      self.context = self.data["Chunk"]
      self.answer = self.data['Ref_Answer']

  def __len__(self):
      return len(self.questions)

  def __getitem__(self, idx):
      """Return the encoder inputs, labels and decoder mask for row `idx` (positional)."""
      # .iloc makes the positional intent explicit; plain [] on a Series with
      # an integer index is a label lookup.
      question = self.questions.iloc[idx]
      context = self.context.iloc[idx]
      answer = self.answer.iloc[idx]

      # padding="max_length" already pads to max_length; the deprecated
      # pad_to_max_length flag was redundant and has been dropped.
      question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                          truncation=True, add_special_tokens=True)
      answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                        truncation=True, add_special_tokens=True)

      labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
      # -100 is the label value the loss ignores, so padding does not
      # contribute to training. Use the tokenizer's own pad id, not a literal 0.
      labels[labels == self.tokenizer.pad_token_id] = -100

      return {
          "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
          "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
          "labels": labels,
          "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
      }

In [15]:
# Start from the pretrained "t5-base" checkpoint and its fast tokenizer, and
# move the model to the selected device.
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
model.to(DEVICE)
# NOTE(review): this Adam optimizer is only referenced by the commented-out
# manual training loop. The Trainer created further down is not handed it, so
# lr=1e-5 presumably has no effect on the E2 run. Confirm the intended learning rate.
optimizer = Adam(model.parameters(), lr=0.00001)

In [None]:
# # Earlier Experiment code
# # Dataloader

# train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# train_sampler = RandomSampler(train_data.index)
# val_sampler = RandomSampler(val_data.index)

# qa_dataset = QA_Dataset(tokenizer, df, Q_LEN, T_LEN)

# train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
# val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [25]:
#E2 Experiment Code
# Dataloader

# Hold out 20% of the questions for validation; the fixed seed keeps the split repeatable.
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Build each dataset from its OWN split.
# The earlier version indexed `df` with RandomSampler objects. A RandomSampler
# yields positions 0..len(split)-1, not the split's row labels, so both
# "splits" were taken from the same leading rows of `df` and the validation
# rows were also training rows. Shuffling is left to the training loop.
train_dataset = QA_Dataset(tokenizer, train_data.reset_index(drop=True), Q_LEN, T_LEN)
val_dataset = QA_Dataset(tokenizer, val_data.reset_index(drop=True), Q_LEN, T_LEN)

# train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
# val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

# Start finetune (Training)

## New Approach

In [17]:
from transformers import Trainer, TrainingArguments


In [None]:
# # Set up the training arguments
training_args = TrainingArguments(
    output_dir=modelpath,                 # checkpoints are written next to the final model
    overwrite_output_dir=True,
    # Take the batch size from the notebook-level setting instead of repeating the literal.
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=25,
    # NOTE(review): with ~883 training rows and a batch of 4 on one device this
    # run has roughly 5.5k optimizer steps, so a 10k-step interval presumably
    # never writes an intermediate checkpoint. Confirm this is intended.
    save_steps=10000,
    # NOTE(review): presumably only consulted when evaluation_strategy="steps";
    # evaluation below runs once per epoch.
    eval_steps=10000,
    # Keep at most one checkpoint on disk. This bounds the count only; no
    # "best model" selection is configured here.
    save_total_limit=1,
    logging_dir='./logs', # learning_rate=2e-5,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch ("steps" is the alternative)
    remove_unused_columns=False,
    push_to_hub=False,  # Set to True if you want to push the model to the Hugging Face Hub
)

# Train the model. No optimizer is passed, so the Trainer uses its own
# default rather than the `optimizer` created earlier in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,

)

trainer.train()




Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


## Another Method (used in experiment 1)

In [None]:
# train_loss = 0
# val_loss = 0
# train_batch_count = 0
# val_batch_count = 0

# for epoch in range(10):
#     model.train()
#     for batch in tqdm(train_loader, desc="Training batches"):
#         input_ids = batch["input_ids"].to(DEVICE)
#         attention_mask = batch["attention_mask"].to(DEVICE)
#         labels = batch["labels"].to(DEVICE)
#         decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

#         outputs = model(
#                           input_ids=input_ids,
#                           attention_mask=attention_mask,
#                           labels=labels,
#                           decoder_attention_mask=decoder_attention_mask
#                         )

#         optimizer.zero_grad()
#         outputs.loss.backward()
#         optimizer.step()
#         train_loss += outputs.loss.item()
#         train_batch_count += 1

#     #Evaluation
#     model.eval()
#     for batch in tqdm(val_loader, desc="Validation batches"):
#         input_ids = batch["input_ids"].to(DEVICE)
#         attention_mask = batch["attention_mask"].to(DEVICE)
#         labels = batch["labels"].to(DEVICE)
#         decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

#         outputs = MODEL(
#                           input_ids=input_ids,
#                           attention_mask=attention_mask,
#                           labels=labels,
#                           decoder_attention_mask=decoder_attention_mask
#                         )

#         optimizer.zero_grad()
#         outputs.loss.backward()
#         optimizer.step()
#         val_loss += outputs.loss.item()
#         val_batch_count += 1

#     print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

# Save/ Load Finetuned (trained) Model

## Save Finetuned (trained) Model

In [None]:
# Write the fine-tuned weights and then the tokenizer files into `modelpath`,
# so both can be restored from that single folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(modelpath)

# Saved files
"""('qa_tokenizer/tokenizer_config.json',
 'qa_tokenizer/special_tokens_map.json',
 'qa_tokenizer/spiece.model',
'qa_tokenizer/added_tokens.json',
'qa_tokenizer/tokenizer.json')"""

## Load Model from Disk

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the tokenizer that was saved alongside the fine-tuned weights.
# NOTE(review): it was saved from a T5TokenizerFast instance but is reloaded
# here with the T5Tokenizer class. Confirm the mix is intended.
TOKENIZER = T5Tokenizer.from_pretrained(modelpath)

# Load the fine-tuned T5 model from `modelpath` and move it to the active device.
MODEL = T5ForConditionalGeneration.from_pretrained(modelpath)
MODEL.to(DEVICE)

In [None]:
# input_text = "Translate the following English text to French: 'Hello, how are you?'"

# # Tokenize the input text
# input_ids = tokenizer.encode(input_text, return_tensors='pt')
# input_ids = input_ids.to(DEVICE)

# # Generate translated text
# translated_ids = MODEL.generate(input_ids)

# # Decode the generated IDs back to text
# translated_text = TOKENIZER.decode(translated_ids[0], skip_special_tokens=True)

# print("Translated Text:", translated_text)


# Predict Answers from t5 model

In [None]:
def predict_answer(context, question, max_answer_len=None):
    """Generate an answer to `question` from `context` with the fine-tuned model.

    Args:
        context: passage text the answer should come from. A one-element
            tuple/list is unwrapped to its single string.
        question: question text.
        max_answer_len: upper bound on the generated sequence length in
            tokens. Defaults to T_LEN, the length the training targets were
            padded/truncated to.

    Returns:
        The decoded answer string with special tokens removed.
    """
    if max_answer_len is None:
        max_answer_len = T_LEN

    # Callers sometimes pass `df.iloc[i]['Chunk'],` (note the comma), i.e. a
    # 1-tuple. Presumably the tokenizer would treat that as an already
    # tokenized sequence rather than as text, so unwrap it first.
    if isinstance(context, (tuple, list)) and len(context) == 1:
        context = context[0]

    # Same (question, context) pair encoding and length budget as in training.
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    # Add the batch dimension expected by generate().
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    # Without an explicit limit generate() falls back to the library default
    # length (a short default), which cuts off the long answers this
    # experiment trains for.
    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_answer_len)

    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    return predicted_answer

## Check Answers

In [None]:
import random
qno=[]
N=5   # number of questions to spot-check

# Draw N row positions (with replacement). randrange excludes its upper bound,
# so every draw is a valid position; randint(0, n) could return n and make
# iloc raise IndexError.
for i in range(N):
    qno.append(random.randrange(df.shape[0]))

pred_answers=[]
ref_answers=[]
question=[]

for i in qno:
    # No trailing comma after the lookup: a comma would turn the chunk text
    # into a 1-tuple instead of a string.
    chunk    = df.iloc[i]['Chunk']
    ques     = df.iloc[i]['Question']
    ref_ans  = df.iloc[i]['Ref_Answer']

    pred_ans = predict_answer(chunk, ques)

    pred_answers.append(pred_ans)
    ref_answers.append(ref_ans)
    question.append(ques)

    print('Question  :', ques)
    print("Ref Answer:", ref_ans)
    print("Pred Ans  :", pred_ans)
    print('--------')

## Predict All Answers & Save Predictions

In [None]:
import random
pred_answers=[]
ref_answers=[]
questions=[]
for i in range(df.shape[0]):
    # No trailing comma after the lookup: a comma would turn the chunk text
    # into a 1-tuple instead of a string.
    chunk    = df.iloc[i]['Chunk']
    ques     = df.iloc[i]['Question']
    ref_ans  = df.iloc[i]['Ref_Answer']

    pred_ans = predict_answer(chunk, ques)

    pred_answers.append(pred_ans)
    ref_answers.append(ref_ans)
    questions.append(ques)

    # print('Question  :', ques)
    # print("Ref Answer:", ref_ans)
    # print("Pred Ans  :", pred_ans)
    # print('--------')
    print(f"Predicting ans for question {df.iloc[i]['Ques_Id']}")

# Store all predictions in one assignment, under the column name the saving
# and scoring cells read ('T5_Pred_Answer', not 'T5Pred_Answer'). The list is
# in row order, so it lines up with `df` whatever its index looks like.
df['T5_Pred_Answer'] = pred_answers

In [None]:
# Save under the E2 file name that the embedding section reads back
# ('09.11-t5Predicted_ans_E2.csv'); the un-suffixed name was never read again.
df[['Ques_Id','Ref_Answer','T5_Pred_Answer']].to_csv(datapath + '09.11-t5Predicted_ans_E2.csv')
# df= pd.read_csv(datapath + '11.1-t5Predicted_ans.csv')

# Calculate and Save Metrics - 1

In [None]:
# Metric objects are loaded once and reused; reloading them for every scored
# row is slow and does not change the result.
_METRIC_CACHE = {}


def _get_metric(*load_args):
  """Return the `evaluate` metric for `load_args`, loading it on first use only."""
  if load_args not in _METRIC_CACHE:
    _METRIC_CACHE[load_args] = evaluate.load(*load_args)
  return _METRIC_CACHE[load_args]


def calculate_score(ref_answer, predicted_answer):
  """Score one predicted answer against its reference answer.

  Args:
      ref_answer: reference (gold) answer text.
      predicted_answer: model-generated answer text.

  Returns:
      (bleu_result, glue_result): the dicts returned by the "google_bleu"
      metric and by the GLUE "qqp" metric (keys 'accuracy' and 'f1').
  """
  # As before, only the FIRST token id of each answer is compared.
  # Fix: the reference id is now taken from the reference answer. It used to
  # be overwritten with the prediction's id, which made every pair "match".
  pred_first_id = TOKENIZER.encode(predicted_answer, return_tensors='pt')[0][0].item()
  ref_first_id = TOKENIZER.encode(ref_answer, return_tensors='pt')[0][0].item()

  bleu = _get_metric("google_bleu")
  bleu_score1  = bleu.compute(predictions=[predicted_answer], references=[ref_answer])

  # squad = evaluate.load("squad")
  glue_qqp = _get_metric('glue', 'qqp')

  # The QQP metric scores class labels, not token ids. Express the comparison
  # as a binary label: reference 1 = "should match", prediction 1 only when
  # the first tokens agree.
  # NOTE(review): first-token agreement is a very coarse signal; confirm this
  # is the comparison the experiment wants to report.
  glue_qqp_score1 = glue_qqp.compute(predictions=[int(pred_first_id == ref_first_id)],
                      references=[1])

  return bleu_score1, glue_qqp_score1 #squad_score1 #bleu_score1#, squad_score1, glue_score1

In [None]:
# Empty per-question score table; one row per Ques_Id is filled in by the scoring loop.
df1 = pd.DataFrame(columns = ['Ques_Id','BLEU_Score','GLUE_Acc','GLUE_F1'])

In [None]:
bleu_score=[]
glue_qqp_score=[]
# The unused `from datasets import load_metric` import was dropped: nothing
# here calls it, and newer `datasets` releases no longer provide it, so the
# import itself could fail.

N= df.shape[0]

# Collect one tuple per question and build the table once at the end, instead
# of growing the DataFrame row by row.
score_rows = []

for i in range(N):
  # bleu_score1 = calculate_score( df1.iloc[i]['ref_answer'],df1.iloc[i]['pred_answer'])
  ques_id = df.loc[i,'Ques_Id']
  ref_ans  = df.loc[i,'Ref_Answer']
  pred_ans = df.loc[i,'T5_Pred_Answer']

  bleu, glue = calculate_score(ref_ans, pred_ans)
  print(bleu,glue,ques_id)

  bleu_score.append(bleu['google_bleu'])
  # Note: accuracy and f1 are appended to the same list, alternating per question.
  glue_qqp_score.append(glue['accuracy'])
  glue_qqp_score.append(glue['f1'])

  score_rows.append((ques_id, bleu['google_bleu'], glue['accuracy'], glue['f1']))

df1 = pd.DataFrame(score_rows, columns=['Ques_Id','BLEU_Score','GLUE_Acc','GLUE_F1'])

In [None]:
# Spot-check five randomly chosen rows of the score table.
df1.sample(5)

In [None]:
# Persist the per-question scores for experiment E2 (the index is written as the first column).
df1.to_csv(datapath + '09.12-t5Predicted_Ans_Score_E2.csv')

In [None]:
# Average of each metric over all scored questions.
df1[[ 'BLEU_Score', 'GLUE_Acc', 'GLUE_F1']].mean()

# Embedding/Vectorizing Predicted Answer

In [None]:
# Reload the saved E2 predictions from disk and display the frame.
df_t5Predicted_Ans = pd.read_csv(datapath+"09.11-t5Predicted_ans_E2.csv")
df_t5Predicted_Ans

In [None]:
# Install (or upgrade) sentence-transformers in the running environment.
!pip install -Uq sentence-transformers

from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorize the predicted answers.
# NOTE(review): this rebinds `model`, which earlier in the notebook is the T5
# model being fine-tuned and saved. Re-running the training/saving cells after
# this one would act on the sentence encoder. Consider a distinct name.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
%time
T5PredAns_Sentences = df_t5Predicted_Ans.T5_Pred_Answer.tolist()
T5PredAns_Embeddings = model.encode(T5PredAns_Sentences)

T5PredAnsVectors= torch.tensor(T5PredAns_Embeddings, dtype=torch.float).to(DEVICE)

T5PredAnsVec_list = T5PredAnsVectors.tolist()

df_t5Predicted_Ans['T5_AnsVector'] = T5PredAnsVec_list

df_t5Predicted_Ans.to_csv(datapath+'09.11-t5Predicted_AnsVector_E2')

In [None]:
# Sanity check: dimensions and first rows after adding the vector column.
print(df_t5Predicted_Ans.shape)
df_t5Predicted_Ans.head(3)

# Calculate and Save Metrics -2

In [None]:
# Read the predicted-answer vectors back under the exact name the embedding
# cell wrote them with ('..._AnsVector_E2', no extension); the un-suffixed
# name is never written by this notebook.
df_t5Predicted_Ans = pd.read_csv(datapath+'09.11-t5Predicted_AnsVector_E2', usecols=['Ques_Id','T5_AnsVector'])

In [None]:
# Load the reference-answer vectors: only the question id and the 'AnsVector' column.
df_ref_ans = pd.read_csv(datapath + '07.2-HBQA_QA_Vector.csv', usecols=['Ques_Id','AnsVector'])

In [None]:
# Pair each predicted-answer vector with its reference vector by question id;
# the inner join keeps only questions present in both tables.
df_combined=df_t5Predicted_Ans.merge(df_ref_ans, on="Ques_Id", how="inner")

In [None]:
# Sanity check: the row count shows how many questions survived the join.
print(df_combined.shape)
df_combined.head(3)


In [None]:

def calculate_cosine(row):
  """Cosine similarity between a row's predicted and reference answer vectors.

  `row['T5_AnsVector']` and `row['AnsVector']` are read as bracketed,
  comma-separated strings of numbers (e.g. "[0.1, 0.2]"). The similarity is
  returned as a Python float rounded to three decimals.
  """
  def _as_row_tensor(serialized):
    # "[v1, v2, ...]" -> list of floats -> tensor of shape (1, n)
    numbers = list(map(float, serialized.strip('[]').split(',')))
    return torch.tensor(numbers).reshape(1, -1)

  pred_vec = _as_row_tensor(row['T5_AnsVector'])
  ref_vec = _as_row_tensor(row['AnsVector'])

  # cosine_similarity yields a one-element tensor for the single row pair.
  similarity = F.cosine_similarity(pred_vec, ref_vec).item()
  return round(similarity, 3)


In [None]:
# Compute the similarity for every row and assign the column in one step,
# instead of writing one cell at a time through .loc. On an empty frame this
# still creates an (empty) CosineSim column.
df_combined['CosineSim'] = [calculate_cosine(row) for _, row in df_combined.iterrows()]

In [None]:
# Mean cosine similarity between predicted and reference answers over all joined questions.
df_combined.CosineSim.mean()

In [None]:
# sst2, mnli, mnli_mismatched, mnli_matched, qnli, rte, wnli, cola,stsb, mrpc, qqp, and hans.

In [None]:
# from evaluate import load
# glue_metric = load('glue', 'mrpc')  # 'mrpc' or 'qqp'
# references = [0, 1]
# predictions = [0, 1]
# results = glue_metric.compute(predictions=predictions, references=references)
# print(results)

In [None]:
# from evaluate import load
# glue_metric = load('glue', 'stsb')
# references = [0., 1., 2., 3., 4., 5.]
# predictions = [-10., -11., -12., -13., -14., -15.]
# results = glue_metric.compute(predictions=predictions, references=references)
# print(results)

In [None]:
# from evaluate import load
# glue_metric = load('glue', 'cola')
# references = [0, 1]
# predictions = [1, 1]
# results = glue_metric.compute(predictions=predictions, references=references)
# results