# Install Necessary Libararies

In [None]:
!python3 -m pip install -Uq transformers
!python3 -m pip install -Uq evaluate
!python3 -m pip install -Uq SentencePiece

# Load Configuration & Library

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
apipath = r'H:\\My Drive\\config\\hbqa.txt'
# apipath = r'/content/drive/MyDrive/config/hbqa-colab.txt'
import configparser

config = configparser.ConfigParser()
config.read(apipath)

secret_key = config['global']['OPENAI_KEY']
datapath = config['global']['DATA_FOLDER']
corpuspath = config['global']['CORPUS_FOLDER']
corpus_sectionpath = config['global']['CORPUS_FOLDER_SECTIONS']
PINECONE_API_KEY = config['global']['PINECONE_API_KEY']
PINECONE_ENV = config['global']['PINECONE_ENV']
CHATGPT_KEY = config['global']['CHATGPT_KEY']

# PINECONE is Vector Database. To store the vector so that we can quickly search the vector space.
# https://app.pinecone.io
# get PINECONE_API_KEY key from app.pinecone.io
# find your PINECONE_ENVIRONMENT next to the api key in pinecone console

In [5]:
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# import tensorflow as tf
import torch.nn.functional as F


import transformers
import evaluate  # Bleu
from transformers import T5Tokenizer, T5Model, T5ForQuestionAnswering,  T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

In [None]:
# # Detect and initialize TPU
# tpu_available = tf.config.experimental.list_logical_devices("TPU")
# if tpu_available:
#     print("TPU available")
# else:
#     print("No TPU available")

In [7]:
import torch

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

DEVICE

device(type='cpu')

In [None]:
# sample code
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# TOKENIZER = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-ReformerForQuestionAnswering")
# MODEL = AutoModelForQuestionAnswering.from_pretrained("hf-internal-testing/tiny-random-ReformerForQuestionAnswering")
# MODEL.to(DEVICE)

# Loading Dataset

In [8]:
import pandas as pd

In [24]:
df = pd.read_csv(datapath+"06-HBQA_Manual_with_Chunk.csv")
df['WordsInQues_Chunk']= df['WordsInQues'] + df['WordsInChunk']

In [25]:
print(df.shape)
df.head(4)

(1003, 12)


Unnamed: 0,Ques_Id,Chunk_Id,Section_Id,Question,Ref_Answer,Chunk,Reference,WordsInQues,WordsInAns,WordsInRef,WordsInChunk,WordsInQues_Chunk
0,0,389,Book03_002,What is the significance of performing the Agn...,Performing the Agnihotra is considered importa...,Even this is eternal morality. They that perfo...,The significance of the Agnihotra and the cons...,16,50,50,809,825
1,1,390,Book03_003,"What predicament does Yudhishthira face, and h...",Yudhishthira faces the predicament of being un...,"Section III\n""Vaisampayana said, 'Yudhishthira...",Yudhishthira's predicament and his consultatio...,14,41,53,852,866
2,2,390,Book03_003,What advice does Dhaumya offer to Yudhishthira...,Dhaumya advises Yudhishthira to take refuge in...,"Section III\n""Vaisampayana said, 'Yudhishthira...",Dhaumya's advice to Yudhishthira is provided i...,10,51,30,852,862
3,3,390,Book03_003,"How did Yudhishthira adore the sun, and what i...",Yudhishthira adored the sun by performing ablu...,"Section III\n""Vaisampayana said, 'Yudhishthira...",Yudhishthira's adoration of the sun and the si...,14,59,28,852,866


In [28]:
Question_Len = int(df.WordsInQues.max()*4/3)
Answer_Len =  int(df.WordsInAns.max()*4/3)
Chunk_Len =  int(df.WordsInChunk.max()*4/3)
Ques_Chunk_Len = int(df.WordsInQues_Chunk.max()*4/3)
Question_Len, Ques_Chunk_Len, Chunk_Len, Answer_Len # in Tokens

(40, 1361, 1334, 244)

In [21]:
df.describe()

Unnamed: 0,Ques_Id,Chunk_Id,WordsInQues,WordsInAns,WordsInRef,WordsInChunk
count,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0
mean,501.0,591.773679,16.249252,46.737787,39.701894,738.121635
std,289.685462,127.654746,4.355427,16.948379,16.583902,152.447678
min,0.0,389.0,5.0,12.0,1.0,413.0
25%,250.5,484.0,13.0,35.0,28.0,614.0
50%,501.0,567.0,16.0,44.0,37.0,739.0
75%,751.5,700.0,19.0,56.0,49.0,862.0
max,1002.0,834.0,30.0,183.0,105.0,1001.0


## Prepare Dataset and Dataloader

In [29]:
Q_LEN =  1400   # Question Length
T_LEN =  250  # Target Length
BATCH_SIZE = 2
# DEVICE = "cuda:0"

In [30]:
print(df.shape)
df.head(3)

(1003, 12)


Unnamed: 0,Ques_Id,Chunk_Id,Section_Id,Question,Ref_Answer,Chunk,Reference,WordsInQues,WordsInAns,WordsInRef,WordsInChunk,WordsInQues_Chunk
0,0,389,Book03_002,What is the significance of performing the Agn...,Performing the Agnihotra is considered importa...,Even this is eternal morality. They that perfo...,The significance of the Agnihotra and the cons...,16,50,50,809,825
1,1,390,Book03_003,"What predicament does Yudhishthira face, and h...",Yudhishthira faces the predicament of being un...,"Section III\n""Vaisampayana said, 'Yudhishthira...",Yudhishthira's predicament and his consultatio...,14,41,53,852,866
2,2,390,Book03_003,What advice does Dhaumya offer to Yudhishthira...,Dhaumya advises Yudhishthira to take refuge in...,"Section III\n""Vaisampayana said, 'Yudhishthira...",Dhaumya's advice to Yudhishthira is provided i...,10,51,30,852,862


# Load Base Model for Finetuning

In [None]:
class QA_Dataset(Dataset):
  def __init__(self, tokenizer, dataframe, q_len, t_len):
      self.tokenizer = tokenizer
      self.q_len = q_len
      self.t_len = t_len
      self.data = dataframe
      self.questions = self.data["Question"]
      self.context = self.data["Chunk"]
      self.answer = self.data['Ref_Answer']

  def __len__(self):
      return len(self.questions)

  def __getitem__(self, idx):
      question = self.questions[idx]
      context = self.context[idx]
      answer = self.answer[idx]

      question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                                  truncation=True, pad_to_max_length=True, add_special_tokens=True)
      answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                        truncation=True, pad_to_max_length=True, add_special_tokens=True)

      labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
      labels[labels == 0] = -100

      return {
          "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
          "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
          "labels": labels,
          "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
      }

In [31]:
TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")
MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
MODEL.to(DEVICE)
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)

NameError: name 'T5TokenizerFast' is not defined

In [None]:
# Dataloader

train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)

qa_dataset = QA_Dataset(TOKENIZER, df, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

# Start finetune (Training)

In [None]:
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0
epochs = 10

for epoch in range(epochs):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1

    #Evaluation
    MODEL.eval()
    for batch in tqdm(val_loader, desc="Validation batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        val_loss += outputs.loss.item()
        val_batch_count += 1

    print(f"{epoch+1}/{epoch} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

# Save/ Load Finetuned (trained) Model

In [None]:
model_path = "/content/drive/MyDrive/HBQA/t5qa_model"
token_path = "/content/drive/MyDrive/HBQA/t5qa_tokenizer"

## Save Finetuned (trained) Model

In [None]:
MODEL.save_pretrained(model_path)
TOKENIZER.save_pretrained(token_path)

# Saved files
"""('qa_tokenizer/tokenizer_config.json',
 'qa_tokenizer/special_tokens_map.json',
 'qa_tokenizer/spiece.model',
'qa_tokenizer/added_tokens.json',
'qa_tokenizer/tokenizer.json')"""

"('qa_tokenizer/tokenizer_config.json',\n 'qa_tokenizer/special_tokens_map.json',\n 'qa_tokenizer/spiece.model',\n'qa_tokenizer/added_tokens.json',\n'qa_tokenizer/tokenizer.json')"

## Load Model from Memory

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the corresponding tokenizer
TOKENIZER = T5Tokenizer.from_pretrained(token_path)

# Load the pre-trained T5 model
MODEL = T5ForConditionalGeneration.from_pretrained(model_path)
MODEL.to(DEVICE)

In [None]:
# input_text = "Translate the following English text to French: 'Hello, how are you?'"

# # Tokenize the input text
# input_ids = tokenizer.encode(input_text, return_tensors='pt')
# input_ids = input_ids.to(DEVICE)

# # Generate translated text
# translated_ids = MODEL.generate(input_ids)

# # Decode the generated IDs back to text
# translated_text = TOKENIZER.decode(translated_ids[0], skip_special_tokens=True)

# print("Translated Text:", translated_text)


# Predict Answers from t5 model

In [None]:
def predict_answer(context, question):

    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)

    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    return predicted_answer

## Check Answers

In [None]:
import random
qno=[]
N=5

for i in range(N):
    qno.append(random.randint(0,df.shape[0] ) )

pred_answers=[]
ref_answers=[]
question=[]

for i in qno:
    chunk    = df.iloc[i]['Chunk'],
    ques     = df.iloc[i]['Question']
    ref_ans  = df.iloc[i]['Ref_Answer']

    pred_ans = predict_answer(chunk, ques)

    pred_answers.append(pred_ans)
    ref_answers.append(ref_ans)
    question.append(ques)

    print('Question  :', ques)
    print("Ref Answer:", ref_ans)
    print("Pred Ans  :", pred_ans)
    print('--------')

## Predict All Answer & Save Predictions

In [None]:
import random
pred_answers=[]
ref_answers=[]
questions=[]
for i in range(df.shape[0]):
    chunk    = df.iloc[i]['Chunk'],
    ques     = df.iloc[i]['Question']
    ref_ans  = df.iloc[i]['Ref_Answer']

    pred_ans = predict_answer(chunk, ques)

    pred_answers.append(pred_ans)
    ref_answers.append(ref_ans)
    questions.append(ques)

    # print('Question  :', ques)
    # print("Ref Answer:", ref_ans)
    # print("Pred Ans  :", pred_ans)
    # print('--------')
    print(f"Predicting ans for question {df.iloc[i]['Ques_Id']}")
    df.loc[i,'T5Pred_Answer'] = pred_ans

In [None]:
df[['Ques_Id','Ref_Answer','T5_Pred_Answer']].to_csv(datapath + '09.11-t5Predicted_ans.csv')
# df= pd.read_csv(datapath + '11.1-t5Predicted_ans.csv')

# Calculate and Save Metrics - 1

In [None]:
def calculate_score(ref_answer, predicted_answer):

  pred_answer_ids = TOKENIZER.encode(predicted_answer, return_tensors='pt')[0][0]
  pred_answer_ids = pred_answer_ids.to(DEVICE)

  ref_answer_ids = TOKENIZER.encode(ref_answer, return_tensors='pt')[0][0]
  ref_answer_ids = pred_answer_ids.to(DEVICE)

  bleu = evaluate.load("google_bleu")
  bleu_score1  = bleu.compute(predictions=[predicted_answer], references=[ref_answer])

  # squad = evaluate.load("squad")
  glue_qqp = evaluate.load('glue', 'qqp')

  glue_qqp_score1 = glue_qqp.compute(predictions=[pred_answer_ids],
                      references=[ref_answer_ids])

  return bleu_score1, glue_qqp_score1 #squad_score1 #bleu_score1#, squad_score1, glue_score1

In [None]:
df1 = pd.DataFrame(columns = ['Ques_Id','BLEU_Score','GLUE_Acc','GLUE_F1'])

In [None]:
bleu_score=[]
glue_qqp_score=[]
from datasets import load_metric

N= df.shape[0]

for i in range(N):
  # bleu_score1 = calculate_score( df1.iloc[i]['ref_answer'],df1.iloc[i]['pred_answer'])
  ques_id = df.loc[i,'Ques_Id']
  ref_ans  = df.loc[i,'Ref_Answer']
  pred_ans = df.loc[i,'T5_Pred_Answer']

  bleu, glue = calculate_score(ref_ans, pred_ans)
  print(bleu,glue,ques_id)

  bleu_score.append(bleu['google_bleu'])
  glue_qqp_score.append(glue['accuracy'])
  glue_qqp_score.append(glue['f1'])

  df1.loc[i] = (ques_id, bleu['google_bleu'], glue['accuracy'], glue['f1'] )

In [None]:
df1.sample(5)

Unnamed: 0,Ques_Id,BLEU_Score,GLUE_Acc,GLUE_F1
121,10121.0,0.52,1.0,0.0
576,10576.0,0.347826,1.0,0.0
944,10944.0,0.217949,1.0,0.0
1021,11021.0,0.077778,1.0,0.0
921,10921.0,0.088889,1.0,0.0


In [None]:
df1.to_csv(datapath + '09.12-t5Predicted_Ans_Score.csv')

In [None]:
df1[[ 'BLEU_Score', 'GLUE_Acc', 'GLUE_F1']].mean()

BLEU_Score    0.169266
GLUE_Acc      1.000000
GLUE_F1       0.022645
dtype: float64

# Embedding/Vecotrizing Predicted Answer

In [9]:
df_t5Predicted_Ans = pd.read_csv(datapath+"09.11-t5Predicted_ans.csv")
df_t5Predicted_Ans

Unnamed: 0,Ques_Id,Ref_Answer,T5_Pred_Answer
0,10000,The Muni wandered over the earth and wept loud...,The Muni wandered over the earth and weep loud...
1,10001,The Muni asked for a bride from the creatures ...,The Muni asked for a bride from the
2,10002,The snakes that had been set upon Jaratkaru's ...,No Answer
3,10003,"Vasuki took a maiden, who was decked with orna...",Vasuki reacted positively to the Muni's desire...
4,10004,The Rishi hesitated to accept the maiden becau...,The Rishi hesitated to accept the maiden offer...
...,...,...,...
1099,11099,"Duryodhana asked for troops, while Arjuna requ...",Duryodhana asked Arjuna for assistance in the ...
1100,11100,Sanjaya went as an envoy to the Pandavas from ...,The Kauravas were the envoys of the Pandavas.
1101,11101,The name of the parva that describes the estab...,The name of the parva that describes the estab...
1102,11102,The princes confined in the mountain-pass by J...,Wer hat die princes confined in the mountain-p...


In [None]:
!pip install -Uq sentence-transformers

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [12]:
%time
T5PredAns_Sentences = df_t5Predicted_Ans.T5_Pred_Answer.tolist()
T5PredAns_Embeddings = model.encode(T5PredAns_Sentences)

T5PredAnsVectors= torch.tensor(T5PredAns_Embeddings, dtype=torch.float).to(DEVICE)

T5PredAnsVec_list = T5PredAnsVectors.tolist()

df_t5Predicted_Ans['T5_AnsVector'] = T5PredAnsVec_list

df_t5Predicted_Ans.to_csv(datapath+'09.11-t5Predicted_AnsVector')

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.44 µs


In [13]:
print(df_t5Predicted_Ans.shape)
df_t5Predicted_Ans.head(3)

Unnamed: 0,Ques_Id,Ref_Answer,T5_Pred_Answer,T5_AnsVector
0,10000,The Muni wandered over the earth and wept loud...,The Muni wandered over the earth and weep loud...,"[0.03444298729300499, 0.08067496865987778, 0.1..."
1,10001,The Muni asked for a bride from the creatures ...,The Muni asked for a bride from the,"[-0.06831680238246918, 0.1373669058084488, 0.0..."
2,10002,The snakes that had been set upon Jaratkaru's ...,No Answer,"[-0.030959580093622208, 0.07305379211902618, -..."
3,10003,"Vasuki took a maiden, who was decked with orna...",Vasuki reacted positively to the Muni's desire...,"[-0.08020621538162231, 0.1019870936870575, 0.0..."
4,10004,The Rishi hesitated to accept the maiden becau...,The Rishi hesitated to accept the maiden offer...,"[-0.046037107706069946, 0.12097300589084625, 0..."
...,...,...,...,...
1099,11099,"Duryodhana asked for troops, while Arjuna requ...",Duryodhana asked Arjuna for assistance in the ...,"[-0.03901642560958862, 0.08608739078044891, -0..."
1100,11100,Sanjaya went as an envoy to the Pandavas from ...,The Kauravas were the envoys of the Pandavas.,"[-0.04763362929224968, 0.0347493477165699, -0...."
1101,11101,The name of the parva that describes the estab...,The name of the parva that describes the estab...,"[0.009528553113341331, 0.011773166246712208, -..."
1102,11102,The princes confined in the mountain-pass by J...,Wer hat die princes confined in the mountain-p...,"[-0.028774632140994072, 0.1089869812130928, -0..."


# Calculate and Save Metrics -2

In [15]:
df_t5Predicted_Ans = pd.read_csv(datapath+'09.11-t5Predicted_AnsVector', usecols=['Ques_Id','T5_AnsVector'])

In [13]:
df_ref_ans = pd.read_csv(datapath + '07.2-HBQA_QA_Vector.csv', usecols=['Ques_Id','AnsVector'])

In [16]:
df_combined=df_t5Predicted_Ans.merge(df_ref_ans, on="Ques_Id", how="inner")

In [18]:
print(df_combined.shape)
df_combined.head(3)


(1104, 3)


Unnamed: 0,Ques_Id,T5_AnsVector,AnsVector
0,10000,"[0.03444298729300499, 0.08067496865987778, 0.1...","[0.0017887662397697568, 0.10703910887241364, 0..."
1,10001,"[-0.06831680238246918, 0.1373669058084488, 0.0...","[-0.061754725873470306, 0.13033652305603027, 0..."
2,10002,"[-0.030959580093622208, 0.07305379211902618, -...","[-0.06496541947126389, 0.08738347142934799, 0...."


In [50]:

def calculate_cosine(row):
  # Remove the brackets and split the string by commas
  predAns_vector_values = row['T5_AnsVector'].strip('[]').split(',')
  refAns_vector_values  = row['AnsVector'].strip('[]').split(',')


  # Convert the string values to floats
  predAns_vector_values = [float(value) for value in predAns_vector_values]
  refAns_vector_values  = [float(value) for value in refAns_vector_values]


  # Convert tensor
  predAns_vector_values = torch.tensor(predAns_vector_values).reshape(1, -1)
  refAns_vector_values = torch.tensor(refAns_vector_values).reshape(1, -1)

  # print(predAns_vector_values)
  # print('----')
  # print (refAns_vector_values)
  # Calculate Cosine
  return round(F.cosine_similarity(predAns_vector_values,refAns_vector_values).item(),3)


In [51]:
for rowid in df_combined.index:
  df_combined.loc[rowid,'CosineSim'] = calculate_cosine( df_combined.loc[rowid])

In [53]:
df_combined.CosineSim.mean()

0.6670416666666666

In [None]:
# sst2, mnli, mnli_mismatched, mnli_matched, qnli, rte, wnli, cola,stsb, mrpc, qqp, and hans.

In [None]:
# from evaluate import load
# glue_metric = load('glue', 'mrpc')  # 'mrpc' or 'qqp'
# references = [0, 1]
# predictions = [0, 1]
# results = glue_metric.compute(predictions=predictions, references=references)
# print(results)

In [None]:
# from evaluate import load
# glue_metric = load('glue', 'stsb')
# references = [0., 1., 2., 3., 4., 5.]
# predictions = [-10., -11., -12., -13., -14., -15.]
# results = glue_metric.compute(predictions=predictions, references=references)
# print(results)

In [None]:
# from evaluate import load
# glue_metric = load('glue', 'cola')
# references = [0, 1]
# predictions = [1, 1]
# results = glue_metric.compute(predictions=predictions, references=references)
# results