In [None]:
!pip install transformers -q

In [None]:
!pip install evaluate

In [None]:
!pip install SentencePiece

# Load Saved model directly

In [None]:
import tensorflow as tf

# Report whether TensorFlow currently lists any TPU logical devices.
# This only queries the device list; it does not connect to or initialise a TPU.
tpu_available = tf.config.experimental.list_logical_devices("TPU")
print("TPU available" if tpu_available else "No TPU available")

In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
DEVICE

In [None]:
# sample code
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# TOKENIZER = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-ReformerForQuestionAnswering")
# MODEL = AutoModelForQuestionAnswering.from_pretrained("hf-internal-testing/tiny-random-ReformerForQuestionAnswering")
# MODEL.to(DEVICE)

# Loading Training Dataset

In [None]:
import pandas as pd

In [None]:
# Load the merged question/answer table from Google Drive.
df = pd.read_csv('/content/drive/MyDrive/Booksummary/qa_merged.csv')

# Later cells refer to this table as `data`, but no visible cell ever assigns
# that name, so a fresh kernel fails with NameError. Alias it here.
# NOTE(review): assumes `data` was meant to be this same frame - confirm.
data = df

In [None]:
df

In [None]:
# Longest question and longest answer in the table, measured with len()
# on each cell value.
# NOTE(review): for string cells these are character counts, not token counts.
Question_Len = max(map(len, df.question))
Answer_Len = max(map(len, df.answer))
Question_Len, Answer_Len

# Tokenizer

In [None]:
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

In [None]:
class QA_Dataset(Dataset):
    """Torch dataset that tokenizes (question, context) -> answer examples.

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` lists.
        dataframe: Table with ``question``, ``context`` and ``answer`` columns.
        q_len: ``max_length`` used when encoding the question/context pair.
        t_len: ``max_length`` used when encoding the answer.

    Each item is a dict of ``torch.long`` tensors with the keys ``input_ids``,
    ``attention_mask``, ``labels`` and ``decoder_attention_mask``.
    """

    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        # Samplers hand out positions in 0..len-1, so index by position.
        # Label-based lookup breaks on frames whose index is not 0..n-1
        # (for example the output of train_test_split).
        question = self.questions.iloc[idx]
        context = self.context.iloc[idx]
        answer = self.answer.iloc[idx]

        # padding="max_length" already pads to max_length; the deprecated
        # pad_to_max_length flag was redundant and has been dropped.
        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                            truncation=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, add_special_tokens=True)

        decoder_attention_mask = torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)

        # -100 is the label value the loss ignores. Mask padded positions via
        # the attention mask rather than a hard-coded pad id of 0.
        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[decoder_attention_mask == 0] = -100

        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": decoder_attention_mask
        }

# Prepare Dataset and Dataloader

In [None]:
# Maximum sequence lengths handed to the tokenizer.
# NOTE(review): Question_Len / Answer_Len are computed with len() on the raw
# text (character counts) but are used here as token limits, and Q_LEN bounds
# the question AND context together - confirm this is intended.
Q_LEN = Question_Len # 256   # Question Length
T_LEN = Answer_Len #32  # Target Length
BATCH_SIZE = 2
# Fall back to the CPU when no GPU is present, instead of unconditionally
# overriding the earlier device detection with "cuda:0".
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Dataloader
# NOTE(review): TOKENIZER is assigned in a later cell ("Load Model from
# Memory"); that cell must run before this one on a fresh kernel.

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Dataset over the complete table, kept under its original name.
qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

# One dataset per split. RandomSampler yields positions 0..len-1 of whatever it
# is given, so sampling the full dataset with RandomSampler(train_data.index)
# only ever visited the first len(train_data) rows, and the validation loader
# visited a subset of those same rows. Resetting the index makes each split
# addressable by position.
train_dataset = QA_Dataset(TOKENIZER, train_data.reset_index(drop=True), Q_LEN, T_LEN)
val_dataset = QA_Dataset(TOKENIZER, val_data.reset_index(drop=True), Q_LEN, T_LEN)

train_sampler = RandomSampler(train_dataset)
val_sampler = RandomSampler(val_dataset)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

# Load Base Model for Finetuning

In [None]:
# TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")
# MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
# MODEL.to(DEVICE)
# OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)

# Start finetune (Training)

In [None]:
# train_loss = 0
# val_loss = 0
# train_batch_count = 0
# val_batch_count = 0

# for epoch in range(2):
#     MODEL.train()
#     for batch in tqdm(train_loader, desc="Training batches"):
#         input_ids = batch["input_ids"].to(DEVICE)
#         attention_mask = batch["attention_mask"].to(DEVICE)
#         labels = batch["labels"].to(DEVICE)
#         decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

#         outputs = MODEL(
#                           input_ids=input_ids,
#                           attention_mask=attention_mask,
#                           labels=labels,
#                           decoder_attention_mask=decoder_attention_mask
#                         )

#         OPTIMIZER.zero_grad()
#         outputs.loss.backward()
#         OPTIMIZER.step()
#         train_loss += outputs.loss.item()
#         train_batch_count += 1

#     #Evaluation
#     MODEL.eval()
#     for batch in tqdm(val_loader, desc="Validation batches"):
#         input_ids = batch["input_ids"].to(DEVICE)
#         attention_mask = batch["attention_mask"].to(DEVICE)
#         labels = batch["labels"].to(DEVICE)
#         decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

#         outputs = MODEL(
#                           input_ids=input_ids,
#                           attention_mask=attention_mask,
#                           labels=labels,
#                           decoder_attention_mask=decoder_attention_mask
#                         )

#         # No zero_grad()/backward()/step() here: validation must only measure
#         # the loss. Stepping the optimizer would train on the validation set.
#         val_loss += outputs.loss.item()
#         val_batch_count += 1

#     print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

# Save Finetuned (trained) Model

In [None]:
# model_path = "/content/drive/MyDrive/Booksummary/t5qa_model"
# token_path = "/content/drive/MyDrive/Booksummary/t5qa_tokenizer"

# MODEL.save_pretrained(model_path)
# TOKENIZER.save_pretrained(token_path)

# # Saved files
# """('qa_tokenizer/tokenizer_config.json',
#  'qa_tokenizer/special_tokens_map.json',
#  'qa_tokenizer/spiece.model',
# 'qa_tokenizer/added_tokens.json',
# 'qa_tokenizer/tokenizer.json')"""

# Load Model from Memory

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Google Drive directories holding the saved model weights and tokenizer files.
model_path = r"/content/drive/MyDrive/Booksummary/t5qa_model"
token_path = r"/content/drive/MyDrive/Booksummary/t5qa_tokenizer"

# Restore the saved T5 model and the tokenizer saved with it.
MODEL = T5ForConditionalGeneration.from_pretrained(model_path)
TOKENIZER = T5Tokenizer.from_pretrained(token_path)

# Move the weights to the selected device (the cell displays the returned module).
MODEL.to(DEVICE)

In [None]:
# input_text = "Translate the following English text to French: 'Hello, how are you?'"

# # Tokenize the input text
# input_ids = TOKENIZER.encode(input_text, return_tensors='pt')
# input_ids = input_ids.to(DEVICE)

# # Generate translated text
# translated_ids = MODEL.generate(input_ids)

# # Decode the generated IDs back to text
# translated_text = TOKENIZER.decode(translated_ids[0], skip_special_tokens=True)

# print("Translated Text:", translated_text)


# Predict Answers from t5 model

In [None]:
def predict_answer(context, question):
    """Generate an answer string for `question` given `context`.

    Encodes the (question, context) pair with the module-level TOKENIZER,
    padded/truncated to Q_LEN, runs MODEL.generate with its default generation
    settings, and decodes the result with special tokens stripped.
    """
    encoded = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    # Wrapping each id list in an outer list supplies the batch dimension.
    batch_ids = torch.tensor([encoded["input_ids"]], dtype=torch.long).to(DEVICE)
    batch_mask = torch.tensor([encoded["attention_mask"]], dtype=torch.long).to(DEVICE)

    generated = MODEL.generate(input_ids=batch_ids, attention_mask=batch_mask)

    # Collapse the generated tensor to one dimension before decoding.
    return TOKENIZER.decode(generated.reshape(-1), skip_special_tokens=True)




In [None]:
# Run the model on the first N rows of `data`, collecting the question, the
# reference answer and the generated answer for each row.
N = 5#len(data)
pred_answers, ref_answers, question = [], [], []
for i in range(N):
  row = data.iloc[i]
  pred_ans = predict_answer(row['context'], row['question'])
  pred_answers.append(pred_ans)
  ref_answers.append(row['answer'])
  question.append(row['question'])
  print(pred_ans)

In [None]:
prediction_path =r'/content/drive/MyDrive/Booksummary/t5predicted_ans.csv'

In [None]:
# df1 = pd.DataFrame({"question": question, "ref_answer": ref_answers, "pred_answer": pred_answers})
# df1.index = data.index

# val_data_index = val_data.index

# df1['train_data']=True

# for i in val_data_index:
#   df1.loc[i,"train_data"]=False

# df1.to_csv(prediction_path)

In [None]:
# The predictions file is written with DataFrame.to_csv's default index=True,
# so its first column is the saved row index. Read it back as the index rather
# than letting it appear as a spurious "Unnamed: 0" data column.
df1 = pd.read_csv(prediction_path, index_col=0)
df1

In [None]:
# Spot-check a few hand-picked rows: print question, reference and prediction.
idx = [10, 100, 201]
for i in idx:
  sample = data.iloc[i]
  pred_ans = predict_answer(sample['context'], sample['question'])
  print("Question: " + sample['question'])
  print("Refrence Ans: "+ sample['answer'])
  print("Predicted Answer: ", pred_ans)

In [None]:
def calculate_score(ref_answer, predicted_answer):
  """Score one predicted answer against its reference with the GLUE/QQP metric.

  Both strings are encoded with the module-level TOKENIZER. The prediction is
  labelled 1 when its full token-id sequence equals the reference's and 0
  otherwise; that label is scored against a reference label of 1.

  Args:
    ref_answer: Reference (ground-truth) answer text.
    predicted_answer: Model-generated answer text.

  Returns:
    The dict produced by the 'glue'/'qqp' metric's compute().
  """
  pred_answer_ids = TOKENIZER.encode(predicted_answer)
  # The original assigned pred_answer_ids here, so the reference was never
  # actually compared and every pair scored as identical.
  ref_answer_ids = TOKENIZER.encode(ref_answer)

  # The metric is fed a 0/1 label rather than raw token ids: 1 = "the
  # prediction matches the reference".
  is_match = int(pred_answer_ids == ref_answer_ids)

  glue_qqp = evaluate.load('glue', 'qqp')
  glue_qqp_score1 = glue_qqp.compute(predictions=[is_match], references=[1])

  return glue_qqp_score1

In [None]:
# Per-row scores for the first N saved predictions in df1.
bleu_score = []
squad_score = []
glue_qqp_score = []

N = 5  # len(df1)

# Load each metric once instead of on every loop iteration. evaluate.load
# replaces the deprecated datasets.load_metric.
google_bleu = evaluate.load("google_bleu")
squad_metric = evaluate.load("squad_v2")
glue_qqp = evaluate.load('glue', 'qqp')

for i in range(N):
  row = df1.iloc[i]
  # A cell that is empty in the CSV comes back as NaN; treat it as "".
  # (The original had a trailing comma here that made ref_ans a 1-tuple.)
  ref_ans = "" if pd.isna(row['ref_answer']) else str(row['ref_answer'])
  pred_ans = "" if pd.isna(row['pred_answer']) else str(row['pred_answer'])
  print(ref_ans, '\n\n', pred_ans)

  # Google-BLEU on the raw text; appended as the metric's result dict.
  bleu_score.append(google_bleu.compute(predictions=[pred_ans], references=[[ref_ans]]))

  # GLUE/QQP takes binary labels: label the prediction 1 when its token ids
  # equal the reference's, and score it against a reference label of 1.
  # (The original overwrote ref_answer_ids with pred_answer_ids.)
  pred_answer_ids = TOKENIZER.encode(pred_ans)
  ref_answer_ids = TOKENIZER.encode(ref_ans)
  glue_qqp_score1 = glue_qqp.compute(predictions=[int(pred_answer_ids == ref_answer_ids)],
                      references=[1])
  glue_qqp_score.append(glue_qqp_score1)

  # squad_v2 takes one dict per example, not token ids.
  # NOTE(review): answer_start is a placeholder required by the input format.
  squad_score1 = squad_metric.compute(
      predictions=[{"id": str(i), "prediction_text": pred_ans, "no_answer_probability": 0.0}],
      references=[{"id": str(i), "answers": {"text": [ref_ans], "answer_start": [0]}}],
  )
  # Append the per-row result; the original replaced the list with the dict
  # and then appended an undefined name.
  squad_score.append(squad_score1)

  # glue_score.append(glue_score1)



In [None]:
glue_qqp_score

In [None]:
TOKENIZER.encode(pred_ans, return_tensors='pt')[0]

In [None]:
# predict_answer(data.iloc[i]['context'],data.iloc[i]['question'],data.iloc[i]['answer'],)

In [None]:
# Assemble questions, reference answers, predictions and BLEU scores into one table.
df1 = pd.DataFrame({"question": question, "ref_answer": ref_answers, "pred_answer": pred_answers,  "bleu_score":bleu_score})
# The lists were built from the first len(df1) rows of `data`, so reuse only
# those row labels. Assigning the whole data.index fails when fewer rows were scored.
df1.index = data.index[:len(df1)]

val_data_index = val_data.index

# True for rows outside the validation split. A vectorised membership test is
# used because assigning with .loc for a label that is not in df1 would
# silently append a new, mostly-empty row.
df1['train_data'] = ~df1.index.isin(val_data_index)

In [None]:
# Write to the same location the predictions are read back from. Reusing
# prediction_path keeps the two from drifting apart.
df1.to_csv(prediction_path)

In [None]:
df1

In [None]:
def extract_numeric(dic):
    """Return the first value of a metric-result dict.

    Values that are not dicts are returned unchanged, so applying this
    function to an already-extracted column is a no-op rather than an error.
    An empty dict yields NaN.
    """
    if not isinstance(dic, dict):
        return dic
    return next(iter(dic.values()), float("nan"))

# Replace each metric-result dict in the column with its first value.
df1['bleu_score'] = df1['bleu_score'].apply(extract_numeric)

In [None]:
df1

In [None]:
# sst2, mnli, mnli_mismatched, mnli_matched, qnli, rte, wnli, cola,stsb, mrpc, qqp, and hans.

In [None]:
from evaluate import load

# Demo: GLUE metric, 'mrpc' configuration, on two integer labels that match.
glue_metric = load('glue', 'mrpc')  # 'mrpc' or 'qqp'
predictions = [0, 1]
references = [0, 1]
results = glue_metric.compute(references=references, predictions=predictions)
print(results)

In [None]:
from evaluate import load

# Demo: GLUE metric, 'stsb' configuration, on float predictions and references.
glue_metric = load('glue', 'stsb')
predictions = [-10., -11., -12., -13., -14., -15.]
references = [0., 1., 2., 3., 4., 5.]
results = glue_metric.compute(references=references, predictions=predictions)
print(results)

In [None]:
from evaluate import load

# Demo: GLUE metric, 'cola' configuration, with one of two predictions wrong.
glue_metric = load('glue', 'cola')
predictions = [1, 1]
references = [0, 1]
results = glue_metric.compute(references=references, predictions=predictions)
results