In [None]:
# check for the GPU provided in the runtime
!nvidia-smi

In [None]:
# using quiet method for controlling the log
# for suppressing the colored errors and warning in the terminal
!pip install --quiet transformers==4.1.1
# pytorch lightning for smoother model training and data loading
#!pip install --quiet https://github.com/PyTorchLightning/pytorch-lightning/releases/download/1.2.6/pytorch-lightning-1.2.6.tar.gz 
!pip install -q pytorch-lightning  
# using HuggingFace tokenizers
!pip install --quiet tokenizers==0.9.4
# Google's sentencepiece
!pip install --quiet sentencepiece==0.1.94

In [None]:
# argparse makes it easier to write user friendly command line interfaces
import argparse
# package for faster file name matching
import glob
# making directories for data
import os
# reading json files as the data is present in json files
import json
# time module for calculating the model runtime
import time
# Allows writing status messages to a file
import logging
# generate random float numbers uniformly
import random
# regex module for text 
import re
# itertools provides functions that work on
# iterators to produce more complex iterators
from itertools import chain
from string import punctuation

# pandas for data manipulation
import pandas as pd
# numpy for array operations
import numpy as np
# PyTorch
import torch
# provides various classes representing file system paths
# with appropriate semantics
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# splitting the data 
from sklearn.model_selection import train_test_split
# ANSI color formatting for output in the terminal
from termcolor import colored
# wrapping paragraphs into string
import textwrap

# model checkpoints in pretrained model
from pytorch_lightning.callbacks import ModelCheckpoint

'''
optimizer - AdamW
T5 Conditional Generator in which we'll give conditions
T5 tokenizer because it is fast
training the model without a learning rate
'''
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [None]:
# Seeds all the processes including numpy torch and other imported modules.
pl.seed_everything(0)

In [None]:
# check the version provided by Lightning
import pytorch_lightning as pl
print(pl.__version__)

In [None]:
import json

with open('/content/pqa_train.json', 'r') as f:
  data = json.load(f)



In [None]:
data['data'][1].keys()

In [None]:
# len 
len(data['data'])

In [None]:
# We have a list of dictionaries in the "data". We can explore the 0th element
data['data'][0].keys()

In [None]:
data['data'][1]['title']

In [None]:
len(data['data'][0]['paragraphs'])

In [None]:
questions = data['data'][1]['paragraphs']

In [None]:
# datapoint sample
questions[0]

# Function to create a pandas DataFrame of questions and answers

In [None]:
def extract_questions_and_answers(factoid_path):
  """Flatten a SQuAD-style JSON file into one row per (question, answer) pair.

  The file is expected to look like
  ``{"data": [{"paragraphs": [{"context": ..., "qas": [{"question": ...,
  "answers": [{"text": ..., "answer_start": ...}]}]}]}]}``.

  Args:
    factoid_path: path to the JSON file, as a ``str`` or ``pathlib.Path``.

  Returns:
    A ``pd.DataFrame`` with the columns ``question``, ``context``,
    ``answer_text``, ``answer_start`` and ``answer_end``. ``answer_end`` is the
    exclusive end index, i.e. ``context[answer_start:answer_end]`` is the span
    that has the same length as ``answer_text``. Questions without answers
    produce no rows; an empty dataset yields an empty frame with these columns.
  """
  columns = ["question", "context", "answer_text", "answer_start", "answer_end"]

  # Decode explicitly as UTF-8 rather than relying on the platform's default
  # encoding, which is not guaranteed to handle non-ASCII text.
  with Path(factoid_path).open(encoding="utf-8") as json_file:
    dataset = json.load(json_file)

  data_rows = []
  for article in dataset['data']:
    for paragraph in article['paragraphs']:
      context = paragraph['context']
      for qa in paragraph['qas']:
        question_text = qa['question']
        for answer in qa['answers']:
          answer_text = answer['text']
          answer_start = answer['answer_start']
          data_rows.append({
              "question": question_text,
              "context": context,
              "answer_text": answer_text,
              "answer_start": answer_start,
              # exclusive end index of the answer inside the paragraph
              "answer_end": answer_start + len(answer_text),
          })

  return pd.DataFrame(data_rows, columns=columns)

In [None]:
factoid_path = Path("/content/pqa_train.json")
df = extract_questions_and_answers(factoid_path)
df.head(10)

In [None]:
df.shape

In [None]:
sample_question = df.iloc[243]
sample_question

In [None]:
# Use termcolor to highlight the answer within the context

def color_answer(question):
  """Return the context of a QA row with the answer span highlighted in green.

  Args:
    question: a row (mapping / ``pd.Series``) with the keys ``context``,
      ``answer_start`` and ``answer_end``.

  Returns:
    The context as a single string wrapped in termcolor escape codes: white
    before and after the answer, green for the answer itself.
  """
  answer_start, answer_end = question["answer_start"], question["answer_end"]
  context = question['context']

  # answer_end is computed as answer_start + len(answer_text), i.e. it is already
  # an exclusive index, so it is used as the slice bound directly. Adding 1 would
  # colour one character past the answer.
  return  colored(context[:answer_start], "white") + \
    colored(context[answer_start:answer_end], "green") + \
    colored(context[answer_end:], "white")


In [None]:
print(sample_question['question'])
print()
print("Answer: ")
for wrap in textwrap.wrap(color_answer(sample_question), width = 100):
  print(wrap)

# Tokenization

In [None]:
# using the base T5 model having 222M params
MODEL_NAME ='t5-base'

In [None]:
sample_question['context']

In [None]:
'''tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_tokens(['!',
 '-',
 ' ',
 '_',
 '؛',
 '؟',
 'ء',
 'آ',
 'ئ',
 'ا',
 'ب',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ف',
 'ق',
 'ل',
 'م',
 'ن',
 'ه',
 'و',
 '٪',
 'پ',
 'چ',
 'ژ',
 'ک',
 'گ',
 'ی',
 '۰',
 '۱',
 '۲',
 '۳',
 '۴',
 '۵',
 '۶',
 '۷',
 '۸',
 '۹'])
#model.resize_token_embeddings(len(tokenizer))'''


In [None]:
from transformers import BertConfig, BertTokenizer

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'



tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)


In [None]:
sample_comment = "ما در هوش‌واره معتقدیم با انتقال صحیح دانش و آگاهی، همه افراد میتوانند از ابزارهای هوشمند استفاده کنند. شعار ما هوش مصنوعی برای همه است."
#tokenizer.tokenize(text)
tokens = tokenizer.tokenize(sample_comment)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f'  Comment: {sample_comment}')
print(f'   Tokens: {tokenizer.convert_tokens_to_string(tokens)}')
print(f'Token IDs: {token_ids}')

In [None]:
pred_translated = [
         tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for gen_id in token_ids
]
print(pred_translated)

In [None]:
sample_encoding = tokenizer(f"{sample_question['context']}")

In [None]:
sample_encoding.keys()

In [None]:
print(sample_encoding["input_ids"])

In [None]:
print(sample_encoding["attention_mask"])

In [None]:
print(len(sample_encoding['input_ids']), len(sample_encoding['attention_mask']))

In [None]:
# Checking the decoding of the input ids

preds = [
         tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for input_id in sample_encoding['input_ids']
]

In [None]:
preds= " ".join(preds)
for wrap in textwrap.wrap(preds, width = 80):
  print(wrap)

There is a special separator token between the question and its context.

Checking the encoding on the sample question

In [None]:
encoding = tokenizer(
    sample_question['question'],
    sample_question['context'],
    max_length=396,
    padding='max_length',
    truncation="only_second",
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt"
)

In [None]:
encoding.keys()

In [None]:
tokenizer.special_tokens_map

In [None]:
tokenizer.eos_token, tokenizer.eos_token_id
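# NOTE: an input id of 1 is the end-of-sequence token for T5Tokenizer; `tokenizer` here is
# the BertTokenizer loaded above, so rely on this cell's output rather than assuming id 1.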

In [None]:
# Text representation of the input ids

tokenizer.decode(encoding['input_ids'].squeeze())

## Creating the labels for the answers

In [None]:
answer_encoding = tokenizer(
    sample_question['answer_text'],
    max_length=32,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt"
)

In [None]:
tokenizer.decode(answer_encoding['input_ids'].squeeze())

In [None]:
labels = answer_encoding["input_ids"]
labels

The padding positions in the labels (token id 0, which follow the end of the answer) have to be replaced with -100 so that the model ignores them when computing the loss.

In [None]:
labels[labels == 0] = -100

In [None]:
labels

## To create dataset

In [None]:
class QADataset(Dataset):
  """Torch dataset that tokenizes (question, context) -> answer pairs on the fly.

  Args:
    data: frame with at least the columns ``question``, ``context`` and
      ``answer_text``.
    tokenizer: a Hugging Face style tokenizer. It is called with one or two
      texts plus ``max_length`` / ``padding`` / ``truncation`` /
      ``return_tensors`` keyword arguments and must expose ``pad_token_id``.
    source_max_token_len: padded/truncated length of the encoded
      question + context pair.
    target_max_token_len: padded/truncated length of the encoded answer.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32,
      ):
    self.data = data
    self.tokenizer = tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Number of (question, answer) rows in the dataset."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Encode the row at position ``index``.

    Returns:
      A dict with the raw ``question``, ``context`` and ``answer_text`` strings
      plus the flattened tensors ``input_ids``, ``attention_mask`` (encoded
      question + context) and ``labels`` (encoded answer, with padding
      positions set to -100).
    """
    data_row = self.data.iloc[index]

    # Use the tokenizer handed to this dataset, not whatever global `tokenizer`
    # happens to exist in the notebook.
    source_encoding = self.tokenizer(
      data_row['question'],
      data_row['context'],
      max_length=self.source_max_token_len,
      padding='max_length',
      truncation="only_second",  # never cut the question, only the context
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )

    target_encoding = self.tokenizer(
      data_row['answer_text'],
      max_length=self.target_max_token_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )

    # Replace padding positions with -100 so they are ignored by the loss.
    labels = target_encoding['input_ids']
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        question=data_row['question'],
        context=data_row['context'],
        answer_text=data_row['answer_text'],
        input_ids=source_encoding["input_ids"].flatten(),
        attention_mask=source_encoding['attention_mask'].flatten(),
        labels=labels.flatten()
    )




In [None]:
sample_dataset = QADataset(df, tokenizer)

In [None]:
for data in sample_dataset:
  print("Question: ", data['question'])
  print("Answer text: ", data['answer_text'])
  print("Input_ids: ", data['input_ids'][:10])
  print("Labels: ", data['labels'][:10])
  break

## Splitting into train and validation sets

In [None]:
train_df, val_df = train_test_split(df, test_size=0.05)

In [None]:
train_df.shape,  val_df.shape

# Create pytorch lightning datamodule

In [None]:
class DataModule(pl.LightningDataModule):
  """Lightning data module that wraps the train and held-out frames in QADataset.

  Args:
    train_df: rows used for training.
    test_df: held-out rows; served by both ``val_dataloader`` and
      ``test_dataloader``.
    tokenizer: tokenizer forwarded unchanged to ``QADataset``.
    batch_size: batch size of the train and validation loaders (the test
      loader always uses a batch size of 1).
    source_max_token_len: forwarded to ``QADataset``.
    target_max_token_len: forwarded to ``QADataset``.
    num_workers: number of worker processes for every ``DataLoader``.
      Defaults to 4, the value that was previously hard-coded.
  """

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer,
      batch_size: int = 8,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32,
      num_workers: int = 4,
      ):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len
    self.num_workers = num_workers

  def setup(self, stage = None):
    """Build the train and test datasets; ``stage`` is accepted but not used."""
    self.train_dataset = QADataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

    self.test_dataset = QADataset(
        self.test_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=self.num_workers
        )

  def val_dataloader(self):
    """Unshuffled loader over the held-out dataset, used for validation."""
    return DataLoader(
        self.test_dataset,
        batch_size=self.batch_size,
        num_workers=self.num_workers
        )

  def test_dataloader(self):
    """Loader over the same held-out dataset, one example per batch."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=self.num_workers
        )

In [None]:
BATCH_SIZE = 4
N_EPOCHS = 3

data_module = DataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

## Building the PyTorch lightning module using T5ForConditionalGeneration model

In [None]:
class QAModel(pl.LightningModule):
  """Lightning wrapper around ``T5ForConditionalGeneration``.

  The pretrained weights named by the notebook-level ``MODEL_NAME`` are loaded
  and the token embeddings are resized to ``len(tokenizer)``, where
  ``tokenizer`` is the notebook-level tokenizer.
  """

  def __init__(self):
    super().__init__()
    t5 = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
    # the embedding table has to match the size of the tokenizer's vocabulary
    t5.resize_token_embeddings(len(tokenizer))
    self.model = t5

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return ``(loss, logits)``."""
    result = self.model(input_ids, attention_mask=attention_mask, labels=labels)
    return result.loss, result.logits

  def _run_batch(self, batch, metric_name):
    """Forward one batch, log its loss as ``metric_name``; return (loss, logits, labels)."""
    targets = batch['labels']
    loss, logits = self(batch['input_ids'], batch['attention_mask'], targets)
    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss, logits, targets

  def training_step(self, batch, batch_idx):
    """Log ``train_loss`` and return the loss together with logits and labels."""
    loss, logits, targets = self._run_batch(batch, "train_loss")
    return {"loss": loss, "predictions": logits, "labels": targets}

  def validation_step(self, batch, batch_idx):
    """Log ``val_loss`` and return the loss."""
    loss, _, _ = self._run_batch(batch, "val_loss")
    return loss

  def test_step(self, batch, batch_idx):
    """Log ``test_loss`` and return the loss."""
    loss, _, _ = self._run_batch(batch, "test_loss")
    return loss

  def configure_optimizers(self):
    """AdamW over all parameters with a fixed learning rate of 1e-4."""
    return AdamW(self.parameters(), lr=0.0001)

In [None]:
model = QAModel() 

## Using trainer from pytorch lightning to finetune model using our dataset

In [None]:
# To record the best performing model using checkpoint

checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

In [None]:
#logger = TensorBoardLogger("training-logs", name="qa")

In [None]:
#logger = TensorBoardLogger("training-logs", name="qa")
# NOTE(review): N_EPOCHS is defined earlier in the notebook but max_epochs is
# hard-coded here with a different value -- confirm which one is intended.
# NOTE(review): pytorch-lightning is installed unpinned; `gpus=` is not accepted by
# every Lightning release -- confirm against the installed version.
trainer = pl.Trainer(
    #logger = logger,
    # Pass callbacks as a list: that form is accepted by every Lightning release,
    # whereas a bare callback object is not.
    callbacks=[checkpoint_callback],
    max_epochs=5,
    gpus=1,
    #progress_bar_refresh_rate = 30
)

## Loading Tensorboard

In [None]:
#%load_ext tensorboard

In [None]:
#%tensorboard --logdir ./lightning_logs

In [None]:
#!rm --rf lightning_logs

In [None]:
trainer.fit(model, data_module)

In [None]:
#trainer.test()  # evaluate the model according to the last checkpoint

# Predictions

In [None]:
!ls -sh /content/checkpoints/best-checkpoint.ckpt

In [None]:
#trained_model = QAModel.load_from_checkpoint("checkpoints/best-checkpoint.ckpt")
#trained_model.freeze() # 

## Generate answers for the questions in the validation set

In [None]:
trained_model = model

In [None]:
def generate_answer(question):
  """Generate an answer string for one QA row with the fine-tuned model.

  Args:
    question: a row (mapping / ``pd.Series``) with the keys ``question`` and
      ``context``.

  Returns:
    The decoded generated sequence(s) joined into a single string, with
    special tokens removed.

  Note:
    Uses the notebook-level ``tokenizer`` and ``trained_model`` and switches
    the wrapped model to evaluation mode.
  """
  source_encoding = tokenizer(
      question["question"],
      question['context'],
      max_length=396,
      padding="max_length",
      truncation="only_second",
      return_attention_mask=True,
      # Must match the encoding used in QADataset during training, otherwise the
      # model sees inputs without the special/separator tokens it was trained on.
      add_special_tokens=True,
      return_tensors="pt"
  )

  generator = trained_model.model
  # Disable dropout so that generation is deterministic.
  generator.eval()
  # The model may still live on the GPU after training; put the inputs on the
  # same device to avoid a device-mismatch error.
  device = generator.device

  generated_ids = generator.generate(
      input_ids=source_encoding["input_ids"].to(device),
      attention_mask=source_encoding["attention_mask"].to(device),
      num_beams=1,  # greedy search
      max_length=80,
      repetition_penalty=2.5,
      early_stopping=True,
      use_cache=True)

  preds = [
      tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for generated_id in generated_ids
  ]

  return "".join(preds)

In [None]:
sample_question = val_df.iloc[20]

In [None]:
sample_question["question"]

In [None]:
sample_question["answer_text"]  # Label Answer

In [None]:
print(generate_answer(sample_question))  # Predicted answer

In [None]:
sample_question = val_df.iloc[66]
sample_question["question"]

In [None]:
sample_question["answer_text"]

In [None]:
generate_answer(sample_question)    

In [None]:
sample_question = val_df.iloc[114]
sample_question["question"]

In [None]:
sample_question["answer_text"]

In [None]:
generate_answer(sample_question)

In [None]:
sample_question = val_df.iloc[10]
sample_question["question"]

In [None]:
sample_question["answer_text"]

In [None]:
generate_answer(sample_question)

In [None]:
sample_question = val_df.iloc[77]
sample_question["question"]

In [None]:
sample_question["answer_text"]

In [None]:
generate_answer(sample_question)