In [6]:
!pip install --quiet  datasets #to access race dataset
!pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
!pip install --quiet  tqdm     #for progress bars
!pip install --quiet transformers # for t5 model
!pip install --quiet tokenizers  #tokenizers from HuggingFace
!pip install --quiet sentencepiece #subword tokenizer used by T5
!pip install --quiet pytorch-lightning # pytorch wrapper
!pip install --quiet torchtext # text utilities

In [7]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy

In [8]:
device  = 'cuda' if torch.cuda.is_available() else "cpu"

In [9]:
pd.options.display.max_rows , pd.options.display.max_columns  = 100,100

In [10]:
def create_pandas_dataset(data,
                          answer_threshold=512,
                          verbose = False):

  ''' Create a Pandas Dataframe from hugging face dataset.
  Params:
        answer_threshold: Only consider those Question Answer pairs where the Answer is short.
  '''
  count_long ,count_short = 0 , 0
  result_df  = pd.DataFrame(columns = ['context', 'question','answer'])
  for index,val in enumerate(tqdm(data)):
      passage = val['article']
      question = val['question']
      ans = val['answer']
      answer = val['options'][ord(ans)-65]
      no_of_words = len(answer.split())
      if no_of_words >= answer_threshold:
          count_long = count_long + 1
          continue
      else:
          result_df.loc[count_short] = [passage] +[question] + [answer]
          count_short = count_short + 1
  if verbose:
    return (result_df,
            count_long,
            count_short)
  else:
    return result_df

In [11]:
train_dataset = load_dataset('race','high', split='train')
valid_dataset = load_dataset('race', 'high', split='validation')
print(f"Total Train Samples:{len(train_dataset)} , Total Validation Samples:{len(valid_dataset)}")

Found cached dataset race (/Users/dipteshmukherjee/.cache/huggingface/datasets/race/high/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)
Found cached dataset race (/Users/dipteshmukherjee/.cache/huggingface/datasets/race/high/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


Total Train Samples:62445 , Total Validation Samples:3451


In [12]:
df_train , df_validation = create_pandas_dataset(train_dataset) , create_pandas_dataset(valid_dataset)
print(f"\n Total Train Samples:{df_train.shape} , Total Validation Samples:{df_validation.shape}")

100%|██████████| 62445/62445 [02:54<00:00, 357.19it/s]
100%|██████████| 3451/3451 [00:02<00:00, 1174.15it/s]


 Total Train Samples:(62445, 3) , Total Validation Samples:(3451, 3)





In [13]:
# Saving data for future use
df_train.to_parquet('train_race.parquet')
df_validation.to_parquet('validation_race.parquet')

In [14]:
import pandas as pd
pd.read_parquet('train_race.parquet', engine='pyarrow')

Unnamed: 0,context,question,answer
0,Last week I talked with some of my students ab...,We can know from the passage that the author w...,teacher
1,Last week I talked with some of my students ab...,Many graduates today turn to cosmetic surgery ...,get an advantage over others in job-hunting
2,Last week I talked with some of my students ab...,"According to the passage, the author believes ...",media are to blame for misleading young people...
3,Last week I talked with some of my students ab...,Which' s the best title for the passage?.,Young Graduates Look to Surgery for Better Jobs
4,"YUZHOU, HENAN -An accident in a central China ...",What could be the best title for this passage?,A Coal Mine Accident in Central China
...,...,...,...
62440,What if I took that big jump on my bike?What's...,"According to the text,the teenager who explore...",have advantages over others.
62441,What if I took that big jump on my bike?What's...,What does the writer want to tell us by taking...,Mice also experience a period to explore the w...
62442,What if I took that big jump on my bike?What's...,What may the text discuss in the next part?,What really goes on in the teenage brain.
62443,"When officials in Richmond, B. C., Canada, tol...",Stephen Covey was doubtful at first because he...,youth crime was too complex for ordinary citiz...


In [15]:
# Creating a Pytorch DataSet for T5 Training and Validation
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [16]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small',model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [17]:
class QuestionGenerationDataset(Dataset):
    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=512):
        self.path = filepath

        self.passage_column = "context"
        self.answer = "answer"
        self.question = "question"

        # self.data = pd.read_csv(self.path)
        self.data = pd.read_parquet(self.path).iloc[:2000,:]
        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        return len(self.inputs)


    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  #squeeze to get rid of the batch dimension
        target_mask = self.targets[index]["attention_mask"].squeeze()  # convert [batch,dim] to [dim]


        labels = copy.deepcopy(target_ids)
        labels [labels==0] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        for rownum,val in tqdm(self.data.iterrows()): # Iterating over the dataframe
            passage,answer,target = val[self.passage_column],val[self.answer],val[self.question]

            input_ = f"context: {passage}" # T5 Input format for question answering tasks
            target = f"question: {str(target)} answer: {answer}" # Output format we require
            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len_input,padding='max_length',
                truncation = True,return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len_output,padding='max_length',
                truncation = True,
                return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [18]:
train_path = 'train_race.parquet' # change this accordingly
validation_path = 'validation_race.parquet'
train_dataset = QuestionGenerationDataset(t5_tokenizer,train_path)
validation_dataset = QuestionGenerationDataset(t5_tokenizer,validation_path)

2000it [00:03, 535.83it/s]
2000it [00:03, 534.96it/s]


In [19]:
# Data Sample

train_sample = train_dataset[10] # thanks to __getitem__
decoded_train_input = t5_tokenizer.decode(train_sample['source_ids'])
decoded_train_output = t5_tokenizer.decode(train_sample['target_ids'])

print(decoded_train_input)
print(decoded_train_output)

context: Understanding the process of making career choices and managing your career is a basic life skill that everyone should understand. Your career decisions have such a profound effect on all aspects of your life. It's important to have the knowledge and resources needed to make smart, informed decisions. Whether you are looking for a new job, aiming to take the next step at your current job or planning your retirement options, you are making career decisions. Using good resources and the guidance of a career counselor can help you to make those decisions well. Many people mistakenly believe that choosing a career is a one-time event that happens some time in early adulthood. However, career management is actually a life-long process, and we continue to make consequential career choices over the years. When people want to take action in their career, career management and job search are about so much more than writing a good resume. If you learn about and act on the following area

In [20]:
# Fine Tuning T5
import pytorch_lightning as pl
from torch.optim import AdamW
import argparse
from transformers import (
    get_linear_schedule_with_warmup
  )

class T5Tuner(pl.LightningModule):

    def __init__(self,t5model, t5tokenizer,batchsize=4):
        super().__init__()
        self.model = t5model
        self.tokenizer = t5tokenizer
        self.batch_size = batchsize

    def forward( self, input_ids, attention_mask=None,
                decoder_attention_mask=None,
                lm_labels=None):

         outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

         return outputs

    def training_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.batch_size,
                          num_workers=2)

    def val_dataloader(self):
        return DataLoader(validation_dataset,
                          batch_size=self.batch_size,
                          num_workers=2)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
        return optimizer

In [21]:
model = T5Tuner(t5_model,t5_tokenizer)

trainer = pl.Trainer(max_epochs = 1,accelerator=device)

trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /Users/dipteshmukherjee/Documents/GitHub/t5_Question_Answer/lightning_logs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'QuestionGenerationDataset' on <module '__main__' (built-in)>
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [22]:
# saving the model
!mkdir "t5_tokenizer"
!mkdir "t5_trained_model"
model.model.save_pretrained('t5_trained_model')
t5_tokenizer.save_pretrained('t5_tokenizer')

('t5_tokenizer/tokenizer_config.json',
 't5_tokenizer/special_tokens_map.json',
 't5_tokenizer/spiece.model',
 't5_tokenizer/added_tokens.json')

In [23]:
# Inference / Predictions
trained_model_path = 't5_trained_model'
trained_tokenizer = 't5_tokenizer'
device = 'cpu'

In [24]:
model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer)

In [25]:
context ="Holi is considered as one of the most revered and celebrated festivals of India and it is celebrated in almost every part of the country. It is also sometimes called as the “festival of love” as on this day people get to unite together forgetting all resentments and all types of bad feeling towards each other. The great Indian festival lasts for a day and a night, which starts in the evening of Purnima or the Full Moon Day in the month of Falgun. It is celebrated with the name Holika Dahan or Choti Holi on first evening of the festival and the following day is called Holi. In different parts of the country it is known with different names. The vibrancy of colors is something that brings in a lot of positivity in our lives and Holi being the festival of colours is actually a day worth rejoicing. Holi is a famous Hindu festival that is celebrated in every part of India with utmost joy and enthusiasm. The ritual starts by lighting up the bonfire one day before the day of Holi and this process symbolizes the triumph of good over the bad. On the day of Holi people play with colours with their friends and families and in evening they show love and respect to their close ones with Abeer."

text = "context: "+context
print(text)

context: Holi is considered as one of the most revered and celebrated festivals of India and it is celebrated in almost every part of the country. It is also sometimes called as the “festival of love” as on this day people get to unite together forgetting all resentments and all types of bad feeling towards each other. The great Indian festival lasts for a day and a night, which starts in the evening of Purnima or the Full Moon Day in the month of Falgun. It is celebrated with the name Holika Dahan or Choti Holi on first evening of the festival and the following day is called Holi. In different parts of the country it is known with different names. The vibrancy of colors is something that brings in a lot of positivity in our lives and Holi being the festival of colours is actually a day worth rejoicing. Holi is a famous Hindu festival that is celebrated in every part of India with utmost joy and enthusiasm. The ritual starts by lighting up the bonfire one day before the day of Holi and

In [26]:
encoding = tokenizer.encode_plus(text,max_length =512,padding='max_length',
                                 truncation = True,
                                 return_tensors="pt").to(device)
print (encoding.keys())
input_ids,attention_mask  = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

dict_keys(['input_ids', 'attention_mask'])


In [27]:
model.eval()

beam_outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=72, # How long the generated questions should be
    early_stopping=True,
    num_beams=5,
    num_return_sequences=5
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print(sent)

Holi is a famous Hindu festival that is celebrated in every part of India
Holi is considered as one of the most revered and celebrated festivals of India
Holi is one of the most revered and celebrated festivals of India
Holi is a famous Hindu festival
Holi


In [28]:
# Deployment Demo
!pip install --quiet gradio==3.9

In [29]:
def get_question(sentence,mdl,tknizer):

  ''' function to generate questions. Takes a sentence,answer,
      model and tokenizer
  '''

  text = "context: {}".format(sentence)
  #print (text)
  max_len = 256
  encoding = tknizer.encode_plus(text,max_length=max_len, pad_to_max_length=False,truncation=True, return_tensors="pt")

  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  outs = mdl.generate(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  early_stopping=True,
                                  num_beams=5,
                                  num_return_sequences=1,
                                  no_repeat_ngram_size=2,
                                  max_length=72)
  #print("outputs")
  #print(outs)
  for beam_output in outs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    #print(sent)

  dec = [tknizer.decode(ids,skip_special_tokens=True) for ids in outs]
  #print("decs")
  #print(dec)

  Question = dec[0].replace("question:","")
  #print(Question)
  Answer = dec[0].replace("answer:","")
  #print(Answer)
  Question= Question.strip()
  #Answer= Answer.strip()
  #print(Answer)

  return Question, Answer

In [30]:
context = "Donald Trump is an American media personality and businessman who served as the 45th president of the United States."

print("context: ",context)
ques, ans = get_question(context,model,tokenizer)
print ("question: ",ques)
#print ("answer: ",ans)

context:  Donald Trump is an American media personality and businessman who served as the 45th president of the United States.
question:  Donald Trump


In [31]:
import gradio as gr

context = gr.inputs.Textbox(lines=5,placeholder="Enter paragraph/context here...")
# answer = gr.inputs.Textbox(lines=3, placeholder="Enter answer/keyword here...")
question = gr.outputs.Textbox( type="auto", label="Question")
answer = gr.outputs.Textbox( type="auto", label="Answer")

def generate_question(context):
  return get_question(context,model,tokenizer)

iface = gr.Interface(
  fn=generate_question,
  inputs=[context],
  outputs=question)
  #outputs=answer)

iface.launch(debug=False,share=True)

TypeError: dataclass_transform() got an unexpected keyword argument 'field_specifiers'