<a href="https://colab.research.google.com/github/debarghaBhattacharjee/NLP-with-Transformers/blob/main/automatic_question_generation/aqg_with_trainer_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[sentencepiece]

In [None]:
!pip install datasets

#### Loading the dataset from the Hub

In [None]:
from datasets import load_dataset

In [None]:
# Load the "lmqg/qg_squad" question-generation dataset (all available splits).
raw_datasets = load_dataset("lmqg/qg_squad")
# Display the loaded object as the cell output.
raw_datasets

In [None]:
# Select the training split and show its first example to inspect the fields.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'answer': 'established beliefs or customs',
 'question': 'What is heresy mainly at odds with?',
 'sentence': 'Heresy is any provocative belief or theory that is strongly at variance with established beliefs or customs .',
 'paragraph': "Heresy is any provocative belief or theory that is strongly at variance with established beliefs or customs. A heretic is a proponent of such claims or beliefs. Heresy is distinct from both apostasy, which is the explicit renunciation of one's religion, principles or cause, and blasphemy, which is an impious utterance or action concerning God or sacred things.",
 'sentence_answer': 'Heresy is any provocative belief or theory that is strongly at variance with <hl> established beliefs or customs <hl> .',
 'paragraph_answer': "Heresy is any provocative belief or theory that is strongly at variance with <hl> established beliefs or customs <hl>. A heretic is a proponent of such claims or beliefs. Heresy is distinct from both apostasy, which is the explicit 

In [None]:
# Show the column names and value types of the training split.
raw_train_dataset.features

{'answer': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'sentence': Value(dtype='string', id=None),
 'paragraph': Value(dtype='string', id=None),
 'sentence_answer': Value(dtype='string', id=None),
 'paragraph_answer': Value(dtype='string', id=None),
 'paragraph_sentence': Value(dtype='string', id=None)}

#### Preprocessing the dataset

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Checkpoint id shared by the model and the tokenizer so that both are loaded
# from the same pretrained source.
checkpoint = "lmqg/t5-small-squad"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize a pair of sentences to inspect how the tokenizer encodes two text
# segments passed in a single call.
inputs = tokenizer("this is the first sentence", "this is the second sentence")
inputs

{'input_ids': [48, 19, 8, 166, 7142, 1, 48, 19, 8, 511, 7142, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Map the ids back to token strings to see where the segments are split.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['▁this',
 '▁is',
 '▁the',
 '▁first',
 '▁sentence',
 '</s>',
 '▁this',
 '▁is',
 '▁the',
 '▁second',
 '▁sentence',
 '</s>']

In [None]:
def tokenize_function(example):
    """Tokenize answer-highlighted sentences and their target questions.

    Args:
        example: A mapping (a single example or a batch) that provides the
            "sentence_answer" and "question" fields.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with the
        source text, the target text and truncation enabled.
    """
    # Source side: the sentence with the answer span highlighted.
    source_text = example["sentence_answer"]
    # Target side: the question the model should generate.
    target_text = example["question"]
    encoded = tokenizer(text=source_text, text_target=target_text, truncation=True)
    return encoded

In [None]:
# Apply tokenize_function to every split, passing examples in batches; the
# result gains the tokenizer's output columns next to the original ones.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

  0%|          | 0/76 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'sentence', 'paragraph', 'sentence_answer', 'paragraph_answer', 'paragraph_sentence', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 75722
    })
    validation: Dataset({
        features: ['answer', 'question', 'sentence', 'paragraph', 'sentence_answer', 'paragraph_answer', 'paragraph_sentence', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10570
    })
    test: Dataset({
        features: ['answer', 'question', 'sentence', 'paragraph', 'sentence_answer', 'paragraph_answer', 'paragraph_sentence', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11877
    })
})

#### Fine-tuning with the Trainer API

In [None]:
!pip install huggingface-hub

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook; the
# training arguments in this notebook set push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters reused for both the training and the evaluation batch size
# and for the number of passes over the training split.
batch_size=32
nb_train_epochs=10

# Show the training loss at the end of every epoch: log once per
# (number of training examples // batch size) steps.
logging_steps = len(tokenized_datasets["train"]) // batch_size
# Keep only the part of the checkpoint id after the last "/" as the run name.
model_name = checkpoint.split("/")[-1]
print(model_name)

# NOTE(review): no save schedule is set here and the training log shows
# checkpoints written at steps 500 and 1000; consider setting `save_strategy`
# and `save_total_limit` if disk/Hub usage matters — confirm against the docs.
args = Seq2SeqTrainingArguments(
    # Local output directory, derived from the checkpoint name.
    output_dir=f"{model_name}-finetuned",
    # Run evaluation once per epoch.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    num_train_epochs=nb_train_epochs,
    # compute_metrics decodes the predictions directly as token ids, so
    # evaluation must produce generated sequences.
    predict_with_generate=True,
    logging_steps=logging_steps,
    # Requires being logged in to the Hub (see notebook_login()).
    push_to_hub=True,
)

In [None]:
!pip install evaluate

In [None]:
import numpy as np
import torch
import evaluate

In [None]:
# Load the BLEU metric; compute_metrics calls its `compute` method.
bleu_score = evaluate.load("bleu")

In [None]:
def compute_metrics(eval_pred):
    """Compute BLEU between generated questions and reference questions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            generated token ids, or a tuple whose first element does.
            ``labels`` holds the reference token ids. Either array may contain
            ``-100`` at positions that must be ignored.

    Returns:
        The dictionary returned by ``bleu_score.compute`` for the decoded
        predictions and references.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id and cannot be decoded, so swap it for the
    # pad token, which `skip_special_tokens=True` then drops. This is applied
    # to predictions as well as labels because both can be padded with -100.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode the generated and the reference questions into text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The metric is called with a list of references per prediction; each
    # example here has exactly one.
    references = [[decoded_label] for decoded_label in decoded_labels]

    return bleu_score.compute(predictions=decoded_preds, references=references)

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
# Collator that assembles tokenized examples into batches for the trainer.
# NOTE(review): presumably pads inputs and labels per batch — confirm in the
# DataCollatorForSeq2Seq documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
from transformers import Seq2SeqTrainer

# Wire together the model, the training arguments, the tokenized train and
# validation splits, the batch collator and the BLEU metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning; with evaluation_strategy="epoch" the validation loss and
# BLEU are reported after every epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: paragraph_sentence, question, sentence_answer, paragraph, answer, paragraph_answer, sentence. If paragraph_sentence, question, sentence_answer, paragraph, answer, paragraph_answer, sentence are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 75722
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 23670
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.9131,1.895996,0.186059,"[0.5106522785325583, 0.24527605096325475, 0.15213089101620028, 0.09826831888082575]",0.894432,0.899631,108052,120107
2,1.849,1.880572,0.18493,"[0.5080246970549962, 0.24224047124755838, 0.1501267012945318, 0.09769844997651479]",0.897198,0.902137,108353,120107
3,1.8168,1.872697,0.18539,"[0.5098220476080425, 0.24339941601352388, 0.15093927730223472, 0.09804485712417446]",0.895563,0.900655,108175,120107
4,1.7923,1.869982,0.186255,"[0.5133830790362698, 0.24615653748790878, 0.15267642711989654, 0.09868749835608512]",0.891631,0.8971,107748,120107
5,1.7748,1.868894,0.186901,"[0.5141749342160318, 0.24699161674176884, 0.1534446643289472, 0.0998958319598096]",0.889794,0.895443,107549,120107
6,1.7587,1.869837,0.186411,"[0.5146328972484753, 0.24659953524399691, 0.1532201031824242, 0.09970271520116271]",0.88837,0.894161,107395,120107
7,1.7468,1.868036,0.186049,"[0.5112671501824734, 0.24460144371064352, 0.15145530742292898, 0.09809866056844169]",0.896114,0.901155,108235,120107
8,1.7378,1.86696,0.187567,"[0.5122261914652045, 0.24610537728997678, 0.15275308797724588, 0.09927828458817849]",0.897014,0.901971,108333,120107
9,1.7312,1.867626,0.187558,"[0.5117292997446746, 0.24587669400218548, 0.15249172858304044, 0.0989853996535511]",0.898446,0.90327,108489,120107
10,1.7271,1.866812,0.187299,"[0.5110525491352382, 0.245362761211552, 0.15215077757561193, 0.09884530767928974]",0.898794,0.903586,108527,120107


Saving model checkpoint to t5-small-squad-finetuned/checkpoint-500
Configuration saved in t5-small-squad-finetuned/checkpoint-500/config.json
Model weights saved in t5-small-squad-finetuned/checkpoint-500/pytorch_model.bin
tokenizer config file saved in t5-small-squad-finetuned/checkpoint-500/tokenizer_config.json
Special tokens file saved in t5-small-squad-finetuned/checkpoint-500/special_tokens_map.json
Copy vocab file to t5-small-squad-finetuned/checkpoint-500/spiece.model
tokenizer config file saved in t5-small-squad-finetuned/tokenizer_config.json
Special tokens file saved in t5-small-squad-finetuned/special_tokens_map.json
Copy vocab file to t5-small-squad-finetuned/spiece.model
Saving model checkpoint to t5-small-squad-finetuned/checkpoint-1000
Configuration saved in t5-small-squad-finetuned/checkpoint-1000/config.json
Model weights saved in t5-small-squad-finetuned/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-small-squad-finetuned/checkpoint-1000/tokenize

TrainOutput(global_step=23670, training_loss=1.7847247747011108, metrics={'train_runtime': 7200.3933, 'train_samples_per_second': 105.164, 'train_steps_per_second': 3.287, 'total_flos': 2.3301590188621824e+16, 'train_loss': 1.7847247747011108, 'epoch': 10.0})

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub (requires the login
# performed earlier with notebook_login()).
trainer.push_to_hub()

In [None]:
# Optional: zip the fine-tuned model directory and download it directly from
# Colab to your local machine. Uncomment the lines below to use it.
# (Written as comments rather than a bare string literal, which the notebook
# would echo as the cell's output.)
# !zip -r "t5-small-squad-finetuned.zip" t5-small-squad-finetuned

# from google.colab import files
# files.download("t5-small-squad-finetuned.zip")

'\nUse this code segment to zip and download the directly from Colab to your local machine.\n'