### Finetune the model for QG

This pipeline is based on this [colab tutorial](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=-gJOEe0Ye0di)

This pipeline failed due to CUDA out of memory.

In [7]:
import torch
torch.cuda.is_available()

False

#### load the data

In [1]:
import pickle
with open("enlarged_eval.pickle", "rb") as fp:
    enlarged_eval = pickle.load(fp)

with open("enlarged_finetune.pickle", "rb") as fp:
    enlarged_finetune = pickle.load(fp)

In [2]:
import random
random.seed(0)
random.shuffle(enlarged_finetune)

In [3]:
# data: [context, answer, questions]
train_dataset = {'question':[], 'answer':[], 'context':[]}
for data in enlarged_finetune[:3500]:
    for question in data[2]:
        train_dataset['answer'].append(data[1])
        train_dataset['context'].append(data[0])
        train_dataset['question'].append(question)

valid_dataset = {'question':[], 'answer':[], 'context':[]}
for data in enlarged_finetune[3500:]:
    for question in data[2]:
        valid_dataset['answer'].append(data[1])
        valid_dataset['context'].append(data[0])
        valid_dataset['question'].append(question)

In [4]:
from nlp import Dataset
train_dataset = Dataset.from_dict(train_dataset)
valid_dataset = Dataset.from_dict(valid_dataset)

  from .autonotebook import tqdm as notebook_tqdm


#### set up the config for trainer

In [5]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, EvalPrediction
import numpy as np
import torch
model_name = "google-t5/t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# process the examples in input and target text format and the eos token at the end 

def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answer']
    return example

# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], pad_to_max_length=True, max_length=512)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], pad_to_max_length=True, max_length=50)

    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        # 'target_ids': target_encodings['input_ids'],
        # 'target_attention_mask': target_encodings['attention_mask']
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

    return encodings

In [7]:

train_dataset = train_dataset.map(add_eos_to_examples)
train_dataset = train_dataset.map(convert_to_features, batched=True)

valid_dataset = valid_dataset.map(add_eos_to_examples)
valid_dataset = valid_dataset.map(convert_to_features, batched=True)


# set the tensor type and the columns which the dataset should return
columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask', 'labels']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

  0%|          | 0/6442 [00:00<?, ?it/s]100%|██████████| 6442/6442 [00:00<00:00, 12531.85it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 7/7 [00:24<00:00,  3.46s/it]
100%|██████████| 2699/2699 [00:00<00:00, 15758.58it/s]
100%|██████████| 3/3 [00:10<00:00,  3.47s/it]


In [8]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [9]:
from transformers import (
    Trainer,Seq2SeqTrainer,
    TrainingArguments, Seq2SeqTrainingArguments,DataCollatorForSeq2Seq,
    DataCollator)

batch_size = 8
model_name = "google-t5/t5-large"
training_args = Seq2SeqTrainingArguments(
    output_dir='/',
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    # fp16=True,
)
data_collator = DataCollatorForSeq2Seq(tokenizer)

# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Initialize our Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     tokenizer=tokenizer,
#     # data_collator=T2TDataCollator()
# )

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss


In [None]:
# Evaluation


results = {}
if training_args.do_eval:
    # logger.info("*** Evaluate ***")
    print("*** Evaluate ***")
    eval_output = trainer.evaluate()

    # output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    # with open(output_eval_file, "w") as writer:
        # logger.info("***** Eval results *****")
    print("***** Eval results *****")
    for key in sorted(eval_output.keys()):
        # logger.info("  %s = %s", key, str(eval_output[key]))
        # writer.write("%s = %s\n" % (key, str(eval_output[key])))
        print("  %s = %s", key, str(eval_output[key]))
        print("%s = %s\n" % (key, str(eval_output[key])))

    results.update(eval_output)
