# Gerador de Perguntas Dinâmico Utilizando T5 e PTT5 em Português/Inglês 
## Parte 2 - Treinando o modelo para geração automática das perguntas

*Baseado no incrível trabalho realizado por Suraj Patil* -  [Question Generation 
Using Transformers](https://github.com/patil-suraj/question_generation)  

--- 
1.   A preparação do dataset foi gerado no notebook anterior [Preparação dos Dados](https://).
2.   O Tokenizer foi preparado no passado anterior (adição dos tokens especiais).
3.   Utilização da classe Trainer (Hugginface) para treino do modelo.
4.   Tratamento para treino em diferentes modelos e linguas (português ou inglês).
5.   Após o treinamento o modelo é salvo e o diretório faz referencia ao modelo original utilizado para treinamento (modelos T5 em português e inglês).





In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
sys.path.insert(0,'/content/drive/My Drive/python/question-generator')

In [None]:
!pip install transformers
!pip install nlp
!pip install sentencepiece
!pip install datasets

In [4]:
import dataclasses
import json
import logging
import os
import sys
from datasets import Dataset
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import numpy as np
import torch

import inspect

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    T5Tokenizer,
    DataCollator,
    TrainingArguments,
    set_seed,
    Trainer
)


from data_collator import T2TDataCollator
from utils import freeze_embeds, assert_not_all_frozen

In [5]:
GDRIVE_PATH = '/content/drive/MyDrive'

LANGUAGE_PORTUGUES = "pt_br"
LANGUAGE_ENGLISH = "en"

DATASET_PREPARED_PATH = os.path.join(GDRIVE_PATH, 'dataset', 'question-generator')
DATASET_PREPARED_PREFIX = 'data_qg_'
DATASET_PREPARED_SUFFIX = '.pt'

MODEL_PTT5_BASE = 'unicamp-dl/ptt5-base-portuguese-vocab'
MODEL_PTT5_LARGE = 'unicamp-dl/ptt5-large-portuguese-vocab'
MODEL_T5_BASE= "t5-base"

TOKENIZER_PATH = os.path.join(GDRIVE_PATH, 'model', 'tokenizer')
OUTPUT_MODEL_ROOT_PATH = os.path.join(GDRIVE_PATH, 'model')

MAX_SOURCE_LENGTH=512
MAX_TARGET_LENGTH=32

In [None]:
#parametros originais para referencia
args_dict = {
    "model_name_or_path": "t5-small",
    "model_type": "t5",
    "tokenizer_name_or_path": "t5_qg_tokenizer",
    "output_dir": "t5-small-qg-hl",
    "train_file_path": "data/train_data_qg_hl_t5.pt",
    "valid_file_path": "data/valid_data_qg_hl_t5.pt",
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "gradient_accumulation_steps": 8,
    "learning_rate": 1e-4,
    "num_train_epochs": 10,
    "seed": 42,
    "do_train": True,
    "do_eval": True,
    "evaluate_during_training": True,
    "logging_steps": 100    
}


In [6]:
# https://huggingface.co/transformers/_modules/transformers/training_args.html
def get_training_arguments(output_model_path: str):
    training_args= TrainingArguments(
        per_device_eval_batch_size= 32,
        gradient_accumulation_steps= 8,
        learning_rate= 1e-4,
        num_train_epochs= 10,
        seed= 42,
        do_train= True,
        do_eval= True,
        logging_steps= 100,
        output_dir= output_model_path,
        prediction_loss_only=True,
        label_smoothing_factor=0
    )
    return training_args

In [7]:
# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO 
)
#logger.warning(
#    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
#    training_args.local_rank,
#    training_args.device,
#    training_args.n_gpu,
#    bool(training_args.local_rank != -1),
#    training_args.fp16,
#)
#logger.info("Training/evaluation parameters %s", training_args)

In [8]:
def get_tokenizer(tokenizer_model_path: str):
    tokenizer = T5Tokenizer.from_pretrained(
        tokenizer_model_path
    )   
    return tokenizer

def getModel(model_path_name: str, tokenizer: T5Tokenizer):
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_path_name    
    )

    model.resize_token_embeddings(len(tokenizer))
    return model


In [9]:
def load_datasets(train_file_name: str, validation_file_name: str):
    # Get datasets
    logger.info('loading dataset')

    train_dataset = torch.load(train_file_name)
    validation_dataset = torch.load(validation_file_name)

    logger.info('finished loading dataset')

    return train_dataset, validation_dataset

In [10]:
#signature = inspect.signature(self.model.forward)
#signature_columns = list(signature.parameters.keys())

In [11]:
def prepare_trainer(training_args: TrainingArguments, tokenizer: T5Tokenizer, model, train_dataset: Dataset, validation_dataset: Dataset):
    # Initialize data_collator
    data_collator = T2TDataCollator(
        tokenizer=tokenizer,
        model_type="t5",
        mode="training",
        using_tpu=training_args.tpu_num_cores is not None
    )

    # Initialize our Trainer*
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        data_collator=data_collator
    )

    return trainer

In [12]:
def train_model(trainer: Trainer, tokenizer: T5Tokenizer, training_args:TrainingArguments):
    # Training
    # Trocar resume_from_checkpoint se já existir treinamento anterior
    if training_args.do_train:
        trainer.train(
            #resume_from_checkpoint=True
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)


In [13]:
model_path = MODEL_T5_BASE #MODEL_PTT5_BASE #MODEL_PTT5_LARGE
model_name = model_path.split(sep="/")[-1]
tokenizer_path = os.path.join(TOKENIZER_PATH, model_name + '-qg-tokenizer')
output_model_path = os.path.join(OUTPUT_MODEL_ROOT_PATH, model_name + '-qg')
train_dataset_filename = os.path.join(DATASET_PREPARED_PATH,
                                      'train_'+ DATASET_PREPARED_PREFIX + model_name + DATASET_PREPARED_SUFFIX)
validation_dataset_filename = os.path.join(DATASET_PREPARED_PATH,
                                           'valid_'+ DATASET_PREPARED_PREFIX + model_name + DATASET_PREPARED_SUFFIX)


tokenizer = get_tokenizer(tokenizer_model_path=tokenizer_path)
model = getModel(model_path_name=model_path, tokenizer=tokenizer)
train_dataset, validation_dataset = load_datasets(train_file_name=train_dataset_filename, 
                                                  validation_file_name=validation_dataset_filename)
training_arguments = get_training_arguments(output_model_path=output_model_path)
trainer: Trainer = prepare_trainer(training_args=training_arguments, tokenizer=tokenizer, model=model, train_dataset= train_dataset, validation_dataset=validation_dataset )

train_model(trainer=trainer, tokenizer=tokenizer, training_args=training_arguments)

07/10/2021 08:11:22 - INFO - filelock -   Lock 140135608182928 acquired on /root/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…

07/10/2021 08:11:22 - INFO - filelock -   Lock 140135608182928 released on /root/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637.lock





07/10/2021 08:11:23 - INFO - filelock -   Lock 140135608182928 acquired on /root/.cache/huggingface/transformers/ab4e948915b067f5cb6e5105f6f85044fd717b133f43240db67899a8fc7b29a2.26934c75adf19ceac3c268b721ba353356b7609c45f5627550326f275a2163b4.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…

07/10/2021 08:11:50 - INFO - filelock -   Lock 140135608182928 released on /root/.cache/huggingface/transformers/ab4e948915b067f5cb6e5105f6f85044fd717b133f43240db67899a8fc7b29a2.26934c75adf19ceac3c268b721ba353356b7609c45f5627550326f275a2163b4.lock





07/10/2021 08:11:54 - INFO - __main__ -   loading dataset
07/10/2021 08:11:57 - INFO - nlp.utils.file_utils -   PyTorch version 1.9.0+cu102 available.
07/10/2021 08:11:57 - INFO - nlp.utils.file_utils -   TensorFlow version 2.5.0 available.
07/10/2021 08:11:58 - INFO - __main__ -   finished loading dataset
***** Running training *****
  Num examples = 18896
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 2950


Step,Training Loss
100,2.7811
200,2.0554
300,1.9236
400,1.7825
500,1.7664
600,1.7183
700,1.6599
800,1.6544
900,1.6448
1000,1.5755


Saving model checkpoint to /content/drive/MyDrive/model/t5-base-qg/checkpoint-500
Configuration saved in /content/drive/MyDrive/model/t5-base-qg/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/model/t5-base-qg/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/model/t5-base-qg/checkpoint-1000
Configuration saved in /content/drive/MyDrive/model/t5-base-qg/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/model/t5-base-qg/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/model/t5-base-qg/checkpoint-1500
Configuration saved in /content/drive/MyDrive/model/t5-base-qg/checkpoint-1500/config.json
Model weights saved in /content/drive/MyDrive/model/t5-base-qg/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/model/t5-base-qg/checkpoint-2000
Configuration saved in /content/drive/MyDrive/model/t5-base-qg/checkpoint-2000/config.json
Model weights sa

07/10/2021 10:38:31 - INFO - __main__ -   ***** Eval results *****
07/10/2021 10:38:31 - INFO - __main__ -     epoch = 10.0
07/10/2021 10:38:31 - INFO - __main__ -     eval_loss = 1.589441180229187
07/10/2021 10:38:31 - INFO - __main__ -     eval_runtime = 32.0757
07/10/2021 10:38:31 - INFO - __main__ -     eval_samples_per_second = 64.441
07/10/2021 10:38:31 - INFO - __main__ -     eval_steps_per_second = 2.026


In [None]:
print(results)

{'eval_loss': 1.8666026592254639, 'eval_runtime': 8.732, 'eval_samples_per_second': 236.715, 'epoch': 10.0, 'eval_mem_cpu_alloc_delta': 226281, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 205585, 'eval_mem_gpu_peaked_delta': 1074092032}
