In [None]:
# Mount Google Drive when the notebook runs on Colab. The import is guarded so
# the cell is a no-op on a local kernel, where the `google.colab` package is
# not available.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: there is nothing to mount

if drive is not None:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install nlp
!pip install sentencepiece



In [1]:
import dataclasses
import json
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import numpy as np
import torch

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    T5Tokenizer,
    BartTokenizer,
    HfArgumentParser,
    DataCollator,
    TrainingArguments,
    set_seed,
    Trainer
)


from data_collator import T2TDataCollator
from utils import freeze_embeds, assert_not_all_frozen

In [6]:
# Root of the mounted Google Drive; only referenced by the Drive-based
# alternatives listed in the comment below.
GDRIVE_PATH = '/content/drive/MyDrive'

# Active layout: paths relative to the working directory.
# Drive-based alternatives:
#   TOKENIZER_PATH = os.path.join(GDRIVE_PATH, 'tokenizer/question-generator/t5_qg_tokenizer')
#   DATASET_PATH = os.path.join(GDRIVE_PATH, 'dataset', 'question-generator')
DATASET_PATH = 'data'
TOKENIZER_PATH = os.path.join('model', 't5_qg_tokenizer')

# Tokenizer class to use for each supported `model_type` value.
MODEL_TYPE_TO_TOKENIZER = dict(t5=T5Tokenizer, bart=BartTokenizer)

In [7]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    `model_name_or_path` and `model_type` have no default and must be supplied;
    every other field is optional.
    """

    # Required: which pretrained model to start from.
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    # Required: model family, used as a lookup key for the tokenizer class.
    model_type: str = field(metadata={"help": "One of 't5', 'bart'"})
    tokenizer_name_or_path: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )
    # The default is a float (0.0) so the value's type matches the annotation;
    # it compares equal to the previous int default of 0.
    label_smoothing: Optional[float] = field(
        default=0.0,
        metadata={"help": "label smoothing rate, set to > 0 if you want to enable label smoothing"}
    )
    freeze_embeds: bool = field(
        default=False,
        metadata={"help": "Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."}
    )

In [8]:
@dataclass
class DataTrainingArguments:
    """
    Describes the data handed to the model for training and evaluation.

    The two dataset file paths have no default and must be supplied; the
    remaining fields fall back to the defaults declared below.
    """
    # Required dataset files.
    train_file_path: str = field(metadata=dict(help="Path for cached train dataset"))
    valid_file_path: str = field(metadata=dict(help="Path for cached valid dataset"))

    # Optional data location and task selection.
    data_dir: Optional[str] = field(metadata=dict(help="Path for data files"), default=None)
    task: Optional[str] = field(
        metadata=dict(
            help="Which task 'qa', 'qg', 'e2e_qg', 'ans_ext', 'multi'. 'multi' means 'qa', 'qg', 'ans_ext' tasks"
        ),
        default=None,
    )
    qg_format: Optional[str] = field(
        metadata=dict(
            help="How to format inputs for que generation, 'highlight_qg_format' or 'prepend_qg_format'"
        ),
        default='prepend_qg_format',
    )

    # Length limits for source and target text.
    max_source_length: Optional[int] = field(
        metadata=dict(help="Max input length for the source text"), default=512
    )
    max_target_length: Optional[int] = field(
        metadata=dict(help="Max input length for the target text"), default=32
    )

In [9]:
# Hyper-parameters gathered in one mapping, keyed by argument name.
# NOTE(review): none of the cells visible in this notebook read `args_dict`;
# the argument objects built further down use explicit keyword arguments.
args_dict = dict(
    # model / tokenizer
    model_name_or_path="t5-small",
    model_type="t5",
    tokenizer_name_or_path="t5_qg_tokenizer",
    # output and data locations
    output_dir="t5-small-qg-hl",
    train_file_path="data/train_data_qg_hl_t5.pt",
    valid_file_path="data/valid_data_qg_hl_t5.pt",
    # optimisation
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=10,
    seed=42,
    # run phases and reporting
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    logging_steps=100,
)


In [10]:
# Start from the "t5-small" checkpoint; the tokenizer is loaded from
# TOKENIZER_PATH instead of from the model identifier.
model_args = ModelArguments(
    tokenizer_name_or_path=TOKENIZER_PATH,
    model_type="t5",
    model_name_or_path="t5-small",
)

In [14]:
# Argument reference:
# https://huggingface.co/transformers/_modules/transformers/training_args.html
# NOTE(review): per_device_train_batch_size is not passed here; the argument
# dump logged by this notebook's recorded run shows it as 8.
training_args = TrainingArguments(
    output_dir='model/t5-small-qg-hl',
    # run phases
    do_train=True,
    do_eval=True,
    # optimisation
    num_train_epochs=10,
    learning_rate=1e-4,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=32,
    label_smoothing_factor=model_args.label_smoothing,
    # reproducibility and reporting
    seed=42,
    logging_steps=100,
    prediction_loss_only=True,
)

In [15]:
# Point the run at the end-to-end question-generation ("e2e_qg") train and
# validation files stored under DATASET_PATH.
data_args = DataTrainingArguments(
    task="e2e_qg",
    train_file_path=os.path.join(DATASET_PATH, 'train_data_e2e_qg_t5.pt'),
    valid_file_path=os.path.join(DATASET_PATH, 'valid_data_e2e_qg_t5.pt'),
)

In [16]:
 # Refuse to train into an output directory that already has content, unless
 # overwriting was explicitly enabled on the training arguments.
 if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
     if training_args.do_train and not training_args.overwrite_output_dir:
         raise ValueError(
             f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
         )

In [17]:
# Setup logging
# Ranks -1 and 0 log at INFO; every other rank is limited to warnings.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.WARN if training_args.local_rank not in [-1, 0] else logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

# One-line summary of the execution environment, emitted at WARNING level so
# it is shown regardless of which of the two levels above was selected.
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    training_args.local_rank != -1,
    training_args.fp16,
)

# Full dump of the training arguments for the run log.
logger.info("Training/evaluation parameters %s", training_args)

06/28/2021 06:04:52 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir=model/t5-small-qg-hl, overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=False, evaluation_strategy=IntervalStrategy.NO, prediction_loss_only=True, per_device_train_batch_size=8, per_device_eval_batch_size=32, gradient_accumulation_steps=8, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_ratio=0.0, warmup_steps=0, logging_dir=runs\Jun28_06-04-24_Colosso-AI, logging_strategy=IntervalStrategy.STEPS, logging_first_step=False, logging_steps=100, save_strategy=IntervalStrategy.STEPS, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, fp16_full_eval=False, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_d

In [18]:
# Apply the seed configured on the training arguments (42 above) through
# transformers' set_seed helper before any model or data is created.
set_seed(seed=training_args.seed)

In [19]:
# Pick the tokenizer class registered for the configured model type.
tokenizer_cls = MODEL_TYPE_TO_TOKENIZER[model_args.model_type]

# Load the tokenizer from its own path when one is set, otherwise from the
# model path.
tokenizer = tokenizer_cls.from_pretrained(
    model_args.tokenizer_name_or_path or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

# Load the pretrained sequence-to-sequence model.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_args.model_name_or_path, cache_dir=model_args.cache_dir
)

06/28/2021 06:05:15 - INFO - filelock -   Lock 1861264265520 acquired on C:\Users\gdutr/.cache\huggingface\transformers\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985.lock
Downloading: 100%|██████████| 1.20k/1.20k [00:00<00:00, 1.20MB/s]
06/28/2021 06:05:15 - INFO - filelock -   Lock 1861264265520 released on C:\Users\gdutr/.cache\huggingface\transformers\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985.lock
06/28/2021 06:05:16 - INFO - filelock -   Lock 1861264265520 acquired on C:\Users\gdutr/.cache\huggingface\transformers\fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885.lock
Downloading: 100%|██████████| 242M/242M [00:21<00:00, 11.1MB/s]
06/28/2021 06:05:38 - INFO - filelock -   Lock 1861264265520 released on C:\Users\gdutr/.cache\huggingfac

In [20]:
# Resize the model's token embeddings to the tokenizer's length. Kept as the
# cell's last expression so the resulting embedding module is displayed
# (the recorded output shows Embedding(32102, 512)).
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

Embedding(32102, 512)

In [21]:
# Optionally freeze the embeddings, then check that the model still has
# something left to train.
if model_args.freeze_embeds:
    logger.info("freezing embeddings of the model")
    freeze_embeds(model)
    assert_not_all_frozen(model)

# Get datasets
# NOTE(review): torch.load unpickles the file contents; only point these paths
# at dataset files you trust.
logger.info('loading dataset')

if training_args.do_train:
    train_dataset = torch.load(data_args.train_file_path)
else:
    train_dataset = None

if training_args.do_eval:
    valid_dataset = torch.load(data_args.valid_file_path)
else:
    valid_dataset = None

logger.info('finished loading dataset')

06/28/2021 06:09:54 - INFO - __main__ -   loading dataset
06/28/2021 06:09:54 - INFO - nlp.utils.file_utils -   PyTorch version 1.8.1 available.
06/28/2021 06:09:54 - INFO - nlp.utils.file_utils -   TensorFlow version 2.4.0 available.
06/28/2021 06:09:55 - INFO - __main__ -   finished loading dataset


In [22]:
# Initialize data_collator
# TPU mode is switched on only when a TPU core count is configured.
data_collator = T2TDataCollator(
    mode="training",
    model_type=model_args.model_type,
    tokenizer=tokenizer,
    using_tpu=training_args.tpu_num_cores is not None,
)

# Initialize our Trainer
# Wires the model, arguments, both dataset splits and the collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [23]:
# Training
if training_args.do_train:
    # Resume from model_name_or_path only when it is a local directory;
    # otherwise (e.g. a hub identifier) pass None and train from the weights
    # already loaded. `resume_from_checkpoint` replaces the deprecated
    # `model_path` keyword.
    trainer.train(
        resume_from_checkpoint=(
            model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
    )
    trainer.save_model()
    # For convenience, we also re-save the tokenizer to the same directory,
    # so that you can share your model easily on huggingface.co/models =)
    if trainer.is_world_process_zero():
        tokenizer.save_pretrained(training_args.output_dir)

# Evaluation
# `results` stays an empty dict when evaluation is disabled or this is not
# rank -1/0.
results = {}
if training_args.do_eval and training_args.local_rank in [-1, 0]:
    logger.info("*** Evaluate ***")

    eval_output = trainer.evaluate()

    # Make sure the directory exists before writing into it (no-op when it
    # already does), so an evaluation-only run cannot fail on a missing path.
    os.makedirs(training_args.output_dir, exist_ok=True)
    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        # Sorted keys keep the log and the results file in a stable order.
        for key in sorted(eval_output.keys()):
            logger.info("  %s = %s", key, str(eval_output[key]))
            writer.write("%s = %s\n" % (key, str(eval_output[key])))

    results.update(eval_output)


  3%|▎         | 100/2950 [01:18<35:21,  1.34it/s]{'loss': 3.7416, 'learning_rate': 9.661016949152543e-05, 'epoch': 0.34}
  7%|▋         | 200/2950 [02:37<37:19,  1.23it/s]{'loss': 2.6594, 'learning_rate': 9.322033898305085e-05, 'epoch': 0.68}
 10%|█         | 300/2950 [03:56<33:42,  1.31it/s]{'loss': 2.5182, 'learning_rate': 8.983050847457629e-05, 'epoch': 1.02}
 14%|█▎        | 400/2950 [05:15<32:23,  1.31it/s]{'loss': 2.4076, 'learning_rate': 8.644067796610171e-05, 'epoch': 1.36}
 17%|█▋        | 500/2950 [06:34<32:13,  1.27it/s]{'loss': 2.3598, 'learning_rate': 8.305084745762712e-05, 'epoch': 1.69}
 20%|██        | 600/2950 [07:53<30:45,  1.27it/s]{'loss': 2.3152, 'learning_rate': 7.966101694915254e-05, 'epoch': 2.03}
 24%|██▎       | 700/2950 [09:13<30:29,  1.23it/s]{'loss': 2.2555, 'learning_rate': 7.627118644067796e-05, 'epoch': 2.37}
 27%|██▋       | 800/2950 [10:31<29:48,  1.20it/s]{'loss': 2.2309, 'learning_rate': 7.288135593220338e-05, 'epoch': 2.71}
 31%|███       | 900/295

In [24]:
# Show the collected evaluation metrics as the cell's result, so the notebook
# renders the dict with its rich display instead of a single printed line.
results

{'eval_loss': 1.8666026592254639, 'eval_runtime': 8.732, 'eval_samples_per_second': 236.715, 'epoch': 10.0, 'eval_mem_cpu_alloc_delta': 226281, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 205585, 'eval_mem_gpu_peaked_delta': 1074092032}
