In [None]:
# Attach Google Drive to the Colab VM; paths used later in this notebook
# live underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install nlp
!pip install sentencepiece

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b5/d5/c6c23ad75491467a9a84e526ef2364e523d45e2b0fae28a7cbe8689e7e84/transformers-4.8.1-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 14.1MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 42.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████

In [None]:
import os
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import torch
import nlp
from transformers import T5Tokenizer, BartTokenizer, HfArgumentParser

In [None]:
# Root logging configuration: timestamped INFO-level records.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

# Logger for the messages emitted by this notebook itself.
logger = logging.getLogger(__name__)


In [None]:
def filter_qa(example):
    """Return True when the example's 'task' field is 'qa'."""
    task_name = example['task']
    return task_name == 'qa'

def filter_qg(example):
    """Return True when the example's 'task' field is 'qg'."""
    task_name = example['task']
    return task_name == 'qg'

def filter_e2e_qg(example):
    """Return True when the example's 'task' field is 'e2e_qg'."""
    task_name = example['task']
    return task_name == 'e2e_qg'

def filter_ans_ext(example):
    """Return True when the example's 'task' field is 'ans_ext'."""
    task_name = example['task']
    return task_name == 'ans_ext'

def filter_multi(example):
    """Return True for every example whose 'task' field is not 'e2e_qg'."""
    excluded_task = 'e2e_qg'
    return example['task'] != excluded_task


# Lookup from a task name to the predicate that selects that task's examples.
TASK_TO_FILTER_FN = dict(
    qa=filter_qa,
    qg=filter_qg,
    e2e_qg=filter_e2e_qg,
    ans_ext=filter_ans_ext,
    multi=filter_multi,
)

# Base directory for everything this notebook reads from or writes to.
GDRIVE_PATH = '/content/drive/MyDrive'

# Directory under GDRIVE_PATH that is joined with output file names further down.
DATASET_PATH = os.path.join(GDRIVE_PATH, 'dataset', 'question-generator')
#DATASET_CACHE_DIR = os.path.join(GDRIVE_PATH, 'dataset/question-generator')

# Location handed to the dataset loader as the dataset path.
DATASET_LOADER_CLASSPATH = os.path.join(
    GDRIVE_PATH,
    'python/question-generator/squad_multitask/',
)

In [None]:
@dataclass
class DataTrainingArguments:
    """
    Options describing which task's examples to prepare and how to size/format them.
    """

    # Required options (no default value).
    task: str = field(metadata={"help": "Which task 'qa', 'qg', 'e2e_qg', 'ans_ext', 'multi'. 'multi' means 'qa', 'qg', 'ans_ext' tasks"})
    model_type: str = field(metadata={"help": "One of 't5', 'bart'"})

    # Dataset location and the file names used for the saved splits.
    dataset_path: Optional[str] = field(
        metadata={"help": "Path for dataset directory"},
        default="data/squad_multitask",
    )
    train_file_name: Optional[str] = field(
        metadata={"help": "name for cached train dataset"},
        default=None,
    )
    valid_file_name: Optional[str] = field(
        metadata={"help": "name for cached valid dataset"},
        default=None,
    )

    # Task / formatting switches.
    valid_for_qg_only: bool = field(
        metadata={"help": "For multitask dataset valid split should contain only qg task or all tasks."},
        default=False,
    )
    qg_format: Optional[str] = field(
        metadata={"help": "How to format inputs for que generation, 'highlight_qg_format' or 'prepend_qg_format'"},
        default='highlight_qg_format',
    )

    # Sequence length limits.
    max_source_length: Optional[int] = field(
        metadata={"help": "Max input length for the source text"},
        default=512,
    )
    max_target_length: Optional[int] = field(
        metadata={"help": "Max input length for the target text"},
        default=32,
    )


In [None]:
class DataProcessor:
    """Prepare text examples for a seq2seq model: fill in marker tokens and tokenize.

    Examples are expected to carry 'source_text' and 'target_text' string fields.

    Args:
        tokenizer: object exposing ``batch_encode_plus``; its result must provide
            'input_ids' and 'attention_mask'.
        model_type: "t5" and "bart" use "<sep>" as separator; any other value
            falls back to "[SEP]".
        max_source_length: fixed length every source sequence is padded/truncated to.
        max_target_length: fixed length every target sequence is padded/truncated to.
    """

    def __init__(self, tokenizer, model_type="t5", max_source_length=512, max_target_length=32):
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.model_type = model_type
        self.hl_token = "<hl>"

        # t5 and bart share the same separator; anything else gets "[SEP]".
        self.sep_token = "<sep>" if model_type in ("t5", "bart") else "[SEP]"

    def process(self, dataset):
        """Run the preparation steps on ``dataset`` via its ``map`` method.

        For model_type "t5" an end-of-sequence marker is appended first; then the
        placeholder tokens are substituted and the texts are tokenized in batches.

        Returns:
            The dataset returned by the final ``map`` call.
        """
        if self.model_type == "t5":
            dataset = dataset.map(self._add_eos_examples)

        dataset = dataset.map(self._add_special_tokens)
        dataset = dataset.map(self._convert_to_features, batched=True)

        return dataset

    def _add_eos_examples(self, example):
        """Append " </s>" to the example's source and target text."""
        # NOTE(review): the recorded run of this notebook logged a tokenizer warning
        # that the sequence already has the EOS token; confirm whether the installed
        # tokenizer appends it on its own before relying on this step.
        example['source_text'] = example['source_text'] + " </s>"
        example['target_text'] = example['target_text'] + " </s>"
        return example

    def _add_special_tokens(self, example):
        """Substitute the "{hl_token}" / "{sep_token}" placeholders with the real tokens."""
        example['source_text'] = example['source_text'].replace("{hl_token}", self.hl_token)
        example['target_text'] = example['target_text'].replace("{sep_token}", self.sep_token)
        return example

    def _encode(self, texts, max_length):
        """Tokenize ``texts``, padding and truncating each one to ``max_length``."""
        # `padding='max_length'` alone selects fixed-length padding; the deprecated
        # `pad_to_max_length=True` flag that used to be passed as well was redundant.
        return self.tokenizer.batch_encode_plus(
            texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )

    # tokenize the examples
    def _convert_to_features(self, example_batch):
        """Tokenize a batch of examples.

        Args:
            example_batch: mapping with 'source_text' and 'target_text' entries,
                each a collection of strings.

        Returns:
            dict with 'source_ids', 'target_ids' and 'attention_mask'; the mask is
            the one produced for the source texts.
        """
        source_encoding = self._encode(example_batch['source_text'], self.max_source_length)
        target_encoding = self._encode(example_batch['target_text'], self.max_target_length)

        return {
            'source_ids': source_encoding['input_ids'],
            'target_ids': target_encoding['input_ids'],
            'attention_mask': source_encoding['attention_mask'],
        }


In [None]:
# Settings for this run: end-to-end question generation ('e2e_qg') with a T5 model.
data_args = DataTrainingArguments(
    task='e2e_qg',
    model_type='t5',
    dataset_path=DATASET_LOADER_CLASSPATH,
    qg_format='highlight_qg_format',
    train_file_name='train_data_e2e_qg_t5.pt',
    valid_file_name='valid_data_e2e_qg_t5.pt',
    valid_for_qg_only=True,
    max_source_length=512,
    max_target_length=32,
)

In [None]:
# Choose the tokenizer class and pretrained checkpoint matching the model type;
# only the selected branch of the conditional expression is evaluated.
tokenizer_cls, checkpoint_name = (
    (T5Tokenizer, "t5-base")
    if data_args.model_type == 't5'
    else (BartTokenizer, "facebook/bart-base")
)
tokenizer = tokenizer_cls.from_pretrained(checkpoint_name)

# Register the separator and highlight markers; the call's return value is the
# cell output.
tokenizer.add_tokens(['<sep>', '<hl>'])

06/25/2021 08:09:43 - INFO - filelock -   Lock 140315681125840 acquired on /root/.cache/huggingface/transformers/684a47ca6257e4ca71f0037771464c5b323e945fbc58697d2fad8a7dd1a2f8ba.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…

06/25/2021 08:09:44 - INFO - filelock -   Lock 140315681125840 released on /root/.cache/huggingface/transformers/684a47ca6257e4ca71f0037771464c5b323e945fbc58697d2fad8a7dd1a2f8ba.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d.lock





06/25/2021 08:09:45 - INFO - filelock -   Lock 140313000610192 acquired on /root/.cache/huggingface/transformers/90de37880b5ff5ac7ab70ff0bd369f207e9b74133fa153c163d14c5bb0116207.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…

06/25/2021 08:09:46 - INFO - filelock -   Lock 140313000610192 released on /root/.cache/huggingface/transformers/90de37880b5ff5ac7ab70ff0bd369f207e9b74133fa153c163d14c5bb0116207.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529.lock





2

In [None]:
# Load the train and validation splits with identical loader settings.
train_dataset, valid_dataset = (
    nlp.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=split_name)
    for split_name in (nlp.Split.TRAIN, nlp.Split.VALIDATION)
)

06/25/2021 08:09:48 - INFO - nlp.load -   Checking /content/drive/MyDrive/python/question-generator/squad_multitask/squad_multitask.py for additional imports.
06/25/2021 08:09:49 - INFO - filelock -   Lock 140312746422992 acquired on /content/drive/MyDrive/python/question-generator/squad_multitask/squad_multitask.py.lock
06/25/2021 08:09:49 - INFO - nlp.load -   Creating main folder for dataset /content/drive/MyDrive/python/question-generator/squad_multitask/squad_multitask.py at /usr/local/lib/python3.7/dist-packages/nlp/datasets/squad_multitask
06/25/2021 08:09:49 - INFO - nlp.load -   Creating specific version folder for dataset /content/drive/MyDrive/python/question-generator/squad_multitask/squad_multitask.py at /usr/local/lib/python3.7/dist-packages/nlp/datasets/squad_multitask/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd
06/25/2021 08:09:49 - INFO - nlp.load -   Copying script file from /content/drive/MyDrive/python/question-generator/squad_multitask/squad_mu

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


06/25/2021 08:09:50 - INFO - nlp.builder -   Generating dataset squad_multitask (/root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd)
06/25/2021 08:09:50 - INFO - nlp.builder -   Dataset not on Hf google storage. Downloading and preparing it from source


Downloading and preparing dataset squad_multitask/highlight_qg_format (download: Unknown size, generated: Unknown size, post-processed: Unknown sizetotal: Unknown size) to /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd...


06/25/2021 08:09:51 - INFO - filelock -   Lock 140312745694992 acquired on /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.9695451692117d531c0343c7e7234cdd3c713e288e1db8c91402f7f95478bae4.lock
06/25/2021 08:09:51 - INFO - nlp.utils.file_utils -   https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmp8g7fkabb


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=8116577.0, style=ProgressStyle(descript…

06/25/2021 08:09:51 - INFO - nlp.utils.file_utils -   storing https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json in cache at /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.9695451692117d531c0343c7e7234cdd3c713e288e1db8c91402f7f95478bae4
06/25/2021 08:09:51 - INFO - nlp.utils.file_utils -   creating metadata file for /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.9695451692117d531c0343c7e7234cdd3c713e288e1db8c91402f7f95478bae4
06/25/2021 08:09:51 - INFO - filelock -   Lock 140312745694992 released on /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.9695451692117d531c0343c7e7234cdd3c713e288e1db8c91402f7f95478bae4.lock
06/25/2021 08:09:51 - INFO - filelock -   Lock 140316306449296 acquired on /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d25986




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1054280.0, style=ProgressStyle(descript…

06/25/2021 08:09:52 - INFO - nlp.utils.file_utils -   storing https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json in cache at /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.6bb6d6588eb34fa38d0c0d111217bede75b4e8486d88939efe032a4e565949be
06/25/2021 08:09:52 - INFO - nlp.utils.file_utils -   creating metadata file for /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.6bb6d6588eb34fa38d0c0d111217bede75b4e8486d88939efe032a4e565949be
06/25/2021 08:09:52 - INFO - filelock -   Lock 140316306449296 released on /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.6bb6d6588eb34fa38d0c0d111217bede75b4e8486d88939efe032a4e565949be.lock
06/25/2021 08:09:52 - INFO - nlp.utils.info_utils -   Unable to verify checksums.
06/25/2021 08:09:52 - INFO - nlp.builder -   Generating split train





HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

06/25/2021 08:09:52 - INFO - root -   generating examples from = /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.9695451692117d531c0343c7e7234cdd3c713e288e1db8c91402f7f95478bae4
06/25/2021 08:10:05 - INFO - nlp.arrow_writer -   Done writing 253276 examples in 226286197 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd.incomplete/squad_multitask-train.arrow.
06/25/2021 08:10:05 - INFO - nlp.builder -   Generating split validation




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

06/25/2021 08:10:05 - INFO - root -   generating examples from = /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.6bb6d6588eb34fa38d0c0d111217bede75b4e8486d88939efe032a4e565949be
06/25/2021 08:10:07 - INFO - nlp.arrow_writer -   Done writing 30020 examples in 27698388 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd.incomplete/squad_multitask-validation.arrow.
06/25/2021 08:10:07 - INFO - nlp.utils.info_utils -   Unable to verify splits sizes.
06/25/2021 08:10:07 - INFO - nlp.builder -   Constructing Dataset for split train, from /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd
06/25/2021 08:10:07 - INFO - nlp.utils.info_utils -   Unable to verify checksums.
06/25/2021 08:10:07 - INFO - nlp.load -   Checking /content/drive/MyDrive/python/qu

Dataset squad_multitask downloaded and prepared to /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd. Subsequent calls will reuse this data.


In [None]:
# Keep only the examples selected by the predicate registered for the configured task.
task_filter = TASK_TO_FILTER_FN[data_args.task]
train_dataset = train_dataset.filter(task_filter)
valid_dataset = valid_dataset.filter(task_filter)

06/25/2021 08:10:07 - INFO - nlp.arrow_dataset -   Caching processed dataset at /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/cache-92d152c091bbb8dad9ee220e5d2582d0.arrow


HBox(children=(FloatProgress(value=0.0, max=254.0), HTML(value='')))

06/25/2021 08:10:09 - INFO - nlp.arrow_writer -   Done writing 18896 examples in 20963347 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/tmpadk8dip1.
06/25/2021 08:10:09 - INFO - nlp.arrow_dataset -   Caching processed dataset at /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/cache-b5a6940af96fa5e525315bd9f9324976.arrow





HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

06/25/2021 08:10:09 - INFO - nlp.arrow_writer -   Done writing 2067 examples in 2451567 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/tmp2cg7gk20.





In [None]:
# Build the processor from the tokenizer and the length limits in data_args.
processor = DataProcessor(
    tokenizer,
    max_target_length=data_args.max_target_length,
    max_source_length=data_args.max_source_length,
    model_type=data_args.model_type,
)

In [None]:
# Run the processor over both splits, train first and then validation.
train_dataset, valid_dataset = (
    processor.process(split_dataset)
    for split_dataset in (train_dataset, valid_dataset)
)

06/25/2021 08:10:09 - INFO - nlp.arrow_dataset -   Caching processed dataset at /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/cache-72a82aeb129a89e197163e1e86cbb222.arrow


HBox(children=(FloatProgress(value=0.0, max=18896.0), HTML(value='')))

06/25/2021 08:10:11 - INFO - nlp.arrow_writer -   Done writing 18896 examples in 21149487 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/tmpgwjinyws.
06/25/2021 08:10:11 - INFO - nlp.arrow_dataset -   Caching processed dataset at /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/cache-7a7a27956dd4d7acc49e060de8a64f3c.arrow





HBox(children=(FloatProgress(value=0.0, max=18896.0), HTML(value='')))

06/25/2021 08:10:11 - INFO - nlp.arrow_writer -   Done writing 18896 examples in 20623893 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/tmpy8sxy6lo.
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
06/25/2021 08:10:11 - INFO - nlp.arrow_dataset -   Caching processed dataset at /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/cache-8b323f77c1b4464923f8d7531ec10784.arrow





HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

06/25/2021 08:11:07 - INFO - nlp.arrow_writer -   Done writing 18896 examples in 180484281 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/tmpjqiknihh.
06/25/2021 08:11:07 - INFO - nlp.arrow_dataset -   Caching processed dataset at /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/cache-68efd14c8ccb132d4bdba4ff700971c9.arrow





HBox(children=(FloatProgress(value=0.0, max=2067.0), HTML(value='')))




06/25/2021 08:11:07 - INFO - nlp.arrow_writer -   Done writing 2067 examples in 2471901 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/tmp7umwggm_.
06/25/2021 08:11:07 - INFO - nlp.arrow_dataset -   Caching processed dataset at /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/cache-16f0d0e2b0c1df346740521d2a2cbd53.arrow


HBox(children=(FloatProgress(value=0.0, max=2067.0), HTML(value='')))

06/25/2021 08:11:08 - INFO - nlp.arrow_writer -   Done writing 2067 examples in 2408481 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/tmpv_ccs0ou.
06/25/2021 08:11:08 - INFO - nlp.arrow_dataset -   Caching processed dataset at /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/cache-827e3cc8449f75266b2e3ad108608db0.arrow





HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

06/25/2021 08:11:14 - INFO - nlp.arrow_writer -   Done writing 2067 examples in 19895337 bytes /root/.cache/huggingface/datasets/squad_multitask/highlight_qg_format/1.0.0/79eda69e803ef0edf75970022ebdffc3b92a11d258088c947b94a6d01b2cddfd/tmpn4q0gcty.





In [None]:
# Spot-check: show the source text of the first processed validation example.
first_valid_example = valid_dataset[0]
first_valid_example['source_text']

'generate questions: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50. </s>'

In [None]:
# Return only these columns, as torch tensors, when the datasets are indexed.
columns = ["source_ids", "target_ids", "attention_mask"]
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

# Resolve each output file name on its own: previously only train_file_name was
# checked, so a missing valid_file_name next to an explicit train_file_name
# reached os.path.join(DATASET_PATH, None) and raised TypeError.
train_file_name = data_args.train_file_name
if train_file_name is None:
    train_file_name = f"train_data_{data_args.task}_{data_args.qg_format}_{data_args.model_type}.pt"

valid_file_name = data_args.valid_file_name
if valid_file_name is None:
    valid_file_name = f"valid_data_{data_args.task}_{data_args.qg_format}_{data_args.model_type}.pt"

train_path = os.path.join(DATASET_PATH, train_file_name)
valid_path = os.path.join(DATASET_PATH, valid_file_name)

# Make sure the output directory exists before writing into it.
os.makedirs(DATASET_PATH, exist_ok=True)

torch.save(train_dataset, train_path)
logger.info(f"saved train dataset at {train_path}")

torch.save(valid_dataset, valid_path)
logger.info(f"saved validation dataset at {valid_path}")

# The tokenizer directory is relative to the current working directory.
tokenizer_path = f"{data_args.model_type}_qg_tokenizer"
os.makedirs(tokenizer_path, exist_ok=True)
tokenizer.save_pretrained(tokenizer_path)
logger.info(f"saved tokenizer at {tokenizer_path}")

06/25/2021 08:11:14 - INFO - nlp.arrow_dataset -   Set __getitem__(key) output type to torch for ['source_ids', 'target_ids', 'attention_mask'] columns  (when key is int or slice) and don't output other (un-formated) columns.
06/25/2021 08:11:14 - INFO - nlp.arrow_dataset -   Set __getitem__(key) output type to torch for ['source_ids', 'target_ids', 'attention_mask'] columns  (when key is int or slice) and don't output other (un-formated) columns.
06/25/2021 08:11:16 - INFO - __main__ -   saved train dataset at /content/drive/MyDrive/dataset/question-generator/train_data_e2e_qg_t5.pt
06/25/2021 08:11:17 - INFO - __main__ -   saved validation dataset at /content/drive/MyDrive/dataset/question-generator/valid_data_e2e_qg_t5.pt
06/25/2021 08:11:17 - INFO - __main__ -   saved tokenizer at t5_qg_tokenizer
