In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
# All Imports

import argparse
import logging
import os
import io
import sys
import math
import random

# iteration
from tqdm.auto import tqdm

# remote logging
# comet_ml stands before because of ImportError: import Comet before modules: torch
import comet_ml 
import wandb

# objects properties extraction
from dataclasses import asdict

# 🤗 Datasets 
import datasets
from datasets import load_dataset, load_metric

# dataclasses and types
from dataclasses import field, dataclass
from typing import Dict, List, Union, Optional

# dataset managements
# from torch.utils.data.dataloader import DataLoader
from torch.utils.data import Dataset, DataLoader

# data managements
import json # load/write data
import torch 
import numpy as np
import pandas as pd

# accelerator for speed up experiments/runs
# from accelerate import Accelerator

# 🤗 Tranformers
import transformers
from transformers import (
    Trainer, # For using Trainer instead of our custom loop
    AdamW,
    AutoTokenizer, 
    AutoConfig,
    HfArgumentParser,
    TrainingArguments as HfTrainingArguments,
    SchedulerType,
    get_scheduler,
    set_seed,
)

from transformers.trainer_callback import (
    DefaultFlowCallback,
    PrinterCallback, 
    ProgressCallback
)

from transformers.integrations import (
    TensorBoardCallback,
    WandbCallback,
    CometCallback,
    MLflowCallback,
    AzureMLCallback
)

_callback_factory: Dict = {
    "DefaultFlowCallback": DefaultFlowCallback,
    "PrinterCallback": PrinterCallback, 
    "ProgressCallback": ProgressCallback,
    "TensorBoardCallback": TensorBoardCallback,
    "WandbCallback": WandbCallback,
    "CometCallback": CometCallback,
    "MLflowCallback": MLflowCallback,
    "AzureMLCallback": AzureMLCallback
}

# tokenization utils
from thesis.utils.tokenization.helper import _speed_tokenization

# checkpoint utils
from transformers.trainer_utils import get_last_checkpoint
    
logger = logging.getLogger(__name__)

In [4]:
# Imports for mlm (mlm_task)

"""
Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...)
on a text file or a dataset without using HuggingFace Trainer.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=masked-lm
"""

# 🤗 Tranformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoModel, 
    AutoModelForMaskedLM,
    PreTrainedTokenizer, 
    DataCollatorForLanguageModeling, 
    BertForMaskedLM
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version

# parsing 
from thesis.utils.parsers.args_parser import parse_args
from thesis.utils.general import load_dataset_wrapper

# logging
from thesis.utils.config.execution import LogConfig

MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

DICTIONARY_FIELD_NAMES = dict(
    train         = ['train'],
    test          = ['test', 'debug', 'dev'],
    validation    = ['validation', 'valid']
)

# Imports for glue (seq_class)

""" Finetuning a 🤗 Transformers model for sequence classification on GLUE."""

# 🤗 Tranformers
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    PretrainedConfig,
    default_data_collator,
)

from lensNLP import (
    mlm_task,
    seq_class
)

In [5]:
# Logging on Comet.ml
comet_ml.init()

COMET INFO: Comet API key is valid


In [6]:
# Logging on WandB
wandb.login()

# Optional: log both gradients and parameters
%env WANDB_WATCH=all

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33memanuelevivoli[0m (use `wandb login --relogin` to force relogin)


env: WANDB_WATCH=all


In [7]:
def main(args_list):

    # ------------------
    # Parsing arguments
    # ------------------
    
    # If we pass only one argument to the script and it's the path to a json file, it parses it to get our arguments.
    # If we pass a list of pairs '--option', 'argument', it uses those to get arguments.
    dataset_args, training_args, model_args, run_args, log_args, embedding_args = parse_args(args_list)
    log_config = LogConfig(**asdict(log_args))
    
    # ------------------
    # Checkpoints
    # ------------------
    
    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    
    # ------------------
    # Logging definition
    # ------------------

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # ------------------
    # Setting seed
    # ------------------
    
    # If passed along, set the training seed now.
    if training_args.seed is not None:
        set_seed(training_args.seed)

    # ------------------
    # Getting the datasets
    # ------------------

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    # You can also provide the name of some other dataset ('S2orc', 'Keyphrase') that we customly support.
    #
    # For the 'S2orc' dataset we provide a caching mechanisms in order to speed-up the preprocessing.
    # The cache files changes (and are recalculated) everytime the configurations changes.
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if dataset_args.dataset_name is not None:
        # Getting the load_dataset wrapper that manages huggingface dataset and the custom ones
        custom_load_dataset = load_dataset_wrapper()
        # Loading the raw data based on input (and default) values of arguments
        raw_datasets = custom_load_dataset(dataset_args, training_args, model_args, run_args, log_args, embedding_args)
    else:
        # If the files 'train_file' and 'validation_file' are specified
        # data_files is composed by those elements.
        data_files = {}
        if dataset_args.train_file is not None:
            data_files["train"] = dataset_args.train_file
        if dataset_args.validation_file is not None:
            data_files["validation"] = dataset_args.validation_file
        extension = dataset_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        elif extension == "jsonl": # jsonl files are file with json element per row
            extension = "json"
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # The Datasets in the raw form can have different form of key names (depending on the configuration).
    # We need all datasets to contain 'train', 'test', 'validation' keys, if not we change the dictionary keys' name
    # based on the `names_tuple` and conseguently on `names_map`.
    def format_key_names(raw_datasets):
        # The creation of `names_map` happens to be here
        # For every element in the values lists, one dictionary entry is added 
        # with (k,v): k=Value of the list, v=Key such as 'train', etc.
        def names_dict_generator(names_tuple: dict):
            names_map = dict()
            for key, values in names_tuple.items():
                for value in values:
                    names_map[value] = key
            return names_map
        names_map = names_dict_generator(DICTIONARY_FIELD_NAMES)
        split_names = raw_datasets.keys()
        for split_name in split_names:
            new_split_name = names_map.get(split_name)
            if split_name != new_split_name:
                raw_datasets[new_split_name] = raw_datasets.pop(split_name)  
        return raw_datasets

    logger.info(f"Formatting DatasetDict keys")
    datasets = format_key_names(raw_datasets)

    # ------------------
    # Load tokenizer and
    # pretrained model
    # ------------------

    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if dataset_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if dataset_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({dataset_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(dataset_args.max_seq_length, tokenizer.model_max_length)

    tokenized_datasets = _speed_tokenization(
        dataset_args,
        training_args,
        tokenizer,
        max_seq_length,
        datasets,
        text_column_name,
        column_names
    ) 

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if dataset_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(dataset_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if dataset_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(dataset_args.max_eval_samples))

    # Data collator
    # This one will take care of randomly masking the tokens.
    pad_to_multiple_of_8 = dataset_args.line_by_line and training_args.fp16 and not training_args.pad_to_max_length
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=dataset_args.mlm_probability,
        pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[_callback_factory[callback] for callback in log_config.callbacks]
    )

    logging.info(f"trainer: {trainer}")

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (
            dataset_args.max_train_samples if dataset_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = dataset_args.max_eval_samples if dataset_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        perplexity = math.exp(metrics["eval_loss"])
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "fill-mask"}
        if training_args.dataset_name is not None:
            kwargs["dataset_tags"] = training_args.dataset_name
            if training_args.dataset_config_name is not None:
                kwargs["dataset_args"] = training_args.dataset_config_name
                kwargs["dataset"] = f"{training_args.dataset_name} {training_args.dataset_config_name}"
            else:
                kwargs["dataset"] = training_args.dataset_name

        trainer.push_to_hub(**kwargs)


    wandb.finish()


In [8]:
if __name__ == "__main__":
    args_list = [

        "--overwrite_output_dir"         , 'True',
        
        "--do_train"                     , 'True',
        "--do_eval"                      , 'True',
        "--do_predict"                   , 'True', 

          # DatasetArguments
        "--model_name_or_path"           , "allenai/scibert_scivocab_uncased",
        "--dataset_name"                 , "s2orc", # "keyphrase",
        "--dataset_config_name"          , "full",  # "inspec",
        
        # TrainingArguments        
            # seed for reproducibility of experiments
        "--seed"                         , '1234', 
        "--output_dir"                   , "/home/vivoli/Thesis/output",
        # "--debug"                        , "", # it is not what we think it is
        
        "--run_name"                     , "scibert-s2orc", # "scibert-keyph",
        "--num_train_epochs"             , '1',
        "--per_device_train_batch_size"  , "8", # 16 and 32 end with "RuntimeError: CUDA out of memory."
        "--per_device_eval_batch_size"   , "8", # 16 and 32 end with "RuntimeError: CUDA out of memory."
            # custom added
        "--max_seq_length"               , '512',
        
        # S2orcArguments & KeyPhArguments
        "--dataset_path"                 , "/home/vivoli/Thesis/data",
        "--data"                         , "abstract",
        "--target"                       , "title",             
        "--classes"                      , "mag_field_of_study", # "keywords",
        
        # S2orcArguments
        "--idxs"                         , '0',
        "--zipped"                       , 'True',
            # list
        "--mag_field_of_study"           , "Computer Science",    
        #     # list
        # "--data"                         , "abstract",
        #     # list
        # "--target"                       , "title",             
        #     # list
        # "--classes"                      , "mag_field_of_study",
        "--keep_none_papers"             , 'False',
        "--keep_unused_columns"          , 'False',
        
        # RunArguments
        # "--run_name"                     , "scibert-s2orc",
        "--run_number"                   , '0',
        "--run_iteration"                , '0',
        
        # LoggingArguments
        "--verbose"                      , 'True',
        "--debug_log"                    , 'True',
        "--time"                         , 'False',
        "--callbacks"                     , "WandbCallback,CometCallback,TensorBoardCallback",
        
        # EmbeddingArguments
        # "--max_seq_length"               , '512',
        # "--pooling"                      , 'none',
        # "--batch_size"                   , '32'
    ]
    
    main(args_list)

05/15/2021 08:09:40 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/home/vivoli/Thesis/output', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, evaluation_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, logging_dir='./logs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=500, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=None, no_cuda=False, seed=1234, fp16=False, fp16_opt_level='O1', fp16_backend='auto', fp16_full_eval=F

[INFO|configuration_utils.py:517] 2021-05-15 08:09:52,784 >> loading configuration file https://huggingface.co/allenai/scibert_scivocab_uncased/resolve/main/config.json from cache at /home/vivoli/.cache/huggingface/transformers/858852fd2471ce39075378592ddc87f5a6551e64c6825d1b92c8dab9318e0fc3.03ff9e9f998b9a9d40647a2148a202e3fb3d568dc0f170dda9dda194bab4d5dd
[INFO|configuration_utils.py:553] 2021-05-15 08:09:52,784 >> Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

[INFO|configuration_utils.py:517] 2021-

HBox(children=(FloatProgress(value=0.0, max=72.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=72.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




:DefaultFlowCallback
CometCallback
TensorBoardCallback
WandbCallback
:DefaultFlowCallback
CometCallback
TensorBoardCallback
WandbCallback
WandbCallback
:DefaultFlowCallback
CometCallback
TensorBoardCallback
WandbCallback
WandbCallback
CometCallback
[INFO|trainer.py:516] 2021-05-15 08:11:26,401 >> The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1156] 2021-05-15 08:11:26,408 >> ***** Running training *****
[INFO|trainer.py:1157] 2021-05-15 08:11:26,408 >>   Num examples = 23155
[INFO|trainer.py:1158] 2021-05-15 08:11:26,409 >>   Num Epochs = 1
[INFO|trainer.py:1159] 2021-05-15 08:11:26,409 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1160] 2021-05-15 08:11:26,410 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1161] 2021-05-15 08:11:26,410 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1162] 2021-05-15 

[INFO|integrations.py:675] 2021-05-15 08:11:33,158 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


ValueError: You can only call `wandb.watch` once per model.  Pass a new instance of the model if you need to call wandb.watch again in your code.