In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so the project
# files stored there are reachable through the normal file system.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted — confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Make the project folder on the mounted Drive the working directory; later cells
# import utils_ner from here.
%cd "/content/drive/My Drive/biobert/NER_species"

/content/drive/My Drive/biobert/NER_species


In [3]:
!pip install import-ipynb
!pip install seqeval
!pip install transformers
import import_ipynb

from utils_ner import NerDataset, Split, get_labels

importing Jupyter notebook from utils_ner.ipynb


In [4]:
import logging
import os
import sys
import pdb
import subprocess

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score
from torch import nn

from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoModel,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from utils_ner import NerDataset, Split, get_labels

# Module-level logger named after the current module (``__main__`` when run as a notebook cell).
logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
    """
    Options selecting the pretrained model, configuration and tokenizer to fine-tune.

    Only ``model_name_or_path`` is required; every other field has a default.
    """

    # Required: no default is given for this field.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )
    config_name: Optional[str] = field(
        metadata=dict(help="Pretrained config name or path if not the same as model_name"),
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
        default=None,
    )
    use_fast: bool = field(
        metadata=dict(help="Set this flag to use fast tokenization."),
        default=False,
    )
    # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
    # or just modify its tokenizer_config.json.
    cache_dir: Optional[str] = field(
        metadata=dict(help="Where do you want to store the pretrained models downloaded from s3"),
        default=None,
    )


@dataclass
class DataTrainingArguments:
    """
    Options describing the data fed to the model for training and evaluation.

    Only ``data_dir`` is required; every other field has a default.
    """

    # Required: no default is given for this field.
    data_dir: str = field(
        metadata=dict(help="The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."),
    )
    labels: Optional[str] = field(
        metadata=dict(help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."),
        default=None,
    )
    max_seq_length: int = field(
        metadata=dict(
            help=(
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        ),
        default=128,
    )
    overwrite_cache: bool = field(
        metadata=dict(help="Overwrite the cached training and evaluation sets"),
        default=False,
    )



In [5]:
def main(args_json: str = "/content/drive/My Drive/biobert/NER_species/args.json") -> Dict:
    """
    Fine-tune a token-classification model for NER, then optionally evaluate and predict.

    All settings (``ModelArguments``, ``DataTrainingArguments`` and ``TrainingArguments``)
    are read from one JSON file. Depending on the ``do_train`` / ``do_eval`` /
    ``do_predict`` flags found there, the function trains and saves the model, writes
    ``eval_results.txt``, and writes ``test_results.txt`` plus ``test_predictions.txt``
    into ``training_args.output_dir``.

    Args:
        args_json: Path of the JSON file holding the arguments. The default is the
            location this notebook originally hard-coded, so a bare ``main()`` call
            behaves as before.

    Returns:
        The evaluation metrics as a dict. It is empty when ``do_eval`` is off or when
        the current process is not the world-zero process.

    Raises:
        ValueError: If training is requested while the output directory already exists,
            is not empty, and ``overwrite_output_dir`` is not set.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_json_file(json_file=args_json)

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    # id -> label name, used both for the model config and to decode predictions.
    label_map: Dict[int, str] = dict(enumerate(labels))
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Before any training, store a copy of the pretrained weights (loaded through
    # AutoModel) together with the tokenizer in the output directory.
    # NOTE(review): trainer.save_model() below writes into the same directory, so this
    # copy is presumably replaced when do_train is set — confirm this is intended.
    model_to_save = AutoModel.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
    model_to_save.save_pretrained(training_args.output_dir)
    tokenizer.save_pretrained(training_args.output_dir)

    # Get datasets
    train_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(
        predictions: np.ndarray, label_ids: np.ndarray
    ) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Decode predictions and gold ids into per-sequence lists of label names.

        ``predictions`` is arg-maxed over axis 2; positions whose gold id equals the
        loss' ``ignore_index`` are skipped in both outputs.

        Returns:
            ``(preds_list, out_label_list)``: predicted and gold label names, one inner
            list per sequence.
        """
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape

        # Looked up once per call instead of building a new loss object for every token.
        ignore_index = nn.CrossEntropyLoss().ignore_index

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        """Compute seqeval precision, recall and F1 from the aligned label sequences."""
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)

        return {
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        # NOTE(review): newer transformers releases appear to name this argument
        # `resume_from_checkpoint` — confirm against the installed version.
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )
        print('test_dataset:', test_dataset[0])
        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids)

        # Save the test metrics
        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                logger.info("***** Test results *****")
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions: walk test.txt line by line and pair every token line with
        # the next unused prediction of the current example.
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
                    example_id = 0
                    num_examples = len(preds_list)
                    for line in f:
                        # False once every example has been consumed; guards against an
                        # IndexError on extra blank/trailing lines at the end of the file.
                        has_example = example_id < num_examples
                        if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                            writer.write(line)
                            # A separator after a fully consumed example moves on to the next one.
                            if has_example and not preds_list[example_id]:
                                example_id += 1
                        elif has_example and preds_list[example_id]:
                            entity_label = preds_list[example_id].pop(0)
                            if entity_label == 'O':
                                output_line = line.split()[0] + " " + entity_label + "\n"
                            else:
                                # Only the first character of a non-'O' label is written.
                                output_line = line.split()[0] + " " + entity_label[0] + "\n"
                            writer.write(output_line)
                        else:
                            logger.warning(
                                "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                            )

    return results


def _mp_fn(index):
    """Entry point for xla_spawn (TPUs): runs ``main()``; ``index`` is accepted but not used here."""
    # For xla_spawn (TPUs)
    main()


# Run the whole pipeline when this code is executed as the __main__ module
# (which is the case when the notebook cell is run).
if __name__ == "__main__":
    main()


12/10/2021 09:47:27 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each

wordssssssssssssssssssssssssssss: ['Evolutionary', 'comparisons', 'between', 'these', 'P450', 'genes', 'are', 'the', 'first', 'available', 'for', 'a', 'group', 'of', 'insect', 'genes', 'transcriptionally', 'regulated', 'by', 'hostplant', 'allelochemicals', 'and', 'provide', 'insights', 'into', 'the', 'process', 'by', 'which', 'insects', 'evolve', 'specialized', 'feeding', 'habits', '.']


12/10/2021 09:47:41 - INFO - utils_ner -   Saving features into cached file /content/drive/My Drive/biobert/NER_species/datasets_fold/v9_t0/cached_devel_BertTokenizer_50
***** Running training *****
  Num examples = 1947
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 488


Step,Training Loss


Saving model checkpoint to output_fold/output_v9_t0/checkpoint-200
Configuration saved in output_fold/output_v9_t0/checkpoint-200/config.json
Model weights saved in output_fold/output_v9_t0/checkpoint-200/pytorch_model.bin
Saving model checkpoint to output_fold/output_v9_t0/checkpoint-400
Configuration saved in output_fold/output_v9_t0/checkpoint-400/config.json
Model weights saved in output_fold/output_v9_t0/checkpoint-400/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to output_fold/output_v9_t0
Configuration saved in output_fold/output_v9_t0/config.json
Model weights saved in output_fold/output_v9_t0/pytorch_model.bin
tokenizer config file saved in output_fold/output_v9_t0/tokenizer_config.json
Special tokens file saved in output_fold/output_v9_t0/special_tokens_map.json
12/10/2021 09:52:51 - INFO - __main__ -   *** Evaluate ***
***** Running Evaluation *****
  Num examples = 256
  Batch size = 8


12/10/2021 09:52:54 - INFO - __main__ -   ***** Eval results *****
12/10/2021 09:52:54 - INFO - __main__ -     eval_loss = 0.06866694986820221
12/10/2021 09:52:54 - INFO - __main__ -     eval_precision = 0.89
12/10/2021 09:52:54 - INFO - __main__ -     eval_recall = 0.7542372881355932
12/10/2021 09:52:54 - INFO - __main__ -     eval_f1 = 0.8165137614678899
12/10/2021 09:52:54 - INFO - __main__ -     eval_runtime = 2.9823
12/10/2021 09:52:54 - INFO - __main__ -     eval_samples_per_second = 85.84
12/10/2021 09:52:54 - INFO - __main__ -     eval_steps_per_second = 10.73
12/10/2021 09:52:54 - INFO - __main__ -     epoch = 8.0
12/10/2021 09:52:54 - INFO - utils_ner -   Creating features from dataset file at /content/drive/My Drive/biobert/NER_species/datasets_fold/v9_t0/
12/10/2021 09:52:55 - INFO - utils_ner -   Writing example 0 of 216
12/10/2021 09:52:55 - INFO - utils_ner -   *** Example ***
12/10/2021 09:52:55 - INFO - utils_ner -   guid: test-1
12/10/2021 09:52:55 - INFO - utils_ner 

wordssssssssssssssssssssssssssss: ['PCR', 'protocol', 'was', 'developed', 'specific', 'to', 'the', 'transgenic', 'sperm', 'DNA', '.']


12/10/2021 09:52:55 - INFO - utils_ner -   Saving features into cached file /content/drive/My Drive/biobert/NER_species/datasets_fold/v9_t0/cached_test_BertTokenizer_50
***** Running Prediction *****
  Num examples = 216
  Batch size = 8


test_dataset: InputFeatures(input_ids=[101, 1126, 4184, 18809, 1279, 2585, 10436, 5053, 1202, 1775, 118, 170, 1477, 6117, 1887, 11626, 1114, 9077, 1121, 6531, 2114, 1104, 174, 12658, 26503, 3052, 18922, 170, 1744, 1116, 5250, 1566, 2225, 6758, 27555, 1105, 1110, 1107, 170, 21996, 5565, 10005, 119, 102, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], label_ids=[-100, 0, -100, -100, -100, 1, -100, -100, 2, -100, -100, -100, -100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -100, -100, -100, 2, 2, 2, -100, 2, -100, -100, -100, 2, 2, 2, 2, 2, 2, 2, 2, 2, -100, -100, -100, -100, -100, -100, -100])


12/10/2021 09:52:59 - INFO - __main__ -   ***** Test results *****
12/10/2021 09:52:59 - INFO - __main__ -     test_loss = 0.03681797534227371
12/10/2021 09:52:59 - INFO - __main__ -     test_precision = 0.7884615384615384
12/10/2021 09:52:59 - INFO - __main__ -     test_recall = 0.9111111111111111
12/10/2021 09:52:59 - INFO - __main__ -     test_f1 = 0.845360824742268
12/10/2021 09:52:59 - INFO - __main__ -     test_runtime = 2.9894
12/10/2021 09:52:59 - INFO - __main__ -     test_samples_per_second = 72.254
12/10/2021 09:52:59 - INFO - __main__ -     test_steps_per_second = 9.032
