In [1]:
import math
import os

import nemo
from nemo.utils.lr_policies import WarmupAnnealing

import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer
from nemo.collections.nlp.callbacks.token_classification_callback import \
    eval_iter_callback, eval_epochs_done_callback
from nemo.backends.pytorch.common.losses import CrossEntropyLossNM
from nemo.collections.nlp.nm.trainables import TokenClassifier
from nemo import logging



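You can download the data from the [BERT-NER repository](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and preprocess it with the [import_from_iob_format.py](https://github.com/NVIDIA/NeMo/blob/master/examples/nlp/token_classification/import_from_iob_format.py) script. The cells below expect the resulting `text_train.txt`, `labels_train.txt`, `text_dev.txt` and `labels_dev.txt` files to be in `DATA_DIR`.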

In [2]:
# --- Run configuration: everything a reader might want to tune ---------------

# Directory holding the preprocessed text_*.txt / labels_*.txt files.
# Export NER_DATA_DIR before starting the kernel to point at another location;
# when it is unset, the original container path is used.
DATA_DIR = os.environ.get(
    "NER_DATA_DIR",
    "/workspace/nemo/examples/nlp/token_classification/BERT-NER/data")
# Passed to the neural factory as its log directory.
WORK_DIR = "./logs"

# Name of the pretrained model used for both the tokenizer and the encoder.
PRETRAINED_BERT_MODEL = "bert-base-cased"
# Passed to both data layers as max_seq_length.
MAX_SEQ_LENGTH = 128
# Dropout handed to the token classification head.
CLASSIFICATION_DROPOUT = 0.1

# Optimisation settings forwarded to neural_factory.train().
OPTIMIZER = "adam"
LEARNING_RATE = 0.00005
LR_WARMUP_PROPORTION = 0.1  # used as warmup_ratio of the LR policy
NUM_EPOCHS = 3
BATCH_SIZE = 32
# Forwarded as batches_per_step; also used when computing steps per epoch.
BATCHES_PER_STEP = 1

In [3]:
# Build the neural module factory on the PyTorch backend.
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,

    # Results are written under this directory.
    log_dir=WORK_DIR,

    # Single-GPU placement. For multi-GPU training use
    # nemo.core.DeviceType.AllGpu instead.
    placement=nemo.core.DeviceType.GPU,

    # For multi-GPU training this value should come from something like
    # argparse; see examples/nlp/token_classification.py for an example.
    local_rank=None,

    # For mixed precision training, set this to mxprO1 or mxprO2.
    # Opt levels are described at https://nvidia.github.io/apex/amp.html#opt-levels
    optimization_level="O0",
)

In [4]:
# Standard BERT setup: the tokenizer and the encoder are both created from the
# same pretrained model name. The full list of BERT/ALBERT/RoBERTa model names
# is returned by nemo_nlp.nm.trainables.get_bert_models_list().

tokenizer = NemoBertTokenizer(
    pretrained_model=PRETRAINED_BERT_MODEL,
)
bert_model = nemo_nlp.nm.trainables.get_huggingface_model(
    pretrained_model_name=PRETRAINED_BERT_MODEL,
)

[NeMo I 2020-06-14 22:44:30 bert_tokenizer:78] Deriving bert model type from pretrained model name.


In [5]:
# Training DAG: data layer -> BERT encoder -> token classifier -> loss
train_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(
    text_file=os.path.join(DATA_DIR, 'text_train.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_train.txt'),
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    max_seq_length=MAX_SEQ_LENGTH,
)

# Label mapping taken from the training dataset; the evaluation data layer is
# given the same mapping, and its size sets the classifier's output width.
label_ids = train_data_layer.dataset.label_ids
num_classes = len(label_ids)

# Classification head sized to the encoder's hidden dimension.
hidden_size = bert_model.hidden_size
ner_classifier = TokenClassifier(
    hidden_size=hidden_size,
    num_classes=num_classes,
    dropout=CLASSIFICATION_DROPOUT,
)

ner_loss = CrossEntropyLossNM(logits_ndim=3)

# The data layer yields six outputs; the fifth is not needed for training.
(input_ids,
 input_type_ids,
 input_mask,
 loss_mask,
 _,
 labels) = train_data_layer()

hidden_states = bert_model(
    input_ids=input_ids,
    token_type_ids=input_type_ids,
    attention_mask=input_mask,
)

logits = ner_classifier(hidden_states=hidden_states)
loss = ner_loss(logits=logits, labels=labels, loss_mask=loss_mask)

[NeMo I 2020-06-14 22:44:39 token_classification_dataset:273] Creating a new label to label_id dictionary. It's recommended to use label_ids generated during training for dev/test sets to avoid errors if some labels are not present in the dev/test sets. For training set label_ids should be None.
[NeMo I 2020-06-14 22:44:49 token_classification_dataset:116] Max length: 128
[NeMo I 2020-06-14 22:44:49 data_preprocessing:245] Min: 3 |                  Max: 173 |                  Mean: 21.41421551171569 |                  Median: 17.0
[NeMo I 2020-06-14 22:44:49 data_preprocessing:247] 75 percentile: 30.0
[NeMo I 2020-06-14 22:44:49 data_preprocessing:248] 99 percentile: 58.0


[NeMo W 2020-06-14 22:44:50 token_classification_dataset:145] 1 are longer than 128


[NeMo I 2020-06-14 22:44:50 token_classification_dataset:148] *** Example ***
[NeMo I 2020-06-14 22:44:50 token_classification_dataset:149] i: 0
[NeMo I 2020-06-14 22:44:50 token_classification_dataset:150] subtokens: [CLS] EU rejects German call to boycott British la ##mb . [SEP]
[NeMo I 2020-06-14 22:44:50 token_classification_dataset:151] loss_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[NeMo I 2020-06-14 22:44:50 token_classification_dataset:152] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[NeMo I 2020-06-14 22:44:50 token_classification_dataset:15

In [6]:
# Evaluation DAG: dev data pushed through the same encoder and classifier
# modules that the training DAG uses.
eval_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(
    text_file=os.path.join(DATA_DIR, 'text_dev.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),
    tokenizer=tokenizer,
    # Reuse the mapping obtained from the training data layer.
    label_ids=label_ids,
    batch_size=BATCH_SIZE,
    max_seq_length=MAX_SEQ_LENGTH,
)

# Six outputs again; here the fourth one is dropped and the subtokens mask is
# kept because the evaluation callback consumes it.
(eval_input_ids,
 eval_input_type_ids,
 eval_input_mask,
 _,
 eval_subtokens_mask,
 eval_labels) = eval_data_layer()

hidden_states = bert_model(input_ids=eval_input_ids,
                           token_type_ids=eval_input_type_ids,
                           attention_mask=eval_input_mask)

eval_logits = ner_classifier(hidden_states=hidden_states)

[NeMo I 2020-06-14 22:44:54 token_classification_dataset:265] Using the provided label_ids dictionary.
[NeMo I 2020-06-14 22:44:57 token_classification_dataset:116] Max length: 128
[NeMo I 2020-06-14 22:44:57 data_preprocessing:245] Min: 3 |                  Max: 151 |                  Mean: 22.90707692307692 |                  Median: 18.0
[NeMo I 2020-06-14 22:44:57 data_preprocessing:247] 75 percentile: 32.0
[NeMo I 2020-06-14 22:44:57 data_preprocessing:248] 99 percentile: 62.0


[NeMo W 2020-06-14 22:44:57 token_classification_dataset:145] 4 are longer than 128


[NeMo I 2020-06-14 22:44:57 token_classification_dataset:148] *** Example ***
[NeMo I 2020-06-14 22:44:57 token_classification_dataset:149] i: 0
[NeMo I 2020-06-14 22:44:57 token_classification_dataset:150] subtokens: [CLS] CR ##IC ##KE ##T - L ##EI ##CE ##ST ##ER ##S ##H ##IR ##E T ##A ##KE O ##VE ##R AT TO ##P A ##FT ##ER IN ##NI ##NG ##S VI ##CT ##OR ##Y . [SEP]
[NeMo I 2020-06-14 22:44:57 token_classification_dataset:151] loss_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[NeMo I 2020-06-14 22:44:57 token_classification_dataset:152] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [7]:
# Number of training steps in one epoch. When training on multiple GPUs this
# should instead be
# train_data_size / (batch_size * batches_per_step * num_gpus)
train_data_size = len(train_data_layer)
steps_per_epoch = int(train_data_size / (BATCHES_PER_STEP * BATCH_SIZE))

# Logs the current value of the training loss.
callback_train = nemo.core.SimpleLossLoggerCallback(
    print_func=lambda x: logging.info("Loss: {:.3f}".format(x[0].item())),
    tensors=[loss],
)

# Evaluates the model every `steps_per_epoch` steps.
callback_eval = nemo.core.EvaluatorCallback(
    eval_step=steps_per_epoch,
    eval_tensors=[eval_logits, eval_labels, eval_subtokens_mask],
    user_iter_callback=lambda x, y: eval_iter_callback(x, y),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, label_ids),
)

# Stores checkpoints in the factory's checkpoint directory
# (the checkpoints folder inside WORK_DIR).
ckpt_callback = nemo.core.CheckpointCallback(
    epoch_freq=1,
    folder=neural_factory.checkpoint_dir,
)

In [8]:
# Learning-rate schedule spanning every training step, with the configured
# proportion used as the warmup ratio.
lr_policy = WarmupAnnealing(
    NUM_EPOCHS * steps_per_epoch,
    warmup_ratio=LR_WARMUP_PROPORTION,
)

# Start training: optimise the loss, with logging, evaluation and
# checkpointing handled by the callbacks.
neural_factory.train(
    tensors_to_optimize=[loss],
    optimizer=OPTIMIZER,
    optimization_params=dict(num_epochs=NUM_EPOCHS, lr=LEARNING_RATE),
    lr_policy=lr_policy,
    batches_per_step=BATCHES_PER_STEP,
    callbacks=[callback_train, callback_eval, ckpt_callback],
)

[NeMo I 2020-06-14 22:45:06 callbacks:186] Starting .....
[NeMo I 2020-06-14 22:45:06 callbacks:355] Found 2 modules with weights:
[NeMo I 2020-06-14 22:45:06 callbacks:357] BERT
[NeMo I 2020-06-14 22:45:06 callbacks:357] TokenClassifier
[NeMo I 2020-06-14 22:45:06 callbacks:358] Total model parameters: 108907785
[NeMo I 2020-06-14 22:45:06 callbacks:307] Found checkpoint folder ./logs/checkpoints. Will attempt to restore checkpoints from it.


[NeMo W 2020-06-14 22:45:06 callbacks:324] For module TokenClassifier, no file matches  in ./logs/checkpoints
[NeMo W 2020-06-14 22:45:06 callbacks:326] Checkpoint folder ./logs/checkpoints was present but nothing was restored. Continuing training from random initialization.


[NeMo I 2020-06-14 22:45:06 callbacks:197] Starting epoch 0
[NeMo I 2020-06-14 22:45:08 callbacks:220] Step: 0
[NeMo I 2020-06-14 22:45:08 <ipython-input-7-444126ad7eab>:3] Loss: 2.275
[NeMo I 2020-06-14 22:45:08 callbacks:235] Step time: 0.5700497627258301 seconds
[NeMo I 2020-06-14 22:45:08 callbacks:440] Doing Evaluation ..............................
[NeMo I 2020-06-14 22:45:17 token_classification_callback:78] Sampled preds: [8 7 8 8 8 8 8 8 7 7 8 8 8 2 8 8 8 7 8 8]
[NeMo I 2020-06-14 22:45:17 token_classification_callback:79] Sampled labels: [3 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 3 0 0 0]
[NeMo I 2020-06-14 22:45:17 token_classification_callback:82] Accuracy: 0.01843407771152422
[NeMo I 2020-06-14 22:45:17 token_classification_callback:86] F1 weighted: 0.68
[NeMo I 2020-06-14 22:45:17 token_classification_callback:86] F1 macro: 1.49
[NeMo I 2020-06-14 22:45:17 token_classification_callback:86] F1 micro: 1.84
[NeMo I 2020-06-14 22:45:17 token_classification_callback:89]                 