In [None]:
import math
import os

import nemo
from nemo.utils.lr_policies import WarmupAnnealing

import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer
from nemo.collections.nlp.callbacks.token_classification_callback import \
    eval_iter_callback, eval_epochs_done_callback
from nemo.backends.pytorch.common.losses import CrossEntropyLossNM
from nemo.collections.nlp.nm.trainables import TokenClassifier
from nemo import logging

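You can download the data from the [BERT-NER repository](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and preprocess it with the [`import_from_iob_format.py`](https://github.com/NVIDIA/NeMo/blob/master/examples/nlp/token_classification/import_from_iob_format.py) script. The cells below read `text_train.txt`, `labels_train.txt`, `text_dev.txt` and `labels_dev.txt` from `DATA_DIR`.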

In [None]:
# Notebook configuration: every tunable value lives here so the cells below
# never hard-code a hyperparameter or a path.

# Filesystem locations. Both values are placeholders and must be replaced
# with real paths before the notebook is run.
DATA_DIR = "PATH TO WHERE THE DATA IS"
WORK_DIR = "PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS"

# Batching. BATCHES_PER_STEP is forwarded to the trainer as
# `batches_per_step`; BATCH_SIZE and MAX_SEQ_LENGTH go to the data layers.
BATCH_SIZE = 32
BATCHES_PER_STEP = 1
MAX_SEQ_LENGTH = 128

# Dropout used by the token classification head.
CLASSIFICATION_DROPOUT = 0.1

# Optimisation settings: optimizer name, number of epochs, learning rate and
# the warmup ratio handed to the learning-rate policy.
OPTIMIZER = "adam"
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
LR_WARMUP_PROPORTION = 0.1

In [None]:
# Create the neural module factory that owns the backend, device placement,
# precision level and log directory for the rest of this notebook.
neural_factory = nemo.core.NeuralModuleFactory(
    # Supported backend to build modules with.
    backend=nemo.core.Backend.PyTorch,
    # Device placement. When training with multiple GPUs, use
    # nemo.core.DeviceType.AllGpu instead.
    placement=nemo.core.DeviceType.GPU,
    # When training with multiple GPUs, this value should be supplied through
    # something like argparse; examples/nlp/token_classification.py shows how.
    local_rank=None,
    # For mixed-precision training, set this to mxprO1 or mxprO2.
    # The levels are described at https://nvidia.github.io/apex/amp.html#opt-levels
    optimization_level="O0",
    # Directory in which the results of the run are stored.
    log_dir=WORK_DIR,
)

In [None]:
# Standard pretrained BERT setup: the encoder and the tokenizer are both built
# from the "bert-base-cased" model name. For the full list of BERT model names,
# check list_pretrained_models() on the BERT class used below
# (NOTE(review): confirm the method is exposed at this import path).
bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name="bert-base-cased")
tokenizer = NemoBertTokenizer(pretrained_model="bert-base-cased")

In [None]:
# Training DAG: data layer -> BERT encoder -> token classifier -> loss.

# Data layer that reads the training text/label files from DATA_DIR.
train_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_train.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_train.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE,
)

# The label mapping is taken from the training dataset; its size fixes the
# number of output classes, and the same mapping is passed to the evaluation
# data layer and the evaluation callback.
label_ids = train_data_layer.dataset.label_ids
num_classes = len(label_ids)

# Loss module (presumably logits_dim=3 refers to the rank of the logits
# tensor -- TODO confirm against the CrossEntropyLossNM signature).
ner_loss = CrossEntropyLossNM(logits_dim=3)

# Classification head sized to the encoder's hidden dimension.
hidden_size = bert_model.hidden_size
ner_classifier = TokenClassifier(
    hidden_size=hidden_size,
    num_classes=num_classes,
    dropout=CLASSIFICATION_DROPOUT,
)

# Wire the graph. The fifth data-layer output is not used for training and is
# discarded into `_`.
(input_ids,
 input_type_ids,
 input_mask,
 loss_mask,
 _,
 labels) = train_data_layer()

hidden_states = bert_model(
    input_ids=input_ids,
    token_type_ids=input_type_ids,
    attention_mask=input_mask,
)
logits = ner_classifier(hidden_states=hidden_states)
loss = ner_loss(logits=logits, labels=labels, loss_mask=loss_mask)

In [None]:
# Evaluation DAG: same encoder and classifier as training, fed from the dev set.

# Data layer for the dev files; `label_ids` from the training dataset is
# passed in so both data layers use the same label mapping.
eval_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_dev.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE,
    label_ids=label_ids,
)

# The fourth output is not needed here and is discarded into `_`; the
# subtokens mask and labels are handed to the evaluation callback.
(eval_input_ids,
 eval_input_type_ids,
 eval_input_mask,
 _,
 eval_subtokens_mask,
 eval_labels) = eval_data_layer()

# NOTE: this rebinds `hidden_states`, which the training cell also assigns.
hidden_states = bert_model(input_ids=eval_input_ids,
                           token_type_ids=eval_input_type_ids,
                           attention_mask=eval_input_mask)

eval_logits = ner_classifier(hidden_states=hidden_states)

In [None]:
# Callbacks used during training: loss logging, periodic evaluation and
# checkpointing.

# Logs the training loss; `x` is the list of evaluated `tensors`, so x[0] is
# the loss value.
callback_train = nemo.core.SimpleLossLoggerCallback(
    tensors=[loss],
    print_func=lambda x: logging.info("Loss: {:.3f}".format(x[0].item())))

train_data_size = len(train_data_layer)

# Number of optimizer steps in one pass over the training data.
# If you're training on multiple GPUs, this should be
# train_data_size / (batch_size * batches_per_step * num_gpus)
#
# Integer floor division is used, and the result is clamped to at least 1:
# with fewer training examples than one optimizer step the plain quotient is
# 0, which would give `eval_step=0` below and a learning-rate policy with
# zero total steps.
steps_per_epoch = max(1, train_data_size // (BATCHES_PER_STEP * BATCH_SIZE))

# Callback to evaluate the model once every `steps_per_epoch` steps.
# `eval_iter_callback` already has the two-argument shape expected here, so
# it is passed directly; the epoch-level callback still needs a wrapper to
# bind `label_ids`.
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_logits, eval_labels, eval_subtokens_mask],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, label_ids),
    eval_step=steps_per_epoch)

# Callback to store checkpoints
# Checkpoints will be stored in checkpoints folder inside WORK_DIR
ckpt_callback = nemo.core.CheckpointCallback(
    folder=neural_factory.checkpoint_dir,
    epoch_freq=1)

In [None]:
# Learning-rate policy spanning the whole run (epochs * steps per epoch), with
# LR_WARMUP_PROPORTION passed as the warmup ratio.
lr_policy = WarmupAnnealing(
    NUM_EPOCHS * steps_per_epoch,
    warmup_ratio=LR_WARMUP_PROPORTION,
)

# Start training: optimize the loss with the configured optimizer, running
# the logging, evaluation and checkpoint callbacks along the way.
neural_factory.train(
    tensors_to_optimize=[loss],
    optimizer=OPTIMIZER,
    optimization_params={"num_epochs": NUM_EPOCHS, "lr": LEARNING_RATE},
    lr_policy=lr_policy,
    batches_per_step=BATCHES_PER_STEP,
    callbacks=[callback_train, callback_eval, ckpt_callback],
)