In [None]:
import math
import numpy as np
import os

import nemo
from nemo import logging
from nemo.utils.lr_policies import WarmupAnnealing

import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.data import NemoBertTokenizer
from nemo.collections.nlp.nm.trainables import TokenClassifier
from nemo.backends.pytorch.common.losses import CrossEntropyLossNM, LossAggregatorNM
from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import eval_iter_callback, eval_epochs_done_callback
from nemo.collections.nlp.data.datasets.datasets_utils import calc_class_weights

# --- paths and pre-trained model -------------------------------------------
# Placeholders: replace both paths with real locations before running.
DATA_DIR = "PATH_TO_WHERE_THE_DATA_IS"
WORK_DIR = "PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS"
PRETRAINED_BERT_MODEL = "bert-base-uncased"

# --- data -------------------------------------------------------------------
NUM_SAMPLES = 100_000
MAX_SEQ_LENGTH = 64

# --- model ------------------------------------------------------------------
CLASSIFICATION_DROPOUT = 0.1
PUNCT_NUM_FC_LAYERS = 3

# --- optimization -----------------------------------------------------------
BATCHES_PER_STEP = 1
BATCH_SIZE = 128
NUM_EPOCHS = 10
OPTIMIZER = "adam"
LEARNING_RATE = 2e-5
LR_WARMUP_PROPORTION = 0.1

# --- logging / checkpointing --------------------------------------------------
# determines how often loss will be printed and checkpoint saved
STEP_FREQ = 200

# Download and preprocess the data

In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng). To improve the performance of the model, set NUM_SAMPLES=-1 and consider including other datasets. Use [NeMo/examples/nlp/token_classification/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/examples/nlp/token_classification/get_tatoeba_data.py) to download and preprocess the Tatoeba data.

In [None]:
# This should take about a minute since the data is already downloaded in the previous step

! python get_tatoeba_data.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES

After the previous step, you should have a `DATA_DIR` folder with the following files:
- labels_train.txt
- labels_dev.txt
- text_train.txt
- text_dev.txt

The format of the data is described in the NeMo docs.

# Define Neural Modules

In [None]:
# Instantiate neural factory with supported backend
nf = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    # Device placement. If you're training with multiple GPUs, this should be
    # set to nemo.core.DeviceType.AllGpu
    placement=nemo.core.DeviceType.GPU,
    # If you're training with multiple GPUs, you should handle this value with
    # something like argparse. See examples/nlp/token_classification.py for an example.
    local_rank=None,
    # Mixed-precision optimization level; "O1" is what is passed here.
    # See https://nvidia.github.io/apex/amp.html#opt-levels for more details.
    # NOTE(review): this level was previously described as mxprO1 / mxprO2 -
    # confirm which spellings the installed NeMo version accepts.
    optimization_level="O1",
    # Define path to the directory you want to store your results
    log_dir=WORK_DIR,
)

In [None]:
# Standard pre-trained BERT setup: the tokenizer and the encoder are both created
# from the same PRETRAINED_BERT_MODEL name. To see the full list of BERT model names,
# check out nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models()

tokenizer = NemoBertTokenizer(
    pretrained_model=PRETRAINED_BERT_MODEL,
)
bert_model = nemo_nlp.nm.trainables.huggingface.BERT(
    pretrained_model_name=PRETRAINED_BERT_MODEL,
)

# Describe training DAG

In [None]:
# Data layer that reads the training text and its label file from DATA_DIR
train_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_train.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_train.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE,
)

# Label -> id mappings produced by the training dataset
punct_label_ids = train_data_layer.dataset.punct_label_ids
capit_label_ids = train_data_layer.dataset.capit_label_ids


# Token-level classification heads on top of BERT, one per task.
# Only the punctuation head overrides the number of layers (PUNCT_NUM_FC_LAYERS).
punct_classifier = TokenClassifier(
    hidden_size=bert_model.hidden_size,
    num_classes=len(punct_label_ids),
    dropout=CLASSIFICATION_DROPOUT,
    num_layers=PUNCT_NUM_FC_LAYERS,
    name='Punctuation',
)

capit_classifier = TokenClassifier(
    hidden_size=bert_model.hidden_size,
    num_classes=len(capit_label_ids),
    dropout=CLASSIFICATION_DROPOUT,
    name='Capitalization',
)


# Class weights for the Punctuation loss are derived from the label frequencies
# of the training set. If you don't want to use weighted loss for Punctuation
# task, use class_weights=None
punct_label_freqs = train_data_layer.dataset.punct_label_frequencies
class_weights = calc_class_weights(punct_label_freqs)

# Loss modules: one cross-entropy per task (only Punctuation is weighted) and an
# aggregator that combines the two task losses
punct_loss = CrossEntropyLossNM(logits_dim=3, weight=class_weights)
capit_loss = CrossEntropyLossNM(logits_dim=3)
task_loss = LossAggregatorNM(num_inputs=2)

In [None]:
# Wire the training graph: data layer -> BERT -> two classifiers -> losses
(
    input_ids,
    input_type_ids,
    input_mask,
    loss_mask,
    subtokens_mask,
    punct_labels,
    capit_labels,
) = train_data_layer()

hidden_states = bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)

# Both classification heads consume the same BERT hidden states
punct_logits = punct_classifier(hidden_states=hidden_states)
capit_logits = capit_classifier(hidden_states=hidden_states)

# NOTE: from here on punct_loss, capit_loss and task_loss no longer refer to the
# loss modules but to the values returned by calling them. Re-run the cell that
# creates the loss modules before re-running this cell.
punct_loss = punct_loss(logits=punct_logits, labels=punct_labels, loss_mask=loss_mask)
capit_loss = capit_loss(logits=capit_logits, labels=capit_labels, loss_mask=loss_mask)

# Combine the two task losses into the single loss that is optimized
task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)

# Describe evaluation DAG

In [None]:
# The evaluation data layer must be given punct_label_ids and capit_label_ids - the
# mappings from labels to label ids generated when train_data_layer was created.
# This keeps the mapping correct even if some of the labels from the train set are
# missing in the dev set.

eval_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_dev.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE,
    punct_label_ids=punct_label_ids,
    capit_label_ids=capit_label_ids,
)

# The fourth output (the loss mask in the training cell) is not needed here
(
    eval_input_ids,
    eval_input_type_ids,
    eval_input_mask,
    _,
    eval_subtokens_mask,
    eval_punct_labels,
    eval_capit_labels,
) = eval_data_layer()

# Same BERT encoder and classifiers as in the training graph, applied to the dev data.
# Note that this rebinds the name hidden_states.
hidden_states = bert_model(
    input_ids=eval_input_ids, token_type_ids=eval_input_type_ids, attention_mask=eval_input_mask
)

eval_punct_logits = punct_classifier(hidden_states=hidden_states)
eval_capit_logits = capit_classifier(hidden_states=hidden_states)

# Create callbacks

In [None]:
# Callback that logs the training loss every STEP_FREQ steps.
# The print function formats the first value it receives
# (presumably task_loss, the first entry of `tensors` - verify against NeMo docs).
callback_train = nemo.core.SimpleLossLoggerCallback(
    tensors=[task_loss, punct_loss, capit_loss, punct_logits, capit_logits],
    print_func=lambda values: logging.info("Loss: {:.3f}".format(values[0].item())),
    step_freq=STEP_FREQ,
)

train_data_size = len(train_data_layer)

# If you're training on multiple GPUs, this should be
# train_data_size / (batch_size * batches_per_step * num_gpus)
steps_per_epoch = int(train_data_size / (BATCHES_PER_STEP * BATCH_SIZE))
print('Number of steps per epoch: ', steps_per_epoch)

# Callback to evaluate the model; eval_step is set to steps_per_epoch
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[
        eval_punct_logits,
        eval_capit_logits,
        eval_punct_labels,
        eval_capit_labels,
        eval_subtokens_mask,
    ],
    user_iter_callback=lambda tensors, global_vars: eval_iter_callback(tensors, global_vars),
    user_epochs_done_callback=lambda global_vars: eval_epochs_done_callback(
        global_vars, punct_label_ids, capit_label_ids
    ),
    eval_step=steps_per_epoch,
)

# Callback to store checkpoints in the factory's checkpoint folder every STEP_FREQ steps
ckpt_callback = nemo.core.CheckpointCallback(
    folder=nf.checkpoint_dir,
    step_freq=STEP_FREQ,
)

# Training

In [None]:
# Learning-rate schedule over the whole run (NUM_EPOCHS * steps_per_epoch steps),
# with LR_WARMUP_PROPORTION passed as the warm-up ratio
lr_policy = WarmupAnnealing(
    NUM_EPOCHS * steps_per_epoch,
    warmup_ratio=LR_WARMUP_PROPORTION,
)

# Optimize the aggregated task loss, with the logging, evaluation and checkpoint
# callbacks attached
nf.train(
    tensors_to_optimize=[task_loss],
    callbacks=[callback_train, callback_eval, ckpt_callback],
    lr_policy=lr_policy,
    batches_per_step=BATCHES_PER_STEP,
    optimizer=OPTIMIZER,
    optimization_params=dict(num_epochs=NUM_EPOCHS, lr=LEARNING_RATE),
)

Training for 10 epochs on the subset of data should take about 20 minutes on a single V100 GPU.
The model performance should be similar to the following:
    
                precision    recall  f1-score   support
           O       1.00      0.99      0.99    137268
           ,       0.58      0.95      0.72      2347
           .       0.99      1.00      1.00     19078
           ?       0.98      0.99      0.99      1151

    accuracy                           0.99    159844
    macro avg       0.89      0.98     0.92    159844
    weighted avg    0.99      0.99     0.99    159844

                precision    recall  f1-score   support
           O       1.00      1.00      1.00    136244
           U       1.00      0.99      0.99     23600

    accuracy                           1.00    159844
    macro avg       1.00      1.00     1.00    159844
    weighted avg    1.00      1.00     1.00    159844

# Inference

In [None]:
# Define the list of queries for inference
# (lower-cased, unpunctuated text, as the model is expected to restore both)
queries = [
    'can i help you',
    'yes please',
    'we bought four shirts from the nvidia gear store in santa clara',
    'we bought four shirts one mug and ten thousand titan rtx graphics cards',
    'the more you buy the more you save',
]

In [None]:
# Build a separate inference graph over the raw queries.
# Every tensor created here carries an `infer_` prefix (mirroring the `eval_` prefix
# of the evaluation graph) so that the training-graph names input_ids, punct_logits,
# capit_logits, subtokens_mask, ... are not overwritten; otherwise re-running an
# earlier cell (e.g. the callbacks) would silently pick up the inference tensors.
infer_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(
    queries=queries,
    tokenizer=tokenizer,
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=1)

# The fourth output of the data layer is not needed for inference
infer_input_ids, infer_input_type_ids, infer_input_mask, _, infer_subtokens_mask = infer_data_layer()

infer_hidden_states = bert_model(
    input_ids=infer_input_ids,
    token_type_ids=infer_input_type_ids,
    attention_mask=infer_input_mask)

infer_punct_logits = punct_classifier(hidden_states=infer_hidden_states)
infer_capit_logits = capit_classifier(hidden_states=infer_hidden_states)

# Load weights from the folder the checkpoint callback writes to (nf.checkpoint_dir)
# instead of rebuilding that path by hand from WORK_DIR.
evaluated_tensors = nf.infer(
    tensors=[infer_punct_logits, infer_capit_logits, infer_subtokens_mask],
    checkpoint_dir=nf.checkpoint_dir)

In [None]:
# helper functions
def concatenate(lists):
    """Merge a sequence of tensors into a single NumPy array.

    Args:
        lists: iterable of tensors; each one is moved to the CPU and converted
            to a NumPy array before merging.

    Returns:
        numpy.ndarray: the arrays joined by ``np.concatenate`` (default axis,
        i.e. along the first dimension).
    """
    arrays = [tensor.cpu().numpy() for tensor in lists]
    return np.concatenate(arrays)

# Invert the label -> id mappings so that predicted ids can be turned back into labels
punct_ids_to_labels = {label_id: label for label, label_id in punct_label_ids.items()}
capit_ids_to_labels = {label_id: label for label, label_id in capit_label_ids.items()}

# Merge each group of evaluated tensors into one array, then pick the
# highest-scoring class along axis 2
punct_logits, capit_logits, subtokens_mask = map(concatenate, evaluated_tensors)
punct_preds = np.argmax(punct_logits, axis=2)
capit_preds = np.argmax(capit_logits, axis=2)

for i, query in enumerate(queries):
    print(f'Query: {query}')

    # Keep only the positions whose mask value exceeds 0.5
    # (presumably one position per word - the length check below enforces this)
    selected = subtokens_mask[i] > 0.5
    punct_pred = punct_preds[i][selected]
    capit_pred = capit_preds[i][selected]

    words = query.strip().split()
    if not (len(punct_pred) == len(words) == len(capit_pred)):
        raise ValueError('Pred and words must be of the same length')

    pieces = []
    for word, punct_id, capit_id in zip(words, punct_pred, capit_pred):
        punct_label = punct_ids_to_labels[punct_id]
        capit_label = capit_ids_to_labels[capit_id]

        # Any capitalization label other than 'O' capitalizes the word
        if capit_label != 'O':
            word = word.capitalize()
        # Any punctuation label other than 'O' is appended right after the word
        pieces.append(word if punct_label == 'O' else word + punct_label)
    output = ' '.join(pieces)
    print(f'Combined: {output.strip()}\n')

The inference output should look something like this:<br>

Query: can i help you<br>
Combined: Can I help you?<br>

Query: yes please<br>
Combined: Yes, please.<br>

Query: we bought four shirts from the nvidia gear store in santa clara<br>
Combined: We bought four shirts from the Nvidia gear store in Santa Clara.<br>
            
Query: we bought four shirts one mug and ten thousand titan rtx graphics cards<br>
Combined: We bought four shirts, one mug, and ten thousand Titan Rtx graphics cards.<br>

Query: the more you buy the more you save<br>
Combined: The more you buy, the more you save.<br>

**Set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model.**