In [None]:
import nemo
from nemo.utils.lr_policies import get_lr_policy
import nemo_nlp
from nemo_nlp.utils.callbacks.sentence_classification import \
    eval_iter_callback, eval_epochs_done_callback
import preproc_data_layer
from pytorch_transformers import BertTokenizer
import torch.nn.functional as f

import math
import numpy as np
import pandas as pd
# Show full cell contents instead of truncating long sentences.
# pandas >= 1.0 spells "no limit" as None (-1 is deprecated and later rejected);
# older releases only accept an int, so fall back to the legacy -1 there.
try:
    pd.options.display.max_colwidth = None
except ValueError:
    pd.options.display.max_colwidth = -1

import json

from preproc_data_layer import BertSentenceClassificationDataset, PreprocBertSentenceClassificationDataLayer
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
%matplotlib inline

## Data Explore

The SST-2 dataset https://nlp.stanford.edu/sentiment/index.html is a standard benchmark for sentence classification and is part of the GLUE Benchmark: https://gluebenchmark.com/tasks.

In [None]:
!sh get_data.sh

In [None]:
# Directory holding the SST-2 files (presumably populated by get_data.sh — confirm).
data_dir = 'data/SST-2'

# Both files are tab-separated; read them into DataFrames.
df = pd.read_csv(f'{data_dir}/train.tsv', sep='\t')
test_df = pd.read_csv(f'{data_dir}/test.tsv', sep='\t')

In [None]:
df.head()

In [None]:
# Preview only the first rows; rendering the entire frame floods the output.
test_df.head()

The dataset comes with a train file (labeled) and a test file (not labeled). We will hold out part of the train file for model validation.

In [None]:
# Hold out roughly 20% of the labeled rows for validation.
# Seeding first makes the random mask, and therefore the split, reproducible.
np.random.seed(123)
train_mask = np.random.rand(len(df)) < .8
train_df = df.loc[train_mask]
val_df = df.loc[~train_mask]

In order to take advantage of NeMo's pre-built sentence classification data layer, the data must be formatted as "sentence\tlabel" (sentence, tab, label).

In [None]:
# The test file is unlabeled, so add a placeholder 'label' column of all 0's.
# The values are dummies and are not used for anything.
test_df['label'] = 0

In [None]:
# Keep only the two columns of the "sentence<TAB>label" layout, in that order.
test_df = test_df[['sentence', 'label']]

In [None]:
test_df.head()

In [None]:
# Write each split to <data_dir>/my_<split>.tsv as tab-separated text,
# leaving out the DataFrame index column.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df.to_csv(f'{data_dir}/my_{split_name}.tsv', sep='\t', index=False)

# Data Preprocessing

In order to use BERT or any other deep-learning-based NLP model, we must first tokenize the data. Our tokenizer will split each sentence into tokens (words or word pieces) and map each token to an integer id.

In [None]:
# Each pretrained BERT model comes with its own tokenizer, loaded by model name.
# To try the larger model, switch the name to 'bert-large-uncased'.
pretrained_bert_model = 'bert-base-uncased'
#pretrained_bert_model = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_bert_model)

In [None]:
# Fixed sequence length: shorter sentences are padded with 0's, longer ones truncated.
max_seq_length = 64

# Small, unshuffled 100-sample dataset built from the training file, used for inspection.
sample_dataset = BertSentenceClassificationDataset(
    data_dir + '/my_train.tsv',
    max_seq_length,
    tokenizer,
    num_samples=100,
    shuffle=False,
)

In [None]:
def print_tokenization(tokenizer, tokens):
    """Print each token id in ``tokens`` next to its vocabulary string.

    Printing stops once the fifth padding token (id 0) is reached, so at
    most four padding rows are shown.

    Args:
        tokenizer: object exposing ``vocab``, a mapping of string -> integer id.
        tokens: iterable of integer token ids.

    Raises:
        ValueError: if a token id is not present in ``tokenizer.vocab``.
    """
    # Build the reverse (id -> string) lookup once rather than materializing
    # the vocab keys/values and scanning them for every token.
    # setdefault keeps the first string seen for an id, matching list.index().
    id_to_token = {}
    for vocab_str, token_id in tokenizer.vocab.items():
        id_to_token.setdefault(token_id, vocab_str)

    pad_counter = 0
    for token in tokens:
        if token == 0:
            pad_counter += 1
        if pad_counter > 4:
            break
        try:
            # int() normalizes numpy / tensor scalars into a hashable dict key.
            vocab_str = id_to_token[int(token)]
        except KeyError:
            raise ValueError(f'{token} is not in the tokenizer vocab') from None
        print(f'{vocab_str:15s} | {token}')

In [None]:
# Pick a random item from the sample dataset and print its tokenization.
# Element 0 of an item is passed on as the sequence of token ids.
sample_idx = np.random.randint(0, len(sample_dataset))
tokens = sample_dataset[sample_idx][0]
print_tokenization(tokenizer, tokens)

We can gain a lot of efficiency by saving the tokenized data to disk. For future model runs we then don't need to tokenize every time.

In [None]:
!python preproc_data.py \
--input_file $f'{data_dir}/my_train.tsv' \
--output_dir $f'{data_dir}/preproc' \
--dataset_name 'train-sst-2' \
--max_seq_length $max_seq_length \
--pretrained_bert_model $pretrained_bert_model

In [None]:
!python preproc_data.py \
--input_file $f'{data_dir}/my_test.tsv' \
--output_dir $f'{data_dir}/preproc' \
--dataset_name "test-sst-2" \
--max_seq_length $max_seq_length \
--pretrained_bert_model $pretrained_bert_model

In [None]:
!python preproc_data.py \
--input_file $f'{data_dir}/my_val.tsv' \
--output_dir $f'{data_dir}/preproc' \
--dataset_name "val-sst-2" \
--max_seq_length $max_seq_length \
--pretrained_bert_model $pretrained_bert_model

## Neural Modules

In NeMo, everything is a Neural Module. Neural modules abstract data and neural network architectures. Where a deep learning framework like PyTorch or Tensorflow is used to combine neural network layers to create a neural network, NeMo is used to combine data and neural networks to create AI applications.

The Neural Module Factory will then manage the neural modules, taking care to flow data through the neural modules, and is also responsible for training (including mixed precision and distributed), logging, and inference.

In [None]:
# Output locations are suffixed with the model name so runs with different
# checkpoints do not overwrite each other.
log_dir = f'logs_{pretrained_bert_model}'
checkpoint_dir = f'checkpoints_{pretrained_bert_model}'
tensorboard_dir = f'tensorboard_{pretrained_bert_model}'

# Build the neural module factory with TensorBoard logging enabled.
# optimization_level='O1' is presumably the mixed-precision setting — confirm against NeMo docs.
nf = nemo.core.NeuralModuleFactory(
    log_dir=log_dir,
    checkpoint_dir=checkpoint_dir,
    tensorboard_dir=tensorboard_dir,
    create_tb_writer=True,
    add_time_to_log_dir=False,
    optimization_level='O1',
)

Pre-trained models will be automatically downloaded and cached.

In [None]:
# Pre-trained BERT encoder, selected by the same model name used for the tokenizer
bert = nemo_nlp.BERT(pretrained_model_name=pretrained_bert_model)

In [None]:
# Save the BERT model config as JSON for later use.
# The file is named after the model: <pretrained_bert_model>_config.json
bert_model_config_path = pretrained_bert_model + '_config.json'
bert_config_dict = bert.config.to_dict()

with open(bert_model_config_path, 'w+') as json_file:
    json_file.write(json.dumps(bert_config_dict))

Note here that the BERT models we are working with are massive. This gives our models a large capacity for learning that is needed to understand the nuance and complexity of natural language.

In [None]:
print(f'{pretrained_bert_model} has {bert.num_weights} weights')

Here we define and instantiate the feed forward network that takes as input our BERT embeddings. This network will be used to output the sentence classifications.

In [None]:
# The classifier's input width must match the width of the BERT hidden states.
bert_hidden_size = bert.local_parameters['hidden_size']

# Two-layer MLP head with two output classes; log_softmax is disabled,
# so the head's output is fed to the cross-entropy loss below as logits.
mlp = nemo_nlp.SequenceClassifier(
    hidden_size=bert_hidden_size,
    num_classes=2,
    num_layers=2,
    log_softmax=False,
    dropout=0.1,
)

loss = nemo.backends.pytorch.common.CrossEntropyLoss()

In [None]:
# Compared to the BERT model, the MLP is tiny.
print(f'MLP has {mlp.num_weights} weights')

# Pipelines

Pipelines are used to define how data will flow through the different neural networks. In this case, our data will flow through the BERT network and then the MLP network.

We also have different pipelines for training, validation, and inference data.  

For training data, we want it to be used for optimization so it must be shuffled and we also need to compute the loss.

For validation data, we won't use it for optimization but we want to know the loss.

And for inference data, we only want the final predictions coming from the model.

## Data Layers

In [None]:
# True: read the pre-tokenized HDF5 files; False: tokenize the TSV files on the fly.
use_preproc = True

# Batch size per supported checkpoint (the larger model gets the smaller batch).
# Looking it up fails fast with a clear message, instead of leaving `batch_size`
# undefined (or stale from an earlier run) when another model name is configured.
batch_size_by_model = {
    'bert-base-uncased': 256,
    'bert-large-uncased': 64,
}
if pretrained_bert_model not in batch_size_by_model:
    raise ValueError(
        f'No batch size configured for {pretrained_bert_model!r}; '
        f'expected one of {sorted(batch_size_by_model)}')
batch_size = batch_size_by_model[pretrained_bert_model]

# Training data is shuffled; validation data keeps its file order.
if use_preproc:
    train_data = preproc_data_layer.PreprocBertSentenceClassificationDataLayer(
        input_file=f'{data_dir}/preproc/train-sst-2_{pretrained_bert_model}_{max_seq_length}.hdf5',
        shuffle=True,
        num_samples=-1, # lower for dev, -1 for all dataset
        batch_size=batch_size
    )
    val_data = preproc_data_layer.PreprocBertSentenceClassificationDataLayer(
        input_file=f'{data_dir}/preproc/val-sst-2_{pretrained_bert_model}_{max_seq_length}.hdf5',
        shuffle=False,
        num_samples=-1, # lower for dev, -1 for all dataset
        batch_size=batch_size
    )
else:
    # NOTE(review): only BertSentenceClassificationDataset and
    # PreprocBertSentenceClassificationDataLayer are imported from preproc_data_layer
    # at the top of this notebook — confirm the module also defines
    # BertSentenceClassificationDataLayer.
    train_data = preproc_data_layer.BertSentenceClassificationDataLayer(
        input_file=data_dir + '/my_train.tsv',
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        shuffle=True,
        num_samples=-1, # lower for dev, -1 for all dataset
        batch_size=batch_size
    )
    val_data = preproc_data_layer.BertSentenceClassificationDataLayer(
        input_file=data_dir + '/my_val.tsv',
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        shuffle=False,
        num_samples=-1, # lower for dev, -1 for all dataset
        batch_size=batch_size
    )

In [None]:
# Calling a data layer returns four outputs, unpacked here as
# input ids, token types, attention mask, and labels.
# NOTE(review): these look like graph handles rather than concrete tensors
# (values are obtained later through nf.infer) — confirm.
train_input, train_token_types, train_attn_mask, train_labels = train_data()
val_input, val_token_types, val_attn_mask, val_labels = val_data()

## BERT Embeddings

In [None]:
# Send the training and validation inputs through the same BERT module.
train_embeddings = bert(
    input_ids=train_input,
    token_type_ids=train_token_types,
    attention_mask=train_attn_mask,
)
val_embeddings = bert(
    input_ids=val_input,
    token_type_ids=val_token_types,
    attention_mask=val_attn_mask,
)

## Inspect BERT Embeddings

If we want to inspect the data as it flows through our neural factory we can use the .infer method.  This method will give us the tensors without performing any optimization.

In [None]:
val_input_tensors = nf.infer(tensors=[val_input])

In [None]:
print(val_input_tensors[0][0][0].shape)

In [None]:
print(val_input_tensors[0][0][0])

In [None]:
%%time
# Compute the BERT embeddings for the validation data without any optimization;
# %%time reports how long the forward passes take.
val_embeddings_tensors = nf.infer(tensors=[val_embeddings])

In [None]:
# each word is embedded into bert_hidden_size space
# max_seq_len words are embedded
print(val_embeddings_tensors[0][0][0].shape)

In [None]:
val_embeddings_tensors[0][0].shape

In [None]:
val_embeddings_tensors[0][0][0][:].shape

In [None]:
print(val_embeddings_tensors[0][0][1][:, 0])

## Understanding and Visualizing BERT Embeddings

We are going to look at the BERT embeddings for the words (1-word sentences) in "data/positive_negative.tsv". Since the BERT embeddings are 768-dimensional for BERT base and 1024-dimensional for BERT large, we'll first apply t-SNE to reduce the embeddings to two dimensions.

In [None]:
# Load the tab-separated word list and show its 'sentence' column.
spectrum_df = pd.read_csv('data/positive_negative.tsv', sep='\t')
print(spectrum_df['sentence'].values)

In [None]:
# Data layer over the positive/negative word list.
# The words are tokenized on the fly with the same tokenizer and max_seq_length
# as the SST-2 data, and kept in file order (shuffle=False).
spectrum_data = nemo_nlp.BertSentenceClassificationDataLayer(
    input_file='data/positive_negative.tsv',
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    shuffle=False,
    num_samples=-1, # lower for dev, -1 for all dataset
    batch_size=batch_size,
    dataset_type=preproc_data_layer.BertSentenceClassificationDataset
)

In [None]:
# Unpack the data layer's four outputs: input ids, token types, attention mask, labels.
spectrum_input, spectrum_token_types, spectrum_attn_mask, spectrum_labels = spectrum_data()

In [None]:
# Embed the word list with the same BERT module used for the SST-2 data.
spectrum_embeddings = bert(
    input_ids=spectrum_input,
    token_type_ids=spectrum_token_types,
    attention_mask=spectrum_attn_mask,
)

In [None]:
spectrum_embeddings_tensors = nf.infer(tensors=[spectrum_embeddings])

In [None]:
spectrum_embeddings_tensors[0][0].shape

In [None]:
spectrum_embeddings_tensors[0][0][:,0,:].shape

In [None]:
# Render the first-position embedding of every sentence in the first batch as an image.
# NOTE(review): figsize is given in inches, so (100,100) is a very large canvas — confirm this is intended.
plt.figure(figsize=(100,100))
plt.imshow(spectrum_embeddings_tensors[0][0][:,0,:].numpy())

In [None]:

# Take the first-position embedding of every sentence and reduce it to 2-D with t-SNE.
spectrum_activations = spectrum_embeddings_tensors[0][0][:,0,:].numpy()
tsne_model = TSNE(n_components=2, perplexity=10, verbose=1, learning_rate=2,
                  random_state=123)
tsne_spectrum = tsne_model.fit_transform(spectrum_activations)

fig = plt.figure(figsize=(10,10))
# The first 11 rows are drawn as red crosses, the remainder as blue dots.
# NOTE(review): presumably the file lists one sentiment class first — confirm the 11-row split.
plt.plot(tsne_spectrum[0:11, 0], tsne_spectrum[0:11, 1], 'rx')
plt.plot(tsne_spectrum[11:, 0], tsne_spectrum[11:, 1], 'bo')

# Label every point with its word, centered 10 points above the marker.
spectrum_words = spectrum_df.sentence.values.tolist()
for label, (x, y) in zip(spectrum_words, tsne_spectrum):
    plt.annotate(label,
                 (x, y),
                 textcoords="offset points",
                 xytext=(0,10),
                 ha='center')

## Training Pipeline 

In order to optimize our network, we need to pass the embeddings through the MLP network and then compute the loss.

In [None]:
# Classification head on top of the BERT embeddings, for both splits
train_logits = mlp(hidden_states=train_embeddings)
val_logits = mlp(hidden_states=val_embeddings)

# Cross-entropy between the logits and the labels from the data layers
train_loss = loss(logits=train_logits, labels=train_labels)
val_loss = loss(logits=val_logits, labels=val_labels)

## Callbacks

Callbacks are used to record and log metrics and save checkpoints for the training and evaluation. We use callbacks to print to screen and also to tensorboard.




In [None]:
num_epochs = 3

num_gpus = 1

train_data_size = len(train_data)

# Optimizer steps needed for one pass over the training data
steps_per_epoch = math.ceil(train_data_size / (batch_size * num_gpus))

# Logs the training loss (rounded to 3 decimals) through the factory logger
# and writes it to TensorBoard as "train_loss"; step_freq=1 is the logging frequency.
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[train_loss, train_logits],
    print_func=lambda x: nf.logger.info(f'Train loss: {str(np.round(x[0].item(), 3))}'),
    tb_writer=nf.tb_writer,
    get_tb_values=lambda x: [["train_loss", x[0]]],
    step_freq=1)

# Evaluates on the validation logits/labels using the imported sentence
# classification callbacks; the epochs-done callback is given '<work_dir>/graphs'.
# eval_step equals steps_per_epoch, i.e. presumably one evaluation per epoch — confirm.
eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=[val_logits, val_labels],
    user_iter_callback=lambda x, y: eval_iter_callback(
        x, y, val_data),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(
        x, f'{nf.work_dir}/graphs'),
    tb_writer=nf.tb_writer,
    eval_epoch=1,
    eval_step=steps_per_epoch)

# Create callback to save checkpoints into the factory's checkpoint folder.
# checkpoints_to_keep is set to num_epochs; with epoch_freq=1 that presumably
# retains one checkpoint per epoch — confirm.
ckpt_callback = nemo.core.CheckpointCallback(
    folder=nf.checkpoint_dir,
    epoch_freq=1,
    step_freq=-1,
    checkpoints_to_keep=num_epochs)

In [None]:
# 'WarmupAnnealing' learning-rate policy spanning the whole run
# (num_epochs * steps_per_epoch steps), with warmup_ratio=0.1.
lr_policy_fn = get_lr_policy('WarmupAnnealing',
                             total_steps=num_epochs * steps_per_epoch,
                             warmup_ratio=0.1)

In [None]:
%%time
# Optimize the training loss with Adam (lr 5e-5) for num_epochs epochs,
# running the logging, evaluation and checkpoint callbacks along the way.
nf.train(tensors_to_optimize=[train_loss],
         callbacks=[train_callback, eval_callback, ckpt_callback],
         lr_policy=lr_policy_fn,
         optimizer='adam',
         optimization_params={'num_epochs': num_epochs, 'lr': 5e-5})

In [None]:
print(tensorboard_dir)

In [None]:
# # command for distributed training (evaluates on the held-out validation file)
# time python -m torch.distributed.launch --nproc_per_node=2 sentence_classification.py \
# --train_file {data_dir}/preproc/train-sst-2_{pretrained_bert_model}_{max_seq_length}.hdf5 \
# --eval_file {data_dir}/preproc/val-sst-2_{pretrained_bert_model}_{max_seq_length}.hdf5 \
# --num_gpus 2 \
# --batch_size {batch_size} \
# --amp_opt_level O1 \
# --work_dir distributed_logs_{pretrained_bert_model} \
# --mode train \
# --num_classes 2 \
# --num_samples -1 \
# --num_epochs 1 \
# --preproc