In [None]:
import nemo
import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.data.datasets import BertTextClassificationDataset
from nemo.collections.nlp.nm.data_layers.text_classification_datalayer import BertTextClassificationDataLayer
from nemo.collections.nlp.nm.trainables import SequenceClassifier

from nemo.backends.pytorch.common import CrossEntropyLossNM
from nemo.utils.lr_policies import get_lr_policy
from nemo.collections.nlp.callbacks.text_classification_callback import eval_iter_callback, eval_epochs_done_callback

import os
import json
import math
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = -1

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
%matplotlib inline

import torch

## Data Explore

The SST-2 dataset https://nlp.stanford.edu/sentiment/index.html is a standard benchmark for sentence classification and is part of the GLUE Benchmark: https://gluebenchmark.com/tasks. Please download and unzip the SST-2 dataset from GLUE.

In [None]:
# Directory handed to the NeuralModuleFactory as its log_dir.
WORK_DIR = 'logs'
# Folder holding the unzipped SST-2 files (train.tsv / test.tsv are read from here).
DATA_DIR = 'data/SST-2'

# To use mixed precision, set AMP_OPTIMIZATION_LEVEL to 'O1' or 'O2',
# to train without mixed precision, set it to 'O0'.
AMP_OPTIMIZATION_LEVEL = 'O1'
# Model name used for both the pretrained BERT encoder and its tokenizer.
PRETRAINED_BERT_MODEL = 'bert-base-uncased'
MAX_SEQ_LEN = 64 # we will pad with 0's shorter sentences and truncate longer
BATCH_SIZE = 256 # 64 for 'bert-large-uncased'

In [None]:
# Read the labeled train file and the unlabeled test file (both tab-separated).
df, test_df = (pd.read_csv(f'{DATA_DIR}/{split}.tsv', sep='\t')
               for split in ('train', 'test'))

In [None]:
# Preview the first rows of the labeled training data.
df.head()

In [None]:
# Preview the first rows of the unlabeled test data.
test_df.head()

The dataset comes with a train file (labeled) and a test file (not labeled). We will use part of the train file for model validation.

In [None]:
# Randomly split the labeled data: each row goes to train with probability 0.8,
# otherwise to validation. The fixed seed makes the split repeatable.
np.random.seed(123)
train_mask = np.random.rand(len(df)) < .8
train_df, val_df = df[train_mask], df[~train_mask]

In order to take advantage of NeMo's pre-built sentence classification data layer, the data should be formatted as "sentence\tlabel" (sentence tab label).

In [None]:
# The test file has no labels, but the data layer expects "sentence\tlabel" rows.
# We will add a label column with all 0's (but they will not be used for anything).
test_df['label'] = 0

In [None]:
# Keep only the two columns the data layer needs, in sentence-then-label order.
test_df = test_df[['sentence', 'label']]

In [None]:
# Check that the test frame now has the sentence/label layout.
test_df.head()

In [None]:
# Write the three splits as tab-separated files under DATA_DIR/split.
# The validation frame is stored under the name 'eval.tsv'.
SPLIT_DATA_DIR = os.path.join(DATA_DIR, 'split')
os.makedirs(SPLIT_DATA_DIR, exist_ok=True)

for split_name, frame in (('train', train_df), ('eval', val_df), ('test', test_df)):
    frame.to_csv(os.path.join(SPLIT_DATA_DIR, f'{split_name}.tsv'), sep='\t', index=False)

## Neural Modules

In NeMo, everything is a Neural Module. Neural modules abstract data and neural network architectures. Whereas a deep learning framework like PyTorch or TensorFlow is used to combine neural network layers to create a neural network, 
NeMo is used to combine data and neural networks to create AI applications.
The Neural Module Factory will then manage the neural modules, taking care to flow data through the neural modules, and is also responsible for training (including mixed precision and distributed), logging, and inference.

In [None]:
# instantiate the neural module factory
# Later cells use it for inference (nf.infer) and training (nf.train), and read
# nf.tb_writer, nf.work_dir and nf.checkpoint_dir from it. Logs go to WORK_DIR
# and the mixed-precision level comes from AMP_OPTIMIZATION_LEVEL.
nf = nemo.core.NeuralModuleFactory(log_dir=WORK_DIR,
                                   create_tb_writer=True,
                                   add_time_to_log_dir=False,
                                   optimization_level=AMP_OPTIMIZATION_LEVEL)

Pre-trained models will be automatically downloaded and cached.

In [None]:
# Pre-trained BERT
# The encoder and the tokenizer are both created from the same model name.
bert = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)
tokenizer = nemo_nlp.data.NemoBertTokenizer(PRETRAINED_BERT_MODEL)

Note here that the BERT models we are working with are massive. This gives our models a large capacity for learning that is needed to understand the nuance and complexity of natural language.

In [None]:
# Report the encoder's weight count (its num_weights attribute).
print(f'{PRETRAINED_BERT_MODEL} has {bert.num_weights} weights')

Here we define and instantiate the feed forward network that takes as input our BERT embeddings. This network will be used to output the sentence classifications.

In [None]:
# mlp classifier
# Its input width is read from the BERT module so the two always match.
bert_hidden_size = bert.local_parameters['hidden_size']

# Two-class head on top of the BERT output. With log_softmax=False it presumably
# emits raw logits (later cells apply softmax themselves) — TODO confirm.
mlp = SequenceClassifier(hidden_size=bert_hidden_size, 
                         num_classes=2,
                         num_layers=2,
                         log_softmax=False,
                         dropout=0.1)

# Loss module applied to the classifier logits and the labels.
loss = CrossEntropyLossNM()

In [None]:
# Compared to the BERT model, the MLP is tiny; print its weight count for comparison.
print(f'MLP has {mlp.num_weights} weights')

# Pipelines

Pipelines are used to define how data will flow through the different neural networks. In this case, our data will flow through the BERT network and then the MLP network.

We also have different pipelines for training, validation, and inference data.  

For training data, we want it to be used for optimization so it must be shuffled and we also need to compute the loss.

For validation data, we won't use it for optimization but we want to know the loss.

And for inference data, we only want the final predictions coming from the model.

## Data Layers

We can gain a lot of efficiency by saving the tokenized data to disk. For future model runs we then don't need to tokenize every time.

In [None]:
# Reuse tokenized data cached on disk across runs when True.
USE_CACHE = True

# Arguments common to the training and validation data layers.
shared_layer_args = dict(tokenizer=tokenizer,
                         max_seq_length=MAX_SEQ_LEN,
                         batch_size=BATCH_SIZE,
                         use_cache=USE_CACHE)

# Only the training layer asks for shuffling.
train_data = BertTextClassificationDataLayer(
    input_file=os.path.join(SPLIT_DATA_DIR, 'train.tsv'),
    shuffle=True,
    **shared_layer_args)

val_data = BertTextClassificationDataLayer(
    input_file=os.path.join(SPLIT_DATA_DIR, 'eval.tsv'),
    **shared_layer_args)

In [None]:
# Calling a data layer yields its four outputs: token ids, token-type ids,
# attention mask and labels. Values are only produced when nf.infer / nf.train runs.
train_input, train_token_types, train_attn_mask, train_labels = train_data()
val_input, val_token_types, val_attn_mask, val_labels = val_data()

## BERT Embeddings

In [None]:
# Route the train and validation inputs through the same BERT module.
train_embeddings = bert(input_ids=train_input,
                        token_type_ids=train_token_types,
                        attention_mask=train_attn_mask)
val_embeddings = bert(input_ids=val_input,
                      token_type_ids=val_token_types,
                      attention_mask=val_attn_mask)

## Inspect BERT Embeddings

If we want to inspect the data as it flows through our neural factory we can use the .infer method.  This method will give us the tensors without performing any optimization.

In [None]:
# Evaluate the validation token ids without any optimization step.
val_input_tensors = nf.infer(tensors=[val_input])

In [None]:
# NOTE(review): assumes the result is indexed [requested tensor][batch][row],
# so this prints the token ids of the first sentence of the first batch — confirm.
print(val_input_tensors[0][0][0])

In [None]:
%%time
# Run BERT over the validation set and time it.
val_embeddings_tensors = nf.infer(tensors=[val_embeddings])

In [None]:
# each word is embedded into bert_hidden_size space
# shape: BATCH_SIZE * MAX_SEQ_LEN * bert_hidden_size
# Shown here for the first batch of the first (only) requested tensor.
val_embeddings_tensors[0][0].shape

In [None]:
# Second sentence of the first batch: hidden dimension 0 at every token position
# (given the BATCH_SIZE * MAX_SEQ_LEN * bert_hidden_size layout noted above).
print(val_embeddings_tensors[0][0][1][:, 0])

## Understanding and Visualizing BERT Embeddings

We are going to look at the BERT embeddings for the words and short phrases (each treated as its own sentence) in "SPLIT_DATA_DIR/positive_negative.tsv". Since the BERT embeddings are 768 dimensional for BERT base and 1024 dimensional for BERT large, we'll first apply t-SNE and reduce the embeddings to two dimensions.

In [None]:
# Words and short phrases for the sentiment spectrum. Order matters: the plotting
# cells draw the first 11 entries (negative) with one marker and the rest
# (positive) with another.
# Every entry needs its trailing comma. Without one, Python joins adjacent string
# literals into a single item (e.g. 'awesome' 'fantastic' -> 'awesomefantastic').
spectrum_words = ['abysmal', 'appalling', 'dreadful', 'awful', 'terrible',
                  'very bad', 'really bad', 'rubbish', 'unsatisfactory',
                  'bad', 'poor', 'great', 'really good', 'very good', 'awesome',
                  'fantastic', 'superb', 'brilliant', 'incredible', 'excellent',
                  'outstanding', 'perfect']

# Write them as a "sentence\tlabel" file; the label is a placeholder 0.
spectrum_file = os.path.join(SPLIT_DATA_DIR, 'positive_negative.tsv')
with open(spectrum_file, 'w') as f:
    f.write('sentence\tlabel')
    for word in spectrum_words:
        f.write('\n' + word + '\t0')

In [None]:
# Read the file back to check its contents; spectrum_df.sentence is reused
# later as the point labels in the t-SNE plots.
spectrum_df = pd.read_csv(spectrum_file, delimiter='\t')
print(spectrum_df.head())

In [None]:
# Data layer over the positive/negative spectrum file (no shuffle argument and
# no cache requested here).
spectrum_data = BertTextClassificationDataLayer(input_file=spectrum_file,
                                                tokenizer=tokenizer,
                                                max_seq_length=MAX_SEQ_LEN,
                                                batch_size=BATCH_SIZE)

In [None]:
# Outputs of the spectrum data layer; the placeholder labels are not used later.
spectrum_input, spectrum_token_types, spectrum_attn_mask, spectrum_labels = spectrum_data()

In [None]:
# BERT pass over the spectrum entries (run before any fine-tuning in this notebook).
spectrum_embeddings = bert(input_ids=spectrum_input,
                           token_type_ids=spectrum_token_types,
                           attention_mask=spectrum_attn_mask)

In [None]:
# Evaluate the spectrum embeddings without any optimization step.
spectrum_embeddings_tensors = nf.infer(tensors=[spectrum_embeddings])

In [None]:
# Shape of the first batch of spectrum embeddings.
spectrum_embeddings_tensors[0][0].shape

In [None]:
# Heat map of the embedding at token position 0 for every entry, one row per entry.
# NOTE(review): position 0 is presumably the [CLS] token — confirm.
# NOTE(review): figsize is in inches, so (100, 100) is a very large canvas.
plt.figure(figsize=(100,100))
plt.imshow(spectrum_embeddings_tensors[0][0][:,0,:].numpy())

In [None]:
# Reduce the position-0 embedding of every spectrum entry to two dimensions.
spectrum_activations = spectrum_embeddings_tensors[0][0][:, 0, :].numpy()
tsne_model = TSNE(n_components=2, perplexity=10, verbose=1, learning_rate=2,
                  random_state=123)
tsne_spectrum = tsne_model.fit_transform(spectrum_activations)

# First 11 rows are drawn as red crosses, the remaining rows as blue dots.
fig = plt.figure(figsize=(10, 10))
negative_xy, positive_xy = tsne_spectrum[:11], tsne_spectrum[11:]
plt.plot(negative_xy[:, 0], negative_xy[:, 1], 'rx')
plt.plot(positive_xy[:, 0], positive_xy[:, 1], 'bo')

# Label each point with its text, centred 10 points above the marker.
for label, (x, y) in zip(spectrum_df.sentence.values.tolist(), tsne_spectrum):
    plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 10), ha='center')

## Training Pipeline 

In order to optimize our network, we need to pass the embeddings through the MLP network and then compute the loss.

In [None]:
# Classifier head on top of the BERT output, for both train and validation.
train_logits = mlp(hidden_states=train_embeddings)
val_logits = mlp(hidden_states=val_embeddings)

# Loss of each pipeline; only train_loss is passed to nf.train for optimization.
train_loss = loss(logits=train_logits, labels=train_labels)
val_loss = loss(logits=val_logits, labels=val_labels)

## Callbacks

Callbacks are used to record and log metrics and save checkpoints for the training and evaluation. We use callbacks to print to screen and also to tensorboard.




In [None]:
# Training hyper-parameters.
NUM_EPOCHS = 3
NUM_GPUS = 1
LEARNING_RATE = 5e-5
OPTIMIZER = 'adam'

# Number of optimizer steps needed to see the whole training set once.
train_data_size = len(train_data)
steps_per_epoch = math.ceil(train_data_size / (BATCH_SIZE * NUM_GPUS))

# Logs the training loss every steps_per_epoch steps, to the NeMo logger and to
# the factory's tb_writer. train_logits is passed in, but both lambdas only read
# x[0] (the loss).
train_callback = nemo.core.SimpleLossLoggerCallback(tensors=[train_loss, train_logits],
                            print_func=lambda x:nemo.logging.info(f'Train loss: {str(np.round(x[0].item(), 3))}'),
                            tb_writer=nf.tb_writer,
                            get_tb_values=lambda x: [["train_loss", x[0]]],
                            step_freq=steps_per_epoch)

# Evaluates on the validation pipeline every steps_per_epoch steps, using NeMo's
# text-classification callbacks; the second argument of the epochs-done callback
# is a path under the factory's work_dir.
eval_callback = nemo.core.EvaluatorCallback(eval_tensors=[val_logits, val_labels],
                                            user_iter_callback=lambda x, y: eval_iter_callback(x, y, val_data),
                                            user_epochs_done_callback=lambda x:
                                                eval_epochs_done_callback(x, f'{nf.work_dir}/graphs'),
                                            tb_writer=nf.tb_writer,
                                            eval_step=steps_per_epoch)

# Create callback to save checkpoints
ckpt_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir,
                                             epoch_freq=1)

In [None]:
# 'WarmupAnnealing' learning-rate policy spanning every training step.
# NOTE(review): warmup_ratio=0.1 presumably means the first 10% of steps warm up — confirm.
lr_policy_fn = get_lr_policy('WarmupAnnealing',
                             total_steps=NUM_EPOCHS * steps_per_epoch,
                             warmup_ratio=0.1)

In [None]:
%%time
# Fine-tune by minimizing train_loss; the three callbacks handle loss logging,
# validation and checkpointing while training runs.
nf.train(tensors_to_optimize=[train_loss],
         callbacks=[train_callback, eval_callback, ckpt_callback],
         lr_policy=lr_policy_fn,
         optimizer=OPTIMIZER,
         optimization_params={'num_epochs': NUM_EPOCHS, 'lr': LEARNING_RATE})

## Multi-GPU Training

RESTART KERNEL BEFORE RUNNING THE MULTI-GPU TRAINING

In [None]:
%%time
num_gpus = 4
!python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS text_classification_with_bert.py \
--pretrained_model_name $PRETRAINED_BERT_MODEL \
--data_dir $SPLIT_DATA_DIR \
--dataset_name 'sst-2' \
--train_file_prefix 'train' \
--eval_file_prefix 'eval' \
--use_cache \
--batch_size 64 \
--max_seq_length 64 \
--num_gpus $NUM_GPUS \
--num_epochs $NUM_EPOCHS \
--amp_opt_level $AMP_OPTIMIZATION_LEVEL \
--work_dir $WORK_DIR

## Inference Pipeline

For inference we reuse the same neural modules, which now carry the weights learned during training.

In [None]:
# Data layer over the unlabeled test split (no shuffle argument, no cache requested).
test_data = BertTextClassificationDataLayer(input_file=os.path.join(SPLIT_DATA_DIR, 'test.tsv'),
                                            tokenizer=tokenizer,
                                            max_seq_length=MAX_SEQ_LEN,
                                            batch_size=BATCH_SIZE)

In [None]:
# Inference pipeline: data layer -> BERT -> MLP. The placeholder labels are
# discarded and no loss is computed.
test_input, test_token_types, test_attn_mask, _ = test_data()
test_embeddings = bert(input_ids=test_input,
                        token_type_ids=test_token_types,
                        attention_mask=test_attn_mask)
test_logits = mlp(hidden_states=test_embeddings)

In [None]:
%%time
# Run the inference pipeline over the whole test split and time it.
test_logits_tensors = nf.infer(tensors=[test_logits])

In [None]:
# Join the per-batch logits, turn each row into class probabilities and keep the
# probability of class index 1. dim=1 is passed explicitly (softmax across the
# class axis): relying on the implicit dimension is deprecated and warns.
test_probs = torch.nn.functional.softmax(torch.cat(test_logits_tensors[0]), dim=1).numpy()[:, 1]

In [None]:
# Reload the test split exactly as it was written to disk.
test_df = pd.read_csv(os.path.join(SPLIT_DATA_DIR, 'test.tsv'), sep='\t')

In [None]:
# Attach the predicted probabilities and save them next to the other splits.
# NOTE(review): assumes the data layer returned rows in file order (test_data was
# created without a shuffle argument) — confirm.
test_df['prob'] = test_probs 
inference_file = os.path.join(SPLIT_DATA_DIR, 'test_inference.tsv')
test_df.to_csv(inference_file, sep='\t', index=False)

In [None]:
def sample_classification(data_path):
    """Pick one random row from a predictions file and format it for display.

    Args:
        data_path: path to a tab-separated file with ``sentence`` and ``prob`` columns.

    Returns:
        The string ``"<sentence> | <prob>"`` for the sampled row.
    """
    picked = pd.read_csv(data_path, sep='\t').sample()
    text, score = picked['sentence'].values[0], picked['prob'].values[0]
    return f'{text} | {score}'

In [None]:
# Print a handful of randomly chosen test sentences with their predicted probability.
num_samples = 10
for _ in range(num_samples):
    print(sample_classification(inference_file))

## Inference Results:
the film is just a big , gorgeous , mind-blowing , breath-taking mess . | 0.2738656

a sensual performance from abbass buoys the flimsy story , but her inner journey is largely unexplored and we 're left wondering about this exotic-looking woman whose emotional depths are only hinted at . | 0.48260054

## Single sentence classification

In [None]:
def classify_sentence(nf, tokenizer, bert, mlp, sentence):
    """Print the class-1 probability the model assigns to a single sentence.

    The sentence is lower-cased and written to a throwaway one-row
    "sentence\\tlabel" file, because the data layer reads its input from a
    file. It is then passed through BERT and the MLP classifier, and the
    softmax probability of class index 1 is printed as "<sentence> | <prob>".
    Class 1 is presumably the positive label, as in SST-2 — confirm.

    Args:
        nf: neural module factory whose ``infer`` method evaluates the pipeline.
        tokenizer: tokenizer handed to the data layer.
        bert: BERT encoder module.
        mlp: classifier module applied to the BERT output.
        sentence: raw text to classify.

    Returns:
        None. The result is printed.
    """
    import tempfile  # local import keeps this cell self-contained

    sentence = sentence.lower()

    # Use a unique temporary file rather than a fixed /tmp path, so the function
    # also works where /tmp does not exist and concurrent calls cannot collide.
    with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as tmp_tsv:
        tmp_file = tmp_tsv.name
        tmp_tsv.write('sentence\tlabel\n')
        tmp_tsv.write(sentence + '\t0\n')  # label 0 is a placeholder

    try:
        # NOTE(review): max_seq_length is 128 here, while the rest of the
        # notebook uses MAX_SEQ_LEN.
        tmp_data = BertTextClassificationDataLayer(input_file=tmp_file,
                                                   tokenizer=tokenizer,
                                                   max_seq_length=128,
                                                   batch_size=1)

        tmp_input, tmp_token_types, tmp_attn_mask, _ = tmp_data()
        tmp_embeddings = bert(input_ids=tmp_input,
                              token_type_ids=tmp_token_types,
                              attention_mask=tmp_attn_mask)
        tmp_logits = mlp(hidden_states=tmp_embeddings)

        # Only the logits are needed, so only they are requested from infer.
        tmp_logits_tensors = nf.infer(tensors=[tmp_logits])
        # dim=1: softmax across the class axis (the implicit dim is deprecated).
        tmp_probs = torch.nn.functional.softmax(torch.cat(tmp_logits_tensors[0]), dim=1).numpy()[:, 1]
    finally:
        # Remove the scratch file whether or not inference succeeded.
        os.remove(tmp_file)

    print(f'{sentence} | {tmp_probs[0]}')

In [None]:
# A few hand-written reviews to try the fine-tuned classifier on.
sentences = ['point break is the best movie of all time',
             'the movie was a wonderful exercise in understanding the struggles of native americans',
             'the performance of diego luna had me excited and annoyed at the same time',
             'matt damon is the only good thing about this film']

# Each call prints "<sentence> | <probability>".
for sentence in sentences:
    classify_sentence(nf, tokenizer, bert, mlp, sentence)

## Understanding and Visualizing BERT Embeddings

Now that we've fine-tuned our BERT model, let's see if the word embeddings have changed.

In [None]:
# Re-run the spectrum entries through BERT, which has been fine-tuned by now.
spectrum_embeddings = bert(input_ids=spectrum_input,
                           token_type_ids=spectrum_token_types,
                           attention_mask=spectrum_attn_mask)

spectrum_embeddings_tensors = nf.infer(tensors=[spectrum_embeddings])

# Heat map of the position-0 embedding of every entry, as in the earlier plot.
plt.figure(figsize=(100,100))
plt.imshow(spectrum_embeddings_tensors[0][0][:,0,:].numpy())

In [None]:
# Reduce the position-0 embedding of every spectrum entry to two dimensions,
# using the same t-SNE settings and seed as the earlier plot.
spectrum_activations = spectrum_embeddings_tensors[0][0][:, 0, :].numpy()
tsne_model = TSNE(n_components=2, perplexity=10, verbose=1, learning_rate=2,
                  random_state=123)
tsne_spectrum = tsne_model.fit_transform(spectrum_activations)

# First 11 rows are drawn as red crosses, the remaining rows as blue dots.
fig = plt.figure(figsize=(10, 10))
negative_xy, positive_xy = tsne_spectrum[:11], tsne_spectrum[11:]
plt.plot(negative_xy[:, 0], negative_xy[:, 1], 'rx')
plt.plot(positive_xy[:, 0], positive_xy[:, 1], 'bo')

# Label each point with its text, centred 10 points above the marker.
for label, (x, y) in zip(spectrum_df.sentence.values.tolist(), tsne_spectrum):
    plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 10), ha='center')