In [2]:
!pip install mxboard
!pip install gluonnlp --pre

Collecting mxboard
  Using cached https://files.pythonhosted.org/packages/3e/e4/57f6884c39b471c8fd446dc59998045ceab1c9ebe4a6091c953d97a60934/mxboard-0.1.0-py3-none-any.whl
Installing collected packages: mxboard
Successfully installed mxboard-0.1.0


# Imports

In [3]:
import random
import time
import multiprocessing as mp
import numpy as np

import mxnet as mx
from mxnet import nd, gluon, autograd

import gluonnlp as nlp

# Training a Sentiment Analysis Model 

## Objective
- Using the preprocessing we performed previously we can then use pretrained models to train a sentiment analysis on IMDB dataset

# Hyperparameters & Environment

In [4]:
random.seed(123)
np.random.seed(123)
mx.random.seed(123)
max_gpus = 8
num_gpus = min(max_gpus, mx.context.num_gpus())
dropout = 0
language_model_name = 'standard_lstm_lm_200'
pretrained = True
learning_rate = 0.005 * num_gpus
batch_size = 16 * num_gpus
bucket_num = 10
bucket_ratio = 0.2
epochs = 1
grad_clip = None
log_interval = 100


ctx = [mx.gpu(i) for i in range(num_gpus)]

# Building the Model
![](https://gluon-nlp.mxnet.io/_images/samodel-v3.png)


In [5]:

######################################### Model #########################################
class SentimentNet(gluon.Block):
    def __init__(self, embedding_block, encoder_block, dropout,
                 prefix=None, params=None):
        super(SentimentNet, self).__init__(prefix=prefix, params=params)
        with self.name_scope():
            self.embedding = embedding_block
            self.encoder = encoder_block
            self.out_layer = gluon.nn.HybridSequential()
            with self.out_layer.name_scope():
                self.out_layer.add(gluon.nn.Dropout(dropout))
                self.out_layer.add(gluon.nn.Dense(1, flatten=False))

    def forward(self, data, valid_length):
        encoded = self.encoder(nd.Dropout(self.embedding(data),
                                          0.2, axes=(0,)))  # Shape(T, N, C)
        # Zero out the values with position exceeding the valid length.
        masked_encoded = nd.SequenceMask(encoded,
                                         sequence_length=valid_length,
                                         use_sequence_length=True)
        agg_state = nd.broadcast_div(nd.sum(masked_encoded, axis=0),
                                     nd.expand_dims(valid_length, axis=1))
        out = self.out_layer(agg_state)
        return out


- ```python mx.nd.array.SequenceMask(data, sequence_length, use_sequence_length, value=, out, name, **kwargs)```:
Sets all elements outside the sequence to a constant value.
- ```python mxnet.ndarray.broadcast_div(lhs, rhs, out, name, **kwargs)```: Returns element-wise division of the input arrays with broadcasting.


In [6]:
# Model
lm_model, vocab = nlp.model.get_model(name=language_model_name,
                                      dataset_name='wikitext-2',
                                      pretrained=pretrained,
                                      ctx=ctx,
                                      dropout=dropout)
net = SentimentNet(embedding_block=lm_model.embedding,
                   encoder_block=lm_model.encoder,
                   dropout=dropout)
net.out_layer.initialize(mx.init.Xavier(), ctx=ctx)
net.hybridize()
print("\n####################### MODEL #######################\n")
print(net)




####################### MODEL #######################

SentimentNet(
  (embedding): HybridSequential(
    (0): Embedding(33278 -> 200, float32)
  )
  (encoder): LSTM(200 -> 200, TNC, num_layers=2)
  (out_layer): HybridSequential(
    (0): Dropout(p = 0, axes=())
    (1): Dense(None -> 1, linear)
  )
)


# Data Pipeline

In [12]:
######################################### Data #########################################
# train_dataset and test_dataset are both SimpleDataset objects,
# which is a wrapper for lists and arrays.
train_dataset, test_dataset = [nlp.data.IMDB(segment=segment)
                               for segment in ('train', 'test')]
print("Tokenize using spaCy...")
# tokenizer takes as input a string and outputs a list of tokens.
tokenizer = nlp.data.SpacyTokenizer('en')
# length_clip takes as input a list and outputs a list with maximum length 500.
length_clip = nlp.data.ClipSequence(500)

def preprocess(x):
    data, label = x
    # In the labeled train/test sets, a negative review has a score <= 4
    # out of 10, and a positive review has a score >= 7 out of 10. Thus
    # reviews with more neutral ratings are not included in the train/test
    # sets. We labeled a negative review whose score <= 4 as 0, and a
    # positive review whose score >= 7 as 1. As the neural ratings are not
    # included in the datasets, we can simply use 5 as our threshold.
    label = int(label > 5)
    # A token index or a list of token indices is
    # returned according to the vocabulary.
    data = vocab[length_clip(tokenizer(data))]
    return data, label, float(len(data))

Tokenize using spaCy...


In [13]:
def get_length(x):
    return x[2]

def preprocess_dataset(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
        lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
    end = time.time()
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'
          .format(end - start, len(dataset)))
    return dataset, lengths

In [14]:
#Data
train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
test_dataset, test_data_lengths = preprocess_dataset(test_dataset)
print("\n####################### DATA #######################\n")
print(train_dataset[0])


Done! Tokenizing Time=6.09s, #Sentences=25000
Done! Tokenizing Time=6.08s, #Sentences=25000

####################### DATA #######################

([0, 1409, 24, 9, 13139, 1717, 4, 61, 1117, 27, 2, 152, 62, 16, 98, 64, 2170, 80, 391, 180, 3, 91, 16, 11, 21820, 11, 4, 1196, 1230, 104, 7, 2, 2632, 13392, 409, 706, 8, 1907, 15, 0, 1409, 17, 7922, 24, 217, 3430, 8, 2564, 84, 24, 11, 21820, 11, 4, 13, 20391, 8, 3660, 7863, 3, 2, 0, 914, 55, 110, 594, 455, 109, 38, 18190, 5232, 57, 0, 3, 2, 0, 5, 2, 1147, 1309, 3, 68, 16731, 706, 5, 2, 1206, 70, 2561, 6, 38, 914, 4, 274, 70, 693, 2, 138, 7, 33, 9, 1838, 5718, 1650, 8, 3685, 197, 2, 391, 3, 70, 1297, 3163, 0, 27, 0, 1409, 4, 76, 3473, 220, 43, 0, 43, 70, 1921, 1914, 8, 0, 49, 5, 1903, 5232, 4, 0, 43, 17622, 8, 0, 1409, 4, 70, 9041, 15, 125, 3233, 5, 750, 507, 1229, 15, 0, 1409, 24, 754, 0, 4, 2493, 9, 18214, 15, 29, 24, 0, 365], 1, 165.0)


# Training

In [15]:
######################################### Train #########################################
def evaluate(net, dataloader, ctx):
    loss = gluon.loss.SigmoidBCELoss()
    total_L = 0.0
    total_sample_num = 0
    total_correct_num = 0
    start_log_interval_time = time.time()
    print('Begin Testing...')
    for i, (data, label, valid_length) in enumerate(dataloader):
        data = mx.nd.transpose(data.as_in_context(ctx))
        valid_length = valid_length.as_in_context(ctx).astype(np.float32)
        label = label.as_in_context(ctx)
        output = net(data, valid_length)
        L = loss(output, label)
        pred = (output > 0.5).reshape(-1)
        total_L += L.sum().asscalar()
        total_sample_num += label.shape[0]
        total_correct_num += (pred == label).sum().asscalar()
        if (i + 1) % log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, len(dataloader),
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    acc = total_correct_num / float(total_sample_num)
    return avg_L, acc


# Data Loaders

```python
def train(net, ctx, epochs):
    trainer = gluon.Trainer(net.collect_params(),
                            'ftml',
                            {'learning_rate': learning_rate})
    loss = gluon.loss.SigmoidBCELoss()

    # Construct the DataLoader
    # Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0),
                                          nlp.data.batchify.Stack(dtype=np.float32),
                                          nlp.data.batchify.Stack(dtype=np.float32))

    batch_sampler = nlp.data.sampler.FixedBucketSampler(train_data_lengths,
                                                        batch_size=batch_size,
                                                        num_buckets=bucket_num,
                                                        ratio=bucket_ratio,
                                                        shuffle=True)
    print(batch_sampler.stats())
    train_dataloader = gluon.data.DataLoader(dataset=train_dataset,
                                             batch_sampler=batch_sampler,
                                             batchify_fn=batchify_fn)
    test_dataloader = gluon.data.DataLoader(dataset=test_dataset,
                                            batch_size=batch_size,
                                            shuffle=False,
                                            batchify_fn=batchify_fn)
    parameters = net.collect_params().values()
```

# Training Loop

```python
def train(net, ctx, epochs):
...
    # Training/Testing
    for epoch in range(epochs):
    ...
        for i, (data, label, length) in enumerate(train_dataloader):
            if data.shape[0] > len(ctx):
                # Multi-gpu training.
                data_list, label_list, length_list \
                    = [gluon.utils.split_and_load(x, ctx, batch_axis=0, even_split=False)
                                                           for x in [data, label, length]]
            else:
                data_list = [data.as_in_context(ctx[0])]
                label_list = [label.as_in_context(ctx[0])]
                length_list = [length.as_in_context(ctx[0])]
            ...
            for data, label, valid_length in zip(data_list, label_list, length_list):
                valid_length = valid_length
                with autograd.record():
                    output = net(data.T, valid_length)
                    L = L + loss(output, label).mean().as_in_context(ctx[0])
            L.backward()
            # Clip gradient
            if grad_clip:
                gluon.utils.clip_global_norm([p.grad(x.context)
                                              for p in parameters for x in data_list], grad_clip)
            # Update parameter
            trainer.step(1)
            ...
```

In [16]:
def train(net, ctx, epochs):
    trainer = gluon.Trainer(net.collect_params(),
                            'ftml',
                            {'learning_rate': learning_rate})
    loss = gluon.loss.SigmoidBCELoss()

    # Construct the DataLoader
    # Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0),
                                          nlp.data.batchify.Stack(dtype=np.float32),
                                          nlp.data.batchify.Stack(dtype=np.float32))

    batch_sampler = nlp.data.sampler.FixedBucketSampler(train_data_lengths,
                                                        batch_size=batch_size,
                                                        num_buckets=bucket_num,
                                                        ratio=bucket_ratio,
                                                        shuffle=True)
    print(batch_sampler.stats())
    train_dataloader = gluon.data.DataLoader(dataset=train_dataset,
                                             batch_sampler=batch_sampler,
                                             batchify_fn=batchify_fn)
    test_dataloader = gluon.data.DataLoader(dataset=test_dataset,
                                            batch_size=batch_size,
                                            shuffle=False,
                                            batchify_fn=batchify_fn)
    parameters = net.collect_params().values()

    # Training/Testing
    for epoch in range(epochs):
        # Epoch training stats
        start_epoch_time = time.time()
        epoch_L = 0.0
        epoch_sent_num = 0
        epoch_wc = 0
        # Log interval training stats
        start_log_interval_time = time.time()
        log_interval_wc = 0
        log_interval_sent_num = 0
        log_interval_L = 0.0

        for i, (data, label, length) in enumerate(train_dataloader):
            if data.shape[0] > len(ctx):
                # Multi-gpu training.
                data_list, label_list, length_list \
                    = [gluon.utils.split_and_load(x,
                                                  ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                       for x in [data, label, length]]
            else:
                data_list = [data.as_in_context(ctx[0])]
                label_list = [label.as_in_context(ctx[0])]
                length_list = [length.as_in_context(ctx[0])]
            L = 0
            wc = length.sum().asscalar()
            log_interval_wc += wc
            epoch_wc += wc
            log_interval_sent_num += data.shape[1]
            epoch_sent_num += data.shape[1]
            for data, label, valid_length in zip(data_list, label_list, length_list):
                valid_length = valid_length
                with autograd.record():
                    output = net(data.T, valid_length)
                    L = L + loss(output, label).mean().as_in_context(ctx[0])
            L.backward()
            # Clip gradient
            if grad_clip:
                gluon.utils.clip_global_norm([p.grad(x.context)
                                              for p in parameters for x in data_list],
                                             grad_clip)
            # Update parameter
            trainer.step(1)
            log_interval_L += L.asscalar()
            epoch_L += L.asscalar()
            if (i + 1) % log_interval == 0:
                print('[Epoch {} Batch {}/{}] elapsed {:.2f} s, \
                      avg loss {:.6f}, throughput {:.2f}K wps'.format(
                    epoch, i + 1, len(train_dataloader),
                           time.time() - start_log_interval_time,
                           log_interval_L / log_interval_sent_num,
                           log_interval_wc / 1000 / (time.time() - start_log_interval_time)))
                # Clear log interval training stats
                start_log_interval_time = time.time()
                log_interval_wc = 0
                log_interval_sent_num = 0
                log_interval_L = 0
        end_epoch_time = time.time()
        test_avg_L, test_acc = evaluate(net, test_dataloader, ctx[0])
        print('[Epoch {}] train avg loss {:.6f}, test acc {:.2f}, \
        test avg loss {:.6f}, throughput {:.2f}K wps'.format(
            epoch, epoch_L / epoch_sent_num,
            test_acc, test_avg_L, epoch_wc / 1000 /
                   (end_epoch_time - start_epoch_time)))

In [17]:
#Train
train(net, ctx, epochs)

FixedBucketSampler:
  sample_num=25000, batch_num=392
  key=[59, 108, 157, 206, 255, 304, 353, 402, 451, 500]
  cnt=[590, 1999, 5092, 5102, 3038, 2085, 1477, 1165, 870, 3582]
  batch_size=[108, 64, 64, 64, 64, 64, 64, 64, 64, 64]
[Epoch 0 Batch 100/392] elapsed 5.31 s,                       avg loss 0.008419, throughput 300.98K wps
[Epoch 0 Batch 200/392] elapsed 4.81 s,                       avg loss 0.005679, throughput 332.25K wps
[Epoch 0 Batch 300/392] elapsed 5.00 s,                       avg loss 0.005286, throughput 295.92K wps
Begin Testing...
[Batch 100/391] elapsed 6.30 s
[Batch 200/391] elapsed 6.34 s
[Batch 300/391] elapsed 6.49 s
[Epoch 0] train avg loss 0.006042, test acc 0.83,         test avg loss 0.331884, throughput 317.15K wps


```python
#Train
train(net, ctx, epochs)
```
![](./training_result.png)