# Text Sentiment Classification: Using Recurrent Neural Networks with Self-Attention

In [1]:
import d2l
import mxnet as mx
from mxnet import gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
import gluonnlp as nlp
import os
import tarfile

## Text Sentiment Classification Data

###  Reading Data

In [2]:
vocab, train_iter, test_iter = d2l.load_data_imdb(batch_size=64)

Print the shape of the first mini-batch of data and the number of mini-batches in the training set.

In [3]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'# batches:', len(train_iter)

X (64, 500) y (64,)


('# batches:', 391)


## Use a Recurrent Neural Network Model with Self Attention

In [4]:
class SelfAttention(nn.HybridBlock):
    def __init__(self, num_atention_units, num_attention_channels, **kwargs):
        super(SelfAttention, self).__init__(**kwargs)
        with self.name_scope():
            self.proj_query = nn.Dense(num_atention_units, activation='tanh', flatten=False)
            self.parametric_key = nn.Dense(num_attention_channels, activation=None, flatten=False)

    def hybrid_forward(self, F, query):
        # query shape: [batch_size, seq_len, embedding_width]
        # projected query shape: [batch_size, seq_len, num_atention_units]
        query = self.proj_query(query)
        # scores shape: [batch_size, seq_len, attention_channels]
        scores = self.parametric_key(query)

        # attention_weights shape: [batch_size,  att_hops, seq_len]
        attention_weights = F.softmax(F.transpose(scores, axes=(0, 2, 1)), axis=-1)
        # output shape [batch_size, att_hops, embedding_width]
        output = F.batch_dot(attention_weights, query)

        return output, attention_weights

In [5]:
class AttentiveBiLSTM(nn.HybridBlock):
    """Lin et al.: A Structured Self-Attentive Sentence Embedding. ICLR 2017"""
    def __init__(self, vocab_len, embed_size, num_hiddens, num_layers,
                 num_attention_units, num_attention_channels, **kwargs):
        super(AttentiveBiLSTM, self).__init__(**kwargs)
        with self.name_scope():
            self.embedding = nn.Embedding(vocab_len, embed_size)
            self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers, bidirectional=True)
            self.attention = SelfAttention(num_attention_units, num_attention_channels)
            self.decoder = nn.Dense(2)

    def hybrid_forward(self, F, inputs):
        # The shape of inputs is (batch size, number of words). Because LSTM
        # needs to use sequence as the first dimension, the input is
        # transformed and the word feature is then extracted. The output shape
        # is (number of words, batch size, word vector dimension).
        embeddings = self.embedding(F.transpose(inputs))
        # The shape of states is (number of words, batch size, 2 * number of
        # hidden units).
        states = self.encoder(embeddings)
        context_vec, att_weights = self.attention(F.transpose(states, (1, 0, 2)))
        
        outputs = self.decoder(F.flatten(context_vec))
        return outputs

Create the model.

In [6]:
embed_size, num_hiddens, num_layers, ctx = 100, 100, 2, d2l.try_all_gpus()
natt_unit, natt_channel = 500, 2

net = AttentiveBiLSTM(len(vocab), embed_size, num_hiddens, num_layers,
                            natt_unit, natt_channel)
net.initialize(init.Xavier(), ctx=ctx)
net.hybridize()

In [7]:
net

AttentiveBiLSTM(
  (embedding): Embedding(49339 -> 100, float32)
  (encoder): LSTM(None -> 100, TNC, num_layers=2, bidirectional)
  (attention): SelfAttention(
    (proj_query): Dense(None -> 500, Activation(tanh))
    (parametric_key): Dense(None -> 2, linear)
  )
  (decoder): Dense(None -> 2, linear)
)

### Load Pre-trained Word Vectors


In [8]:
glove_embedding = nlp.embedding.create('glove', source='glove.6B.100d')
idx_to_vec = glove_embedding[vocab.idx_to_token]
idx_to_vec.shape

(49339, 100)

Use these word vectors as feature vectors for each word in the reviews. 

In [9]:
net.embedding.weight.set_data(idx_to_vec)
net.embedding.collect_params().setattr('grad_req', 'null')

### Train and Evaluate the Model



In [10]:
lr, num_epochs = 0.01, 5
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)

training on [gpu(0)]
epoch 1, loss 0.4950, train acc 0.751, test acc 0.831, time 42.9 sec
epoch 2, loss 0.3545, train acc 0.846, test acc 0.854, time 42.8 sec
epoch 3, loss 0.3073, train acc 0.870, test acc 0.868, time 42.7 sec
epoch 4, loss 0.2863, train acc 0.880, test acc 0.872, time 42.5 sec
epoch 5, loss 0.2465, train acc 0.899, test acc 0.873, time 42.8 sec


Define the prediction function.

In [11]:
def predict_sentiment(net, vocab, sentence):
    sentence = nd.array(vocab[sentence.split()], ctx=d2l.try_gpu())
    label = nd.argmax(net(sentence.reshape((1, -1))), axis=1)
    return 'positive' if label.asscalar() == 1 else 'negative'

Then, use the trained model to classify the sentiments of two simple sentences.

In [12]:
predict_sentiment(net, vocab, 'this movie is so great')

'positive'

In [13]:
predict_sentiment(net, vocab, 'this movie is so bad')

'negative'