# Text Sentiment Classification: Using a Bag of Context Free Embeddings

In [1]:
import d2l
import mxnet as mx
from mxnet import gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
import gluonnlp as nlp
import os
import tarfile

## Text Sentiment Classification Data

###  Reading Data

In [2]:
data_dir = './'
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
fname = gutils.download(url, data_dir)
with tarfile.open(fname, 'r') as f:
    f.extractall(data_dir)

Read the training and test data sets.

In [3]:
def read_imdb(folder='train'):
    data, labels = [], []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_dir, 'aclImdb', folder, label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

train_data, test_data = read_imdb('train'), read_imdb('test')
print('# trainings:', len(train_data[0]), '\n# tests:', len(test_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[0:60])

# trainings: 25000 
# tests: 25000
label: 1 review: I couldn't believe the comments made about the movie.<br /><
label: 1 review: Deanna Durbin, then 14 and just under contract to MGM, made 
label: 1 review: ...may seem like an overstatement, but it is not.<br /><br /


### Tokenization and Vocabulary 



In [4]:
def tokenize(sentences):
    return [line.split(' ') for line in sentences]

train_tokens = tokenize(train_data[0])
test_tokens = tokenize(test_data[0])

import itertools
vocab = nlp.Vocab(nlp.data.count_tokens(
    itertools.chain.from_iterable(line for line in train_tokens)),
    min_freq=5)

### Padding to the Same Length


In [5]:
max_len = 500

def pad(x):
    if len(x) > max_len:        
        return x[:max_len]
    else:
        return x + [vocab[vocab.unknown_token]] * (max_len - len(x))
    
train_features = nd.array([pad(vocab[line]) for line in train_tokens])
test_features = nd.array([pad(vocab[line]) for line in test_tokens])

### Create Data Iterator

In [6]:
batch_size = 256
train_set = gdata.ArrayDataset(train_features, train_data[1])
test_set = gdata.ArrayDataset(test_features, test_data[1])
train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size)

Print the shape of the first mini-batch of data and the number of mini-batches in the training set.

In [7]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'# batches:', len(train_iter)

X (256, 500) y (256,)


('# batches:', 98)


## Average Embeddings of a Sentence

In [8]:
class ContinuousBagOfWords(nn.HybridBlock):
    def __init__(self, vocab_size, embed_size, **kwargs):
        super(ContinuousBagOfWords, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.HybridLambda(lambda F, x: F.mean(x, axis=1))
        self.decoder = nn.Dense(2)

    def forward(self, inputs):
        # The shape of inputs is (batch size, number of words).
        embeddings = self.embedding(inputs)
        encoding = self.encoder(embeddings)
        outputs = self.decoder(encoding)
        return outputs

Create a the model.

In [9]:
embed_size, ctx = 100, mx.gpu(0)
net = ContinuousBagOfWords(len(vocab), embed_size)
net.hybridize()
net.initialize(init.Xavier(), ctx=ctx)

### Load Pre-trained Word Vectors


In [10]:
glove_embedding = nlp.embedding.create('glove', source='glove.6B.100d')
idx_to_vec = glove_embedding[vocab.idx_to_token]
idx_to_vec.shape

(49342, 100)

Use these word vectors as feature vectors for each word in the reviews. 

In [11]:
net.embedding.weight.set_data(idx_to_vec)
net.embedding.collect_params().setattr('grad_req', 'null')

### Train and Evaluate the Model



In [12]:
lr, num_epochs = 0.01, 5
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)

training on gpu(0)
epoch 1, loss 0.6678, train acc 0.628, test acc 0.673, time 1.3 sec
epoch 2, loss 0.6292, train acc 0.689, test acc 0.696, time 1.0 sec
epoch 3, loss 0.6079, train acc 0.704, test acc 0.707, time 1.0 sec
epoch 4, loss 0.5911, train acc 0.717, test acc 0.715, time 1.0 sec
epoch 5, loss 0.5783, train acc 0.729, test acc 0.723, time 1.0 sec


Define the prediction function.

In [13]:
def predict_sentiment(net, vocab, sentence):
    sentence = nd.array(vocab[sentence.split()], ctx=d2l.try_gpu())
    label = nd.argmax(net(sentence.reshape((1, -1))), axis=1)
    return 'positive' if label.asscalar() == 1 else 'negative'

Then, use the trained model to classify the sentiments of two simple sentences.

In [14]:
predict_sentiment(net, vocab, 'this movie is so great')

'positive'

In [15]:
predict_sentiment(net, vocab, 'this movie is so bad')

'negative'