### 1. Load data

In [1]:
from mxnet import autograd, gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
from tqdm import tqdm_notebook

import mxnet as mx
import pandas as pd
import collections
import random
import tarfile
import d2l

In [2]:
train_data = []
with open("/home/mj/naver_movie_sentiment/ratings_train.txt", 'r') as fp:
    for line in fp:
        ids, doc, label = line.split("\t")
        train_data.append([doc, label.strip()])

train_data = pd.DataFrame(train_data[1:], columns=['Doc', 'label'])

In [3]:
test_data = []
with open("/home/mj/naver_movie_sentiment/ratings_test.txt", 'r') as fp:
    for line in fp:
        ids, doc, label = line.split("\t")
        test_data.append([doc, label.strip()])

test_data = pd.DataFrame(test_data[1:], columns=['Doc', 'label'])

### 2. Preprocess

In [4]:
def get_tokenized_corpus(corpus):
    def tokenize(sent):
        return sent.split(" ")
    return [tokenize(review) for review in corpus]

In [5]:
def get_vocab(corpus):
    tokenized_data = get_tokenized_corpus(corpus)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return text.vocab.Vocabulary(counter, min_freq=5)

In [6]:
vocab = get_vocab(train_data['Doc'])
'# Words in vocab:', len(vocab)

('# Words in vocab:', 24005)

In [7]:
def preprocess_corpus(data, vocab):
    max_l = 200
    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))
    
    tokenized_data = get_tokenized_corpus(data['Doc'])
    features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data])
    labels = [int(s) for s in data['label']]
    return features, labels

### 3. Create Data Iterator

In [8]:
batch_size = 128
train_set = gdata.ArrayDataset(*preprocess_corpus(train_data, vocab))
test_set = gdata.ArrayDataset(*preprocess_corpus(test_data, vocab))
train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size)

In [9]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

X (128, 200) y (128,)


('#batches:', 1172)

### 4. Construct RNN model

In [10]:
class BiRNN(nn.Block):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        # embedding matrix
        self.embedding = nn.Embedding(len(vocab), embed_size)
        
        self.Encoder = rnn.LSTM(num_hiddens, num_layers=num_layers, bidirectional=True, input_size=embed_size)
        self.Decoder = nn.Dense(2)
        
    def forward(self, inputs):
        embeddings = self.embedding(inputs.T)
        
        states = self.Encoder(embeddings)
        
        encoding = nd.concat(states[0], states[-1])
        outputs = self.Decoder(encoding)
        return outputs

In [11]:
# Use GPU if available
if len(mx.test_utils.list_gpus()) != 0:
    ctx=mx.gpu()
else:
    ctx=mx.cpu()

In [12]:
embed_size, num_hiddens, num_layers = 100, 300, 1
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
net.initialize(init.Xavier(), ctx=ctx)

### Training

In [13]:
lr, num_epochs = 0.001, 5
trainer = gluon.Trainer(net.collect_params(),'adam', {'learning_rate' : lr})
loss_fn = gloss.SoftmaxCrossEntropyLoss()
eval_metric = mx.metric.Accuracy()

In [14]:
for epoch in range(num_epochs):
    
    # training
    loss_sum, n = 0.0, 0
    for batch in tqdm_notebook(train_iter, desc='training'):
        tokens, label = [data.as_in_context(ctx) for data in batch]
        
        with autograd.record():
            pred = net(tokens)
            loss = loss_fn(pred, label)
            loss.backward()
        
        trainer.step(batch_size=batch_size)
        loss_sum += loss.sum().asscalar()
        n += loss.size
    print('epoch : %d, loss : %.3f' %(epoch+1, loss_sum / n))
    
    # validation
    for batch in tqdm_notebook(test_iter, desc='validation'):
        tokens, label = [data.as_in_context(ctx) for data in batch]
        pred = net(tokens)
        eval_metric.update(preds=pred, labels=label)
    print('epoch : %d, acc : %.3f' %(epoch+1, eval_metric.get()[1]))

HBox(children=(IntProgress(value=0, description='training', max=1172), HTML(value='')))


epoch : 1, loss : 0.481


HBox(children=(IntProgress(value=0, description='validation', max=391), HTML(value='')))


epoch : 1, acc : 0.776


HBox(children=(IntProgress(value=0, description='training', max=1172), HTML(value='')))


epoch : 2, loss : 0.371


HBox(children=(IntProgress(value=0, description='validation', max=391), HTML(value='')))


epoch : 2, acc : 0.776


HBox(children=(IntProgress(value=0, description='training', max=1172), HTML(value='')))


epoch : 3, loss : 0.320


HBox(children=(IntProgress(value=0, description='validation', max=391), HTML(value='')))


epoch : 3, acc : 0.775


HBox(children=(IntProgress(value=0, description='training', max=1172), HTML(value='')))


epoch : 4, loss : 0.276


HBox(children=(IntProgress(value=0, description='validation', max=391), HTML(value='')))


epoch : 4, acc : 0.773


HBox(children=(IntProgress(value=0, description='training', max=1172), HTML(value='')))


epoch : 5, loss : 0.242


HBox(children=(IntProgress(value=0, description='validation', max=391), HTML(value='')))


epoch : 5, acc : 0.770


### Test

In [15]:
def predict_sentiment(net, vocab, sentence):
    sentence = nd.array(vocab.to_indices(sentence), ctx=d2l.try_gpu())
    label = nd.argmax(net(sentence.reshape((1, -1))), axis=1)
    return 'positive' if label.asscalar() == 1 else 'negative'

In [16]:
review = '이 영화는 아벤져수인지 어벤져스인지 하는 미제앞잡이 벌레 무리들에 대한 영화가 아닌 \
우주를 진정으로 사랑하는 마음씨와 노블리스 오블리쥬의 정신을 가진 퓨어블러드 타노스가\
선지자의 깨우침을 알지 못 한 우매하고 멍청한 어벤져스의 방해를 물리치고\
자신의 피와 땀을 바쳐 우주의 평화를 이룩하려 했던 타노스의 두번째 단독 영화이다'
review = review.split(' ')

In [17]:
predict_sentiment(net, vocab, review)

'positive'