In [36]:
from mxnet import autograd, gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
from tqdm import tqdm_notebook

import mxnet as mx
import collections
import os
import random
import tarfile
import time

In [2]:
import pandas as pd

train_data = pd.read_csv('ratings_train.txt', sep='\t')
train_data = train_data.drop('id', axis=1).values
train_data

array([['아 더빙.. 진짜 짜증나네요 목소리', 0],
       ['흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 1],
       ['너무재밓었다그래서보는것을추천한다', 0],
       ...,
       ['이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?', 0],
       ['청춘 영화의 최고봉.방황과 우울했던 날들의 자화상', 1],
       ['한국 영화 최초로 수간하는 내용이 담긴 영화', 0]], dtype=object)

In [3]:
test_data = pd.read_csv('ratings_test.txt', sep='\t')
test_data = test_data.drop('id', axis=1).values
test_data

array([['굳 ㅋ', 1],
       ['GDNTOPCLASSINTHECLUB', 0],
       ['뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', 0],
       ...,
       ['그림도 좋고 완성도도 높았지만... 보는 내내 불안하게 만든다', 0],
       ['절대 봐서는 안 될 영화.. 재미도 없고 기분만 잡치고.. 한 세트장에서 다 해먹네', 0],
       ['마무리는 또 왜이래', 0]], dtype=object)

In [4]:
def get_tokenized_review(data):
    def tokenizer(text):
        try:
            return text.split(' ')
        except:
            return []
    return [tokenizer(review) for review, _ in data]

In [5]:
def get_vocab_review(data, min_freq=3):
    tokenized_data = get_tokenized_review(data)
    if len(tokenized_data) > 0:
        counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return text.vocab.Vocabulary(counter, min_freq=min_freq)

vocab = get_vocab_review(train_data)
'# Words in vocab:', len(vocab)

('# Words in vocab:', 42636)

In [25]:
vocab.token_to_idx

{'<unk>': 0,
 '영화': 1,
 '너무': 2,
 '정말': 3,
 '진짜': 4,
 '이': 5,
 '영화.': 6,
 '왜': 7,
 '더': 8,
 '이런': 9,
 '그냥': 10,
 '수': 11,
 '영화를': 12,
 '잘': 13,
 '다': 14,
 '보고': 15,
 '좀': 16,
 '영화는': 17,
 '그': 18,
 '영화가': 19,
 '본': 20,
 '최고의': 21,
 'ㅋㅋ': 22,
 '내가': 23,
 '없는': 24,
 '이건': 25,
 '이렇게': 26,
 '완전': 27,
 '평점': 28,
 '봤는데': 29,
 '있는': 30,
 '좋은': 31,
 '이거': 32,
 '이게': 33,
 '보는': 34,
 '내': 35,
 '평점이': 36,
 '다시': 37,
 '그리고': 38,
 '참': 39,
 '많이': 40,
 '역시': 41,
 '난': 42,
 '연기': 43,
 '재밌게': 44,
 '한': 45,
 '쓰레기': 46,
 '것': 47,
 '하는': 48,
 '또': 49,
 '아': 50,
 '드라마': 51,
 '꼭': 52,
 '가장': 53,
 '보면': 54,
 'ㅋㅋㅋ': 55,
 '마지막': 56,
 '스토리': 57,
 '무슨': 58,
 'ㅋ': 59,
 '.': 60,
 '없고': 61,
 '볼': 62,
 'ㅠㅠ': 63,
 '같은': 64,
 '없다.': 65,
 '끝까지': 66,
 '대한': 67,
 '안': 68,
 '만든': 69,
 '솔직히': 70,
 '넘': 71,
 '봐도': 72,
 '말이': 73,
 '하지만': 74,
 '전혀': 75,
 '뭐': 76,
 '10점': 77,
 '영화의': 78,
 '내내': 79,
 '아주': 80,
 '최악의': 81,
 '지금': 82,
 '..': 83,
 'ㅎㅎ': 84,
 '할': 85,
 '다른': 86,
 '뭔가': 87,
 '그래도': 88,
 '재미있게': 89,
 '어떻게': 90,
 '많은

In [7]:
def preprocess_review(data, vocab):
    # Make the length of each comment 500 by truncating or adding 0s
    max_l = 500

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_review(data)
    features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data])
    labels = nd.array([score for _, score in data])
    return features, labels

In [8]:
batch_size = 64
train_set = gdata.ArrayDataset(*preprocess_review(train_data, vocab))
test_set = gdata.ArrayDataset(*preprocess_review(test_data, vocab))
train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size)

In [20]:
for X, y in train_iter:
    print (X)
    print (y)
    break


[[3.9905e+04 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.1271e+04 3.2038e+04 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [5.0000e+00 8.0200e+03 4.9000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [1.6800e+02 2.0400e+03 1.5000e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [2.3598e+04 1.9460e+03 7.9590e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 8.9630e+03 2.4959e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]]
<NDArray 64x500 @cpu(0)>

[1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.
 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0.
 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 64 @cpu(0)>


In [9]:
class BiRNN(nn.Block):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # Set Bidirectional to True to get a bidirectional recurrent neural
        # network
        self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers, bidirectional=True, input_size=embed_size)
        self.decoder = nn.Dense(2)

    def forward(self, inputs):
        # The shape of inputs is (batch size, number of words). Because LSTM
        # needs to use sequence as the first dimension, the input is
        # transformed and the word feature is then extracted. The output shape
        # is (number of words, batch size, word vector dimension).
        embeddings = self.embedding(inputs.T)
        # The shape of states is (number of words, batch size, 2 * number of
        # hidden units).
        states = self.encoder(embeddings)
        # Concatenate the hidden states of the initial time step and final
        # time step to use as the input of the fully connected layer. Its
        # shape is (batch size, 4 * number of hidden units)
        encoding = nd.concat(states[0], states[-1])
        outputs = self.decoder(encoding)
        return outputs

In [12]:
# Use GPU if available
if len(mx.test_utils.list_gpus()) != 0:
    ctx=mx.gpu()
else:
    ctx=mx.cpu()

In [13]:
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
net.initialize(init.Xavier(), ctx=ctx)

In [14]:
#train word2vec using gensim
import gensim

In [15]:
model = gensim.models.Word2Vec(get_tokenized_review(train_data)) #not use special tokenizer

In [26]:
lr, num_epochs = 0.01, 5
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss_fn = gloss.SoftmaxCrossEntropyLoss()
acc_metric = mx.metric.Accuracy()


In [None]:
# train & val process
for epoch in range(num_epochs):
    print ('epoch :{}'.format(epoch))
    
    #train step
    start, l_sum, n = time.time(), 0.0, 0
    for batch in tqdm_notebook(train_iter, desc='training'):
        tokens, label = [data.as_in_context(ctx) for data in batch]
        with autograd.record():
            pred = net(tokens)
            loss = loss_fn(pred, label)
        loss.backward()
        trainer.step(batch_size)
        l_sum += loss.sum().asscalar()
        n += loss.size
    print('epoch %d, loss %.2f, time %.2fs' % (epoch + 1, l_sum / n, time.time() - start))
    
    #valid step
    for batch in tqdm_notebook(test_iter, desc='validating'):
        tokens, label = [data.as_in_context(ctx) for data in batch]
        pred = net(tokens)
        acc_metric.update(preds = pred, labels = label)
    print('epoch %d, acc %.2f, time %.2fs' % (epoch + 1, acc.get()[1]))
    