In [1]:
import pickle
from pprint import pprint
train_docs = pickle.load(open('senti.traindocs.bin','rb'))
test_docs = pickle.load(open('senti.testdocs.bin','rb'))
pprint(train_docs[0])

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증나다/Adjective',
  '목소리/Noun'],
 '0')


In [2]:
import logging
import torch
import io
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import Vocab
from tqdm import tqdm

In [3]:
class TextClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, vocab, data, labels):
        super(TextClassificationDataset, self).__init__()
        self._data = data
        self._labels = labels
        self._vocab = vocab
        
    def __getitem__(self, i):
        return self._data[i]

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        for x in self._data:
            yield x

    def get_labels(self):
        return self._labels

    def get_vocab(self):
        return self._vocab

In [4]:
def my_iterator(data_tuples, ngrams, yield_cls=False):
    for tokens, cls in data_tuples:
        if len(tokens)==0:
            continue
        if yield_cls:
            yield int(cls), ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)

In [5]:
def _create_data_from_iterator(vocab, iterator, include_unk):
    data = []
    labels = []
    with tqdm(unit_scale=0, unit='lines') as t:
        for cls, tokens in iterator:
            if include_unk:
                tokens = torch.tensor([vocab[token] for token in tokens])
            else:
                token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token]
                                        for token in tokens]))
                tokens = torch.tensor(token_ids)
            if len(tokens) == 0:
                logging.info('Row contains no tokens.')
            data.append((cls, tokens))
            labels.append(cls)
            t.update(1)
    return data, set(labels)

In [6]:
def _setup_datasets(train_data_tuples, test_data_tuples, ngrams=1, vocab=None, include_unk=False):
    if vocab is None:
        logging.info('Building Vocab based on train_data_tuples')
        vocab = build_vocab_from_iterator(my_iterator(train_data_tuples, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, my_iterator(train_data_tuples, ngrams, yield_cls=True), include_unk)
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, my_iterator(test_data_tuples, ngrams, yield_cls=True), include_unk)
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))

In [7]:
import torch
import torchtext
from torchtext.datasets import text_classification
NGRAMS = 2
import os
train_dataset, test_dataset = _setup_datasets(train_docs, test_docs, ngrams=NGRAMS, vocab=None)
#if not os.path.isdir('./.data'):
#	os.mkdir('./.data')
#train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
#    root='./.data', ngrams=NGRAMS, vocab=None)
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

149995lines [00:02, 67795.15lines/s]
149995lines [00:05, 25471.96lines/s]
49997lines [00:01, 25119.82lines/s]


device(type='cuda')

In [8]:
######################################################################
# Define the model
# ----------------
#
# The model is composed of the
# `EmbeddingBag <https://pytorch.org/docs/stable/nn.html?highlight=embeddingbag#torch.nn.EmbeddingBag>`__
# layer and the linear layer (see the figure below). ``nn.EmbeddingBag``
# computes the mean value of a “bag” of embeddings. The text entries here
# have different lengths. ``nn.EmbeddingBag`` requires no padding here
# since the text lengths are saved in offsets.
#
# Additionally, since ``nn.EmbeddingBag`` accumulates the average across
# the embeddings on the fly, ``nn.EmbeddingBag`` can enhance the
# performance and memory efficiency to process a sequence of tensors.
#
# .. image:: ../_static/img/text_sentiment_ngrams_model.png
#

import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)


In [9]:
#

VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)


######################################################################
# Functions used to generate batch
# --------------------------------
#

In [10]:
######################################################################
# Since the text entries have different lengths, a custom function
# generate_batch() is used to generate data batches and offsets. The
# function is passed to ``collate_fn`` in ``torch.utils.data.DataLoader``.
# The input to ``collate_fn`` is a list of tensors with the size of
# batch_size, and the ``collate_fn`` function packs them into a
# mini-batch. Pay attention here and make sure that ``collate_fn`` is
# declared as a top level def. This ensures that the function is available
# in each worker.
#
# The text entries in the original data batch input are packed into a list
# and concatenated as a single tensor as the input of ``nn.EmbeddingBag``.
# The offsets is a tensor of delimiters to represent the beginning index
# of the individual sequence in the text tensor. Label is a tensor saving
# the labels of individual text entries.
#

def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    try:
        text = torch.cat(text)
    except Exception as e:
        print(text)
        raise e
    return text, offsets, label

In [11]:
######################################################################
# Define functions to train the model and evaluate results.
# ---------------------------------------------------------
#


######################################################################
# `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader>`__
# is recommended for PyTorch users, and it makes data loading in parallel
# easily (a tutorial is
# `here <https://pytorch.org/tutorials/beginner/data_loading_tutorial.html>`__).
# We use ``DataLoader`` here to load AG_NEWS datasets and send it to the
# model for training/validation.
#

from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)


In [12]:
######################################################################
# Split the dataset and run the model
# -----------------------------------
#
# Since the original AG_NEWS has no valid dataset, we split the training
# dataset into train/valid sets with a split ratio of 0.95 (train) and
# 0.05 (valid). Here we use
# `torch.utils.data.dataset.random_split <https://pytorch.org/docs/stable/data.html?highlight=random_split#torch.utils.data.random_split>`__
# function in PyTorch core library.
#
# `CrossEntropyLoss <https://pytorch.org/docs/stable/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
# criterion combines nn.LogSoftmax() and nn.NLLLoss() in a single class.
# It is useful when training a classification problem with C classes.
# `SGD <https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html>`__
# implements stochastic gradient descent method as optimizer. The initial
# learning rate is set to 4.0.
# `StepLR <https://pytorch.org/docs/master/_modules/torch/optim/lr_scheduler.html#StepLR>`__
# is used here to adjust the learning rate through epochs.
#

import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')


Epoch: 1  | time in 0 minutes, 28 seconds
	Loss: 0.0294(train)	|	Acc: 79.3%(train)
	Loss: 0.0001(valid)	|	Acc: 78.8%(valid)
Epoch: 2  | time in 0 minutes, 31 seconds
	Loss: 0.0218(train)	|	Acc: 86.4%(train)
	Loss: 0.0000(valid)	|	Acc: 83.6%(valid)
Epoch: 3  | time in 0 minutes, 30 seconds
	Loss: 0.0182(train)	|	Acc: 89.5%(train)
	Loss: 0.0000(valid)	|	Acc: 85.2%(valid)
Epoch: 4  | time in 0 minutes, 30 seconds
	Loss: 0.0157(train)	|	Acc: 91.6%(train)
	Loss: 0.0000(valid)	|	Acc: 85.7%(valid)
Epoch: 5  | time in 0 minutes, 29 seconds
	Loss: 0.0137(train)	|	Acc: 93.2%(train)
	Loss: 0.0000(valid)	|	Acc: 85.6%(valid)


In [13]:
######################################################################
# Evaluate the model with test dataset
# ------------------------------------
#

print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')



Checking the results of test dataset...
	Loss: 0.0000(test)	|	Acc: 85.1%(test)


In [14]:
from konlpy.tag import Okt
pos_tagger = Okt()
def ko_tokenizer(doc):
    # norm, stem은 optional
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

In [15]:
movie_comment = "영화 더럽게 재미없다. 돈 아까움.."
pprint(ko_tokenizer(movie_comment))

['영화/Noun',
 '더럽다/Adjective',
 '재미없다/Adjective',
 './Punctuation',
 '돈/Noun',
 '아깝다/Adjective',
 '움/Noun',
 '../Punctuation']


In [16]:
######################################################################
# Test
model = model.to("cpu")
def predict(text):
    ngrams = NGRAMS
    senti_label = ["bad", "good"]
    vocab = train_dataset.get_vocab()
    with torch.no_grad():
        text_tensor = torch.tensor([vocab[token]
                            for token in ngrams_iterator(ko_tokenizer(text), ngrams)])
        output = model(text_tensor, torch.tensor([0]))
        #print(output)
        pred_cls = output.argmax(1).item()
        print("sentiment: {} (comment={})".format(senti_label[pred_cls], text))

In [17]:
predict("영화 더럽게 재미없다. 돈 아까움..")

sentiment: bad (comment=영화 더럽게 재미없다. 돈 아까움..)


In [18]:
predict("영화 대박 재밌지")

sentiment: good (comment=영화 대박 재밌지)


In [19]:
predict("그럭저럭 볼만..")

sentiment: bad (comment=그럭저럭 볼만..)
