In [47]:
import sys
import io
import re
import logging
import pickle
import torch
import torchtext

from torchtext.utils import unicode_csv_reader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import Vocab
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

from tqdm import tqdm

## Initial configuration

In [48]:
DATA_DIR = '../data/political-stance-news/'
NGRAMS = 2
EMBED_DIM = 32
N_EPOCHS = 5
BATCH_SIZE = 16
LABELS = {
     1 : "left",
     2 : "center",
     3 : "right"
}

# Dataset loader

First, let's define a helper class to load our datasets.

In [49]:
class TextClassificationDataset():
    def __init__(self, vocab, data, labels):
        super(TextClassificationDataset, self).__init__()
        self._data = data
        self._labels = labels
        self._vocab = vocab


    def __getitem__(self, i):
        return self._data[i]

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        for x in self._data:
            yield x

    def get_labels(self):
        return self._labels

    def get_vocab(self):
        return self._vocab

In [50]:
def _csv_iterator(data_path, ngrams, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)


def _create_data_from_iterator(vocab, iterator, include_unk):
    data = []
    labels = []
    with tqdm(unit_scale=0, unit='lines') as t:
        for cls, tokens in iterator:
            if include_unk:
                tokens = torch.tensor([vocab[token] for token in tokens])
            else:
                token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token]
                                        for token in tokens]))
                tokens = torch.tensor(token_ids)
            if len(tokens) == 0:
                logging.info('Row contains no tokens.')
            data.append((cls, tokens))
            labels.append(cls)
            t.update(1)
    return data, set(labels)


In [51]:
def setup_datasets(root='./data', ngrams=1, vocab=None, include_unk=False):
    train_csv_path = root + '/train.csv'
    test_csv_path = root + '/test.csv'

    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
            
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk)
    
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))



Text Classification
==================================

A bag of ngrams feature is applied to capture some partial information
about the local word order. 

In [52]:
train_dataset, test_dataset = setup_datasets(root=DATA_DIR, ngrams=NGRAMS)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

640lines [00:00, 25711.22lines/s]
640lines [00:00, 12583.34lines/s]
161lines [00:00, 14292.00lines/s]


Define the model
----------------

The model is composed of the
[EmbeddingBag](<https://pytorch.org/docs/stable/nn.html?highlight=embeddingbag#torch.nn.EmbeddingBag>)
layer and the linear layer. ``nn.EmbeddingBag``
computes the mean value of a “bag” of embeddings. The text entries here
have different lengths. ``nn.EmbeddingBag`` requires no padding here
since the text lengths are saved in offsets.

Additionally, since ``nn.EmbeddingBag`` accumulates the average across
the embeddings on the fly, ``nn.EmbeddingBag`` can enhance the
performance and memory efficiency to process a sequence of tensors.

![](../_static/img/text_sentiment_ngrams_model.png)





In [53]:
import torch.nn as nn
import torch.nn.functional as F

class MultiLabelTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

Initiate an instance of our model
--------------------

Our model has three labels and therefore the number of classes is three.

```
{
     1 : "left",
     2 : "center",
     3 : "right"
}
```
The vocab size is equal to the length of vocab (including single word
and ngrams). 

In [58]:
VOCAB_SIZE = len(train_dataset.get_vocab())
NUN_CLASS = len(train_dataset.get_labels())
model = MultiLabelTextClassifier(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [59]:
config = {
    'vocab_size':VOCAB_SIZE,
    'labels': LABELS,
    'ngrams': NGRAMS,
    'embeddings_dim': EMBED_DIM    
}

In [60]:
with open(DATA_DIR + 'classifier.cfg','wb') as f:
    pickle.dump(config,f)
    f.close()

In [61]:
with open(DATA_DIR + 'classifier.vocab','wb') as f:
    pickle.dump(train_dataset.get_vocab(),f)
    f.close()

Generate batch
--------------------------------




Since the text entries have different lengths, a custom function
generate_batch() is used to generate data batches and offsets. The
function is passed to ``collate_fn`` in ``torch.utils.data.DataLoader``.
The input to ``collate_fn`` is a list of tensors with the size of
batch_size, and the ``collate_fn`` function packs them into a
mini-batch. 

The text entries in the original data batch input are packed into a list
and concatenated as a single tensor as the input of ``nn.EmbeddingBag``.
The offsets is a tensor of delimiters to represent the beginning index
of the individual sequence in the text tensor. Label is a tensor saving
the labels of individual text entries.




In [62]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

Functions to train the model and evaluate results.
---------------------------------------------------------




In [63]:
from torch.utils.data import DataLoader

def train_function(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test_function(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

Split the dataset and run the model
-----------------------------------

We split the training
dataset into train/valid sets with a split ratio of 0.95 (train) and
0.05 (valid). 

[CrossEntropyLoss](https://pytorch.org/docs/stable/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss)
criterion combines nn.LogSoftmax() and nn.NLLLoss() in a single class.

[SGD](https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html)
implements stochastic gradient descent method as optimizer. The initial
learning rate is set to 4.0.

[StepLR](https://pytorch.org/docs/master/_modules/torch/optim/lr_scheduler.html#StepLR)
is used here to adjust the learning rate through epochs.




In [64]:
import time
from torch.utils.data.dataset import random_split

min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_function(sub_train_)
    valid_loss, valid_acc = test_function(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 0 seconds
	Loss: 0.0709(train)	|	Acc: 38.2%(train)
	Loss: 0.0642(valid)	|	Acc: 37.5%(valid)
Epoch: 2  | time in 0 minutes, 0 seconds
	Loss: 0.0543(train)	|	Acc: 59.0%(train)
	Loss: 0.0503(valid)	|	Acc: 59.4%(valid)
Epoch: 3  | time in 0 minutes, 0 seconds
	Loss: 0.0307(train)	|	Acc: 84.0%(train)
	Loss: 0.0690(valid)	|	Acc: 53.1%(valid)
Epoch: 4  | time in 0 minutes, 0 seconds
	Loss: 0.0183(train)	|	Acc: 93.1%(train)
	Loss: 0.0642(valid)	|	Acc: 56.2%(valid)
Epoch: 5  | time in 0 minutes, 0 seconds
	Loss: 0.0106(train)	|	Acc: 98.4%(train)
	Loss: 0.0719(valid)	|	Acc: 59.4%(valid)


Evaluate the model with test dataset
------------------------------------




In [65]:
print('Checking the results of test dataset...')
test_loss, test_acc = test_function(test_dataset)
print(f'Acc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
Acc: 71.4%(test)


# Save model

In [67]:
torch.save(model.state_dict(), DATA_DIR + 'classifier.pth')