In [359]:
import sys
import io
import re
import logging
import pickle
import torch
import torchtext
import json
import pandas as pd

from torchtext.utils import unicode_csv_reader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import Vocab
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

from tqdm import tqdm

In [360]:
file_name = '../data/News_Category_Dataset_v2.json'

In [361]:
with open(file_name) as f:
    contents = f.read()
    contents = '['+contents.replace('}','},')[:-2]+']' 
    data = json.loads(contents)

In [362]:
data[0]

{'category': 'CRIME',
 'headline': 'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV',
 'authors': 'Melissa Jeltsen',
 'link': 'https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89',
 'short_description': 'She left her husband. He killed their children. Just another day in America.',
 'date': '2018-05-26'}

In [363]:
politics = [news for news in data if news['category'] == 'POLITICS']
len(politics)

32739

In [364]:
non_politics = [news for news in data if news['category'] != 'POLITICS']
len(non_politics)

168114

In [365]:
import random

In [366]:
random.shuffle(non_politics)

In [367]:
non_politics_balanced = non_politics[0:int(len(politics)/3)]

In [368]:
df = pd.DataFrame(non_politics_balanced)

In [369]:
df.groupby('category').count()['headline']

category
ARTS               105
ARTS & CULTURE      87
BLACK VOICES       312
BUSINESS           382
COLLEGE             73
COMEDY             353
CRIME              204
CULTURE & ARTS      58
DIVORCE            211
EDUCATION           60
ENTERTAINMENT     1019
ENVIRONMENT         87
FIFTY               88
FOOD & DRINK       365
GOOD NEWS           83
GREEN              175
HEALTHY LIVING     458
HOME & LIVING      256
IMPACT             257
LATINO VOICES       53
MEDIA              172
MONEY               99
PARENTING          540
PARENTS            263
QUEER VOICES       394
RELIGION           175
SCIENCE            143
SPORTS             317
STYLE              149
STYLE & BEAUTY     657
TASTE              146
TECH               127
THE WORLDPOST      245
TRAVEL             651
WEDDINGS           251
WEIRD NEWS         163
WELLNESS          1206
WOMEN              212
WORLD NEWS         147
WORLDPOST          170
Name: headline, dtype: int64

In [370]:
dataset_balanced_json = non_politics_balanced + politics
random.shuffle(dataset_balanced_json)

In [371]:
def map_cat(cat):
    return '2' if cat == 'POLITICS' else '1'

dataset_balanced_all = [[map_cat(news['category']),news['headline']] for news in dataset_balanced_json]

In [372]:
LIMIT = 10000

In [375]:
import random
random.shuffle(dataset_balanced)

dataset_balanced = dataset_balanced_all[:LIMIT]
train_size = int(0.8*len(dataset_balanced))
train_data = dataset_balanced[:train_size]
test_data = dataset_balanced[train_size:]

In [376]:
import csv

with open("../data/political-news/train.csv", "w") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerows(train_data)
    
with open("../data/political-news/test.csv", "w") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerows(test_data)

## Initial configuration

In [377]:
DATA_DIR = '../data/political-news/'
NGRAMS = 2
EMBED_DIM = 32
N_EPOCHS = 5
BATCH_SIZE = 16
LABELS = {
     1 : "non-political",
     2 : "political"
}

# Dataset loader

First, let's define a helper class to load our datasets.

In [378]:
class TextClassificationDataset():
    def __init__(self, vocab, data, labels):
        super(TextClassificationDataset, self).__init__()
        self._data = data
        self._labels = labels
        self._vocab = vocab


    def __getitem__(self, i):
        return self._data[i]

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        for x in self._data:
            yield x

    def get_labels(self):
        return self._labels

    def get_vocab(self):
        return self._vocab

In [379]:
def _csv_iterator(data_path, ngrams, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)


def _create_data_from_iterator(vocab, iterator, include_unk):
    data = []
    labels = []
    with tqdm(unit_scale=0, unit='lines') as t:
        for cls, tokens in iterator:
            if include_unk:
                tokens = torch.tensor([vocab[token] for token in tokens])
            else:
                token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token]
                                        for token in tokens]))
                tokens = torch.tensor(token_ids)
            if len(tokens) == 0:
                logging.info('Row contains no tokens.')
            data.append((cls, tokens))
            labels.append(cls)
            t.update(1)
    return data, set(labels)


In [380]:
def setup_datasets(root='./data', ngrams=1, vocab=None, include_unk=False):
    train_csv_path = root + '/train.csv'
    test_csv_path = root + '/test.csv'

    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
            
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk)
    
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))



Text Classification
==================================

A bag of ngrams feature is applied to capture some partial information
about the local word order. 

In [381]:
train_dataset, test_dataset = setup_datasets(root=DATA_DIR, ngrams=NGRAMS)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

8000lines [00:00, 40773.65lines/s]
8000lines [00:00, 14475.60lines/s]
2000lines [00:00, 22693.56lines/s]


Define the model
----------------

The model is composed of the
[EmbeddingBag](<https://pytorch.org/docs/stable/nn.html?highlight=embeddingbag#torch.nn.EmbeddingBag>)
layer and the linear layer. ``nn.EmbeddingBag``
computes the mean value of a “bag” of embeddings. The text entries here
have different lengths. ``nn.EmbeddingBag`` requires no padding here
since the text lengths are saved in offsets.

Additionally, since ``nn.EmbeddingBag`` accumulates the average across
the embeddings on the fly, ``nn.EmbeddingBag`` can enhance the
performance and memory efficiency to process a sequence of tensors.

![](../_static/img/text_sentiment_ngrams_model.png)





In [382]:
import torch.nn as nn
import torch.nn.functional as F

class MultiLabelTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

Initiate an instance of our model
--------------------

Our model has three labels and therefore the number of classes is three.

```
{
     0 : "non-political",
     1 : "political"
}
```
The vocab size is equal to the length of vocab (including single word
and ngrams). 

In [383]:
VOCAB_SIZE = len(train_dataset.get_vocab())
NUN_CLASS = len(train_dataset.get_labels())
model = MultiLabelTextClassifier(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [384]:
config = {
    'vocab_size':VOCAB_SIZE,
    'labels': LABELS,
    'ngrams': NGRAMS,
    'embeddings_dim': EMBED_DIM    
}

In [385]:
print(config)

{'vocab_size': 65124, 'labels': {1: 'non-political', 2: 'political'}, 'ngrams': 2, 'embeddings_dim': 32}


In [386]:
print(model)

MultiLabelTextClassifier(
  (embedding): EmbeddingBag(65124, 32, mode=mean)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)


In [387]:
with open(DATA_DIR + 'political_classifier.cfg','wb') as f:
    pickle.dump(config,f)
    f.close()

In [388]:
with open(DATA_DIR + 'political_classifier.vocab','wb') as f:
    pickle.dump(train_dataset.get_vocab(),f)
    f.close()

Generate batch
--------------------------------




Since the text entries have different lengths, a custom function
generate_batch() is used to generate data batches and offsets. The
function is passed to ``collate_fn`` in ``torch.utils.data.DataLoader``.
The input to ``collate_fn`` is a list of tensors with the size of
batch_size, and the ``collate_fn`` function packs them into a
mini-batch. 

The text entries in the original data batch input are packed into a list
and concatenated as a single tensor as the input of ``nn.EmbeddingBag``.
The offsets is a tensor of delimiters to represent the beginning index
of the individual sequence in the text tensor. Label is a tensor saving
the labels of individual text entries.




In [389]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

Functions to train the model and evaluate results.
---------------------------------------------------------




In [390]:
from torch.utils.data import DataLoader

def train_function(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test_function(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

Split the dataset and run the model
-----------------------------------

We split the training
dataset into train/valid sets with a split ratio of 0.95 (train) and
0.05 (valid). 

[CrossEntropyLoss](https://pytorch.org/docs/stable/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss)
criterion combines nn.LogSoftmax() and nn.NLLLoss() in a single class.

[SGD](https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html)
implements stochastic gradient descent method as optimizer. The initial
learning rate is set to 4.0.

[StepLR](https://pytorch.org/docs/master/_modules/torch/optim/lr_scheduler.html#StepLR)
is used here to adjust the learning rate through epochs.




In [391]:
import time
from torch.utils.data.dataset import random_split

min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_function(sub_train_)
    valid_loss, valid_acc = test_function(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 0 seconds
	Loss: 0.0323(train)	|	Acc: 74.9%(train)
	Loss: 0.0015(valid)	|	Acc: 82.2%(valid)
Epoch: 2  | time in 0 minutes, 0 seconds
	Loss: 0.0157(train)	|	Acc: 89.7%(train)
	Loss: 0.0011(valid)	|	Acc: 83.2%(valid)
Epoch: 3  | time in 0 minutes, 0 seconds
	Loss: 0.0055(train)	|	Acc: 97.6%(train)
	Loss: 0.0013(valid)	|	Acc: 82.2%(valid)
Epoch: 4  | time in 0 minutes, 0 seconds
	Loss: 0.0017(train)	|	Acc: 99.7%(train)
	Loss: 0.0016(valid)	|	Acc: 82.5%(valid)
Epoch: 5  | time in 0 minutes, 0 seconds
	Loss: 0.0007(train)	|	Acc: 100.0%(train)
	Loss: 0.0015(valid)	|	Acc: 83.8%(valid)


Evaluate the model with test dataset
------------------------------------




In [392]:
print('Checking the results of test dataset...')
test_loss, test_acc = test_function(test_dataset)
print(f'Acc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
Acc: 85.9%(test)


# Save model

In [393]:
torch.save(model.state_dict(), DATA_DIR + 'political_classifier.pth')

In [394]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "non-political",
                 2 : "political",
                }

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1
    
vocab = train_dataset.get_vocab()
model = model.to("cpu")

for i in range(10):
    pred_pol = predict(politics[i]['headline'], model, vocab, 2)
    pred_non_pol = predict(non_politics[i]['headline'], model, vocab, 2)
    print(non_politics[i]['headline'], pred_non_pol == 1)
    print(politics[i]['headline'], pred_pol == 2)
    print('---')

Leonardo DiCaprio Brings His Mom Irmelin As Date To The Oscars False
Trump's Crackdown On Immigrant Parents Puts More Kids In An Already Strained System True
---
You Are What You Eat True
'Trump's Son Should Be Concerned': FBI Obtained Wiretaps Of Putin Ally Who Met With Trump Jr. True
---
10 Things A Photographer Should Never Do While Photographing A Wedding True
Edward Snowden: There's No One Trump Loves More Than Vladimir Putin True
---
Elderly Woman Fires At Police Robot In Hours-Long Stand-Off: Cops False
Booyah: Obama Photographer Hilariously Trolls Trump's 'Spy' Claim True
---
The Movie Poster For 'Inauguration Day' Looks Pretty Damn Scary True
Ireland Votes To Repeal Abortion Amendment In Landslide Referendum True
---
The Power Of Video To Create Impact True
Ryan Zinke Looks To Reel Back Some Critics With 'Grand Pivot' To Conservation True
---
Kevin Bacon Footloose: Star Does Not Want To Hear His Famous Song At Weddings (VIDEO) True
Trump's Scottish Golf Resort Pays Women Signi