In [1]:
import warnings
import gzip
import ujson

import pandas as pd
import numpy as np

from glob import glob
from itertools import islice
from tqdm import tqdm
from boltons.iterutils import windowed
from collections import Counter

from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

from news_vec import logger
from news_vec.title_clf import clean_headline

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# Plot styling. seaborn's set() runs after the matplotlib style sheet, so any
# rcParams it touches override the ones set by 'seaborn-muted'.
mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [3]:
# Silence every warning category for the rest of the session; this also hides
# deprecation warnings raised by the libraries used below.
warnings.simplefilter('ignore')

In [4]:
class Line:
    """A tokenized headline paired with its label.

    Attributes:
        tokens (list<str>): The headline tokens, lower-cased unless the line
            was built with `lower=False`.
        label: The label attached to the headline.
    """

    def __init__(self, tokens, label, lower=True):
        """Store the tokens and label.

        Args:
            tokens (list<str>): Headline tokens.
            label: Target label for this line.
            lower (bool): When true, keep a lower-cased copy of the tokens;
                otherwise the passed object is stored as-is.
        """
        self.tokens = [t.lower() for t in tokens] if lower else tokens
        self.label = label

    def __repr__(self):

        pattern = '{cls_name}<{token_count} tokens -> {label}>'

        return pattern.format(
            cls_name=self.__class__.__name__,
            token_count=len(self.tokens),
            label=self.label,
        )

    def ngrams(self, n):
        """Return the sliding windows of `n` consecutive tokens, as produced
        by boltons' `windowed`.
        """
        return windowed(self.tokens, n)

    def ngram_keys_iter(self, n, vocab=None):
        """Generate a string key for each n-gram of order `n`.

        Args:
            n (int): N-gram order.
            vocab: Container of n-grams to keep. `None` disables filtering;
                an empty container keeps nothing.

        Yields: str, the tokens joined (and prefixed) with '_'.
        """
        for ng in self.ngrams(n):
            # Compare against None explicitly: a bare truthiness check would
            # treat an empty vocabulary as "no filter" and emit every n-gram.
            if vocab is None or ng in vocab:
                yield '_' + '_'.join(ng)

    def ngram_counts_iter(self, n, vocab=None):
        """Generate (key, count) pairs for the n-grams of order `n`."""
        yield from Counter(self.ngram_keys_iter(n, vocab)).items()

    def features_iter(self, vocab):
        """Generate (key, count) pairs for 1-, 2- and 3-grams, in that order."""
        yield from self.ngram_counts_iter(1, vocab)
        yield from self.ngram_counts_iter(2, vocab)
        yield from self.ngram_counts_iter(3, vocab)

    def x(self, vocab):
        """Build the feature dict {key: count} for this line.

        NOTE: keys do not encode the n-gram order, so a token that itself
        contains '_' can produce the same key as a longer n-gram; when that
        happens the count yielded last replaces the earlier one.
        """
        return dict(self.features_iter(vocab))

In [5]:
def read_json_lines(root, lower=True):
    """Generate lines from a directory of gzipped JSON-lines files.

    Every `*.gz` file directly under `root` is read in sorted path order, one
    JSON object per text line. Records with no 'tokens' field, or whose
    tokens are empty after `clean_headline`, are skipped.

    Args:
        root (str): Directory holding the `*.gz` files.
        lower (bool): Forwarded to `Line`.

    Yields: Line
    """
    # glob() returns paths in arbitrary order; sorting keeps the stream (and
    # therefore any islice() prefix taken from it) stable between runs.
    for path in sorted(glob('%s/*.gz' % root)):
        with gzip.open(path) as fh:
            for line in fh:

                data = ujson.loads(line)

                tokens = data.get('tokens')

                # A missing 'tokens' field leaves nothing to clean.
                if tokens is None:
                    continue

                tokens = clean_headline(tokens)

                if not tokens:
                    continue

                yield Line(tokens, data['label'], lower=lower)

In [6]:
class Corpus:
    """An in-memory list of lines read from a JSON corpus."""

    def __init__(self, root, skim=None, lower=True):
        """Read the corpus under `root` into `self.lines`.

        Args:
            root: Corpus location, forwarded to `read_json_lines`.
            skim (int): Stop after this many lines; None reads everything.
            lower (bool): Forwarded to `read_json_lines`.
        """
        logger.info('Parsing line corpus.')

        source = read_json_lines(root, lower)
        self.lines = list(tqdm(islice(source, skim)))

    def __repr__(self):
        return '{cls_name}<{line_count} lines>'.format(
            cls_name=self.__class__.__name__,
            line_count=len(self),
        )

    def __len__(self):
        """Number of lines held."""
        return len(self.lines)

    def __iter__(self):
        """Iterate over the held lines in load order."""
        return iter(self.lines)

    def ngram_counts(self, n):
        """Count every n-gram of order `n` across the corpus.

        Returns: Counter mapping n-gram -> occurrences.
        """
        logger.info('Gathering %d-gram counts.' % n)

        tally = Counter()
        for entry in tqdm(self):
            tally.update(entry.ngrams(n))

        return tally

    def topk_ngrams(self, n, k):
        """Return the `k` most common n-grams of order `n`, most frequent first."""
        ranked = self.ngram_counts(n).most_common(k)
        return [ngram for ngram, _count in ranked]

    def x_iter(self, vocab):
        """Generate the feature dict of each line, in corpus order."""
        yield from (entry.x(vocab) for entry in tqdm(self))

In [7]:
# Load the full corpus into memory (default arguments: no `skim` limit,
# tokens lower-cased).
c = Corpus('../data/titles-50k.json/')

2018-12-02 14:18:35,115 | INFO : Parsing line corpus.
2123078it [00:52, 40332.97it/s]


In [8]:
# Feature vocabulary: the 5000 most frequent n-grams of each order 1, 2 and 3.
vocab = {
    ngram
    for order in (1, 2, 3)
    for ngram in c.topk_ngrams(order, 5000)
}

2018-12-02 14:22:41,082 | INFO : Gathering 1-gram counts.
100%|██████████| 2123078/2123078 [00:20<00:00, 102873.47it/s]
2018-12-02 14:23:01,777 | INFO : Gathering 2-gram counts.
100%|██████████| 2123078/2123078 [00:25<00:00, 84458.04it/s]
2018-12-02 14:23:27,719 | INFO : Gathering 3-gram counts.
100%|██████████| 2123078/2123078 [00:23<00:00, 90478.06it/s]


In [9]:
# One {ngram key: count} dict per line, restricted to n-grams in `vocab`.
xs = list(c.x_iter(vocab))

100%|██████████| 2123078/2123078 [01:06<00:00, 32075.87it/s]


In [10]:
# Vectorize the per-line feature dicts; the fitted `dv` keeps the mapping from
# feature key to matrix column.
dv = DictVectorizer()
X = dv.fit_transform(xs)

In [11]:
# Display the matrix summary (shape, dtype, stored elements).
X

<2123078x15000 sparse matrix of type '<class 'numpy.float64'>'
	with 23141100 stored elements in Compressed Sparse Row format>

In [12]:
# Labels in corpus order, which is also the row order of `xs` / `X`
# (both come from iterating `c`).
y = [line.label for line in c]

In [13]:
# Hold out 20% of the rows for evaluation. The shuffle seed is fixed so the
# split, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

In [14]:
# Multinomial naive Bayes with scikit-learn's default hyperparameters.
model = MultinomialNB()

In [15]:
# scikit-learn's fit() returns the estimator itself, so `fit` and `model`
# refer to the same trained object.
fit = model.fit(X_train, y_train)

In [16]:
# Predicted label for each held-out row.
y_pred = fit.predict(X_test)

In [17]:
# Per-label precision / recall / F1 on the held-out split.
print(metrics.classification_report(y_test, y_pred))

                        precision    recall  f1-score   support

            apnews.com       0.14      0.20      0.16     10032
             bbc.co.uk       0.43      0.34      0.38     10191
         bloomberg.com       0.32      0.20      0.24     10159
         breitbart.com       0.22      0.16      0.18     10055
   businessinsider.com       0.23      0.25      0.24     10207
          buzzfeed.com       0.38      0.58      0.46      9979
                cbc.ca       0.39      0.39      0.39     10046
           cbsnews.com       0.15      0.08      0.10     10206
              cnbc.com       0.21      0.25      0.23     10038
               cnn.com       0.15      0.03      0.05     10099
       dailycaller.com       0.25      0.17      0.20     10309
          dailykos.com       0.26      0.33      0.29     10029
            forbes.com       0.18      0.27      0.22     10011
           foxnews.com       0.13      0.08      0.10     10053
                ft.com       0.25      