In [1]:
import warnings
import gzip
import ujson

import pandas as pd
import numpy as np

from glob import glob
from itertools import islice
from tqdm import tqdm
from boltons.iterutils import windowed
from collections import Counter

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

from news_vec import logger

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: apply matplotlib's 'seaborn-muted' style sheet, then seaborn's
# "whitegrid" theme on top of it.
# NOTE(review): newer matplotlib releases renamed the bundled seaborn style
# sheets (e.g. 'seaborn-v0_8-muted') — confirm against the installed version.
mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [3]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this presumably also hides library warnings raised while
# fitting the model below (the fit output reports "max_iter reached") — confirm
# that silencing them is intended.
warnings.simplefilter('ignore')

In [23]:
class Line:
    """A tokenized line of text paired with its class label.

    Attributes (set in ``__init__``):
        tokens (list): The line's tokens, lowercased unless ``lower=False``.
        label: The label the line was read with.
    """

    def __init__(self, tokens, label, lower=True):
        """Store the tokens and label.

        Args:
            tokens (iterable of str): Tokens of the line.
            label: Class label for the line.
            lower (bool): When true, lowercase every token.
        """
        # Always materialize a fresh list. Without this, `lower=False` kept a
        # reference to the caller's object, so a generator could not be
        # measured with len() or iterated more than once, and later mutation
        # by the caller leaked into this Line.
        self.tokens = [t.lower() for t in tokens] if lower else list(tokens)
        self.label = label

    def __repr__(self):

        pattern = '{cls_name}<{token_count} tokens -> {label}>'

        return pattern.format(
            cls_name=self.__class__.__name__,
            token_count=len(self.tokens),
            label=self.label,
        )

    def ngrams_iter(self, n, vocab=None):
        """Generate the line's n-grams, optionally restricted to a vocabulary.

        Args:
            n (int): Window size passed to ``windowed``.
            vocab (container or None): When given, only n-grams contained in
                it are yielded. ``None`` disables filtering.

        Yields: whatever ``windowed`` produces for each window
            (presumably tuples of tokens — confirm against boltons).
        """
        for ng in windowed(self.tokens, n):
            # Compare against None explicitly: an *empty* vocabulary must
            # filter everything out, not silently disable filtering.
            if vocab is None or ng in vocab:
                yield ng

    def features_iter(self, vocab):
        """Generate (ngram, count) pairs for the 1-, 2- and 3-grams in `vocab`.
        """
        for n in (1, 2, 3):
            yield from Counter(self.ngrams_iter(n, vocab)).items()

    def x(self, vocab):
        """Build the feature dict (ngram -> count) for this line.
        """
        return dict(self.features_iter(vocab))

In [24]:
def read_json_lines(root, lower=True):
    """Generate labeled lines from a directory of gzipped JSON-lines files.

    Every ``*.gz`` file directly under `root` is read line by line; each line
    is parsed as a JSON object. Records without a non-empty ``tokens`` value
    are skipped.

    Args:
        root (str): Directory containing the ``*.gz`` files.
        lower (bool): Forwarded to ``Line``.

    Yields: Line

    Raises:
        KeyError: If a record has tokens but no ``label`` key.
    """
    # glob returns paths in arbitrary order; sort so the stream of lines (and
    # any prefix of it taken via `Corpus(skim=...)`) is reproducible.
    for path in sorted(glob('%s/*.gz' % root)):
        with gzip.open(path) as fh:
            for line in fh:

                # Tolerate blank lines (e.g. a trailing newline at the end of
                # a file) instead of crashing in the JSON parser.
                if not line.strip():
                    continue

                data = ujson.loads(line)

                tokens = data.get('tokens')

                if not tokens:
                    continue

                yield Line(tokens, data['label'], lower=lower)

In [25]:
class Corpus:
    """In-memory collection of the lines produced by `read_json_lines`.
    """

    def __init__(self, root, skim=None, lower=True):
        """Load the corpus under `root`.

        Args:
            root (str): Directory handed to `read_json_lines`.
            skim (int or None): Keep only the first `skim` lines; None keeps all.
            lower (bool): Forwarded to `read_json_lines`.
        """
        logger.info('Parsing line corpus.')

        source = read_json_lines(root, lower)

        self.lines = list(tqdm(islice(source, skim)))

    def __repr__(self):

        return '{cls_name}<{line_count} lines>'.format(
            cls_name=self.__class__.__name__,
            line_count=len(self),
        )

    def __len__(self):
        return len(self.lines)

    def __iter__(self):
        return iter(self.lines)

    def ngram_counts(self, n):
        """Tally how often each n-gram occurs across all lines.

        Returns: Counter mapping ngram -> count
        """
        logger.info('Gathering %d-gram counts.' % n)

        tally = Counter()

        for entry in tqdm(self):
            tally.update(entry.ngrams_iter(n))

        return tally

    def topk_ngrams(self, n, k):
        """List the `k` most frequent n-grams, most frequent first.
        """
        ranked = self.ngram_counts(n).most_common(k)

        return [ngram for ngram, _count in ranked]

    def x_iter(self, vocab):
        """Generate one feature dict per line, restricted to `vocab`.
        """
        for entry in tqdm(self):
            yield entry.x(vocab)

In [26]:
# Load the full corpus into memory (no `skim` limit); the recorded run below
# parsed 2,123,078 lines.
c = Corpus('../data/titles-50k.json/')

2018-11-28 17:56:03,315 | INFO : Parsing line corpus.
2123078it [00:28, 74189.08it/s] 


In [27]:
# Vocabulary: union of the 5000 most frequent 1-grams, 2-grams and 3-grams,
# gathered in that order.
vocab = {
    ngram
    for size in (1, 2, 3)
    for ngram in c.topk_ngrams(size, 5000)
}

2018-11-28 17:57:24,753 | INFO : Gathering 1-gram counts.
100%|██████████| 2123078/2123078 [00:19<00:00, 106323.74it/s]
2018-11-28 17:57:44,809 | INFO : Gathering 2-gram counts.
100%|██████████| 2123078/2123078 [00:31<00:00, 68069.59it/s]
2018-11-28 17:58:17,010 | INFO : Gathering 3-gram counts.
100%|██████████| 2123078/2123078 [00:31<00:00, 67139.21it/s]


In [28]:
# One feature dict (ngram -> count, restricted to `vocab`) per line, in corpus order.
xs = list(c.x_iter(vocab))

100%|██████████| 2123078/2123078 [01:17<00:00, 27250.86it/s]


In [29]:
# Turn the feature dicts into a sparse matrix (see the repr below). `dv` is
# kept because `mdw` later maps column indices back to n-grams through
# `dv.feature_names_`.
dv = DictVectorizer()
X = dv.fit_transform(xs)

In [30]:
# Display the matrix summary (shape, dtype, number of stored elements).
X

<2123078x15000 sparse matrix of type '<class 'numpy.float64'>'
	with 28074592 stored elements in Compressed Sparse Row format>

In [31]:
# Target labels, taken in the same corpus order as the rows of `X`.
y = [line.label for line in c]

In [32]:
# Hold out 20% of the lines for evaluation. The shuffle is seeded so the
# split, and therefore the reported metrics, can be reproduced after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

In [33]:
# Multinomial logistic regression over the n-gram count features. The 'sag'
# solver shuffles the data, so it is seeded to make the fitted coefficients
# reproducible.
# NOTE(review): the recorded fit output says "max_iter reached", i.e. the
# solver stopped at the iteration cap rather than converging — consider
# raising `max_iter` or scaling the features if runtime allows.
model = LogisticRegression(
    verbose=True,
    multi_class='multinomial',
    solver='sag',
    n_jobs=-1,
    random_state=42,
)

In [34]:
# Train on the training split. `fit` is used below as the fitted estimator
# (`predict`, `classes_`, `coef_`). The recorded run took about 11 minutes.
fit = model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


max_iter reached after 651 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 10.9min finished


In [35]:
# Predicted labels for the held-out rows.
y_pred = fit.predict(X_test)

In [36]:
# Per-class precision / recall / f1 on the held-out split. print() is needed
# because the report is a preformatted multi-line string.
print(metrics.classification_report(y_test, y_pred))

                        precision    recall  f1-score   support

            apnews.com       0.24      0.29      0.26     10090
             bbc.co.uk       0.67      0.73      0.70     10230
         bloomberg.com       0.34      0.37      0.36     10218
         breitbart.com       0.72      0.68      0.70     10028
   businessinsider.com       0.44      0.43      0.44     10119
          buzzfeed.com       0.57      0.66      0.61     10045
                cbc.ca       0.59      0.63      0.61      9950
           cbsnews.com       0.29      0.29      0.29     10167
              cnbc.com       0.39      0.33      0.36     10163
               cnn.com       0.50      0.32      0.39     10175
       dailycaller.com       0.95      0.87      0.91     10134
          dailykos.com       0.46      0.53      0.49      9994
            forbes.com       0.30      0.35      0.32     10022
           foxnews.com       0.21      0.17      0.19      9965
                ft.com       0.33      

In [39]:
def mdw(domain, n=50):
    """List the `n` highest-weighted features for one class.

    Reads the notebook-level fitted model `fit` and vectorizer `dv`.

    Args:
        domain: A class label present in `fit.classes_`.
        n (int): Number of features to return.

    Returns: str, the features ordered by descending coefficient; each
        feature's tokens are joined with spaces and features are separated
        by ', '.
    """
    class_idx = list(fit.classes_).index(domain)

    # Ascending argsort reversed -> column indices by descending weight.
    top_columns = np.argsort(fit.coef_[class_idx])[::-1][:n]

    phrases = []
    for col in top_columns:
        phrases.append(' '.join(dv.feature_names_[col]))

    return ', '.join(phrases)

In [40]:
# For every class, print its label followed by its 50 highest-weighted n-grams.
for d in fit.classes_:
    print(d)
    print(mdw(d), '\n')

apnews.com
associated press news, apnewsbreak :, to know today, ap, the latest :, ap news :, column, associated press, mets, 1st, latest :, science says, police :, ap exclusive :, louisiana, ap explains :, maine, analysis :, en, lawyer :, cavs, arkansas, ap news, idaho, iowa, mississippi, gov't, court :, officials :, 2nd, pct, indians, official :, now :, poll :, yankees, 10 things to, nhl, news :, lebron, browns, 3rd, astros, carolina, study :, cardinals, fifa, nebraska, percent in, qb 

bbc.co.uk
- bbc, my report for, , series, bbc -, bbc, / 2017, - bbc radio, v, n korea, £, wales, reality check :, / 2018, nhs, bbc news, mum, boy ,, scottish, cup :, girl ,, scotland, : episode, . the, on this day, radio 1, rangers, bbc music, mp, 2018 :, corbyn, centre, sport, arsene wenger, fa, open :, st, : former, show -, bbc news |, live :, everton, , 2017 :, tottenham, england, boss, ' should, check :, mps, grenfell, guilty of 

bloomberg.com
brexit bulletin :, ( audio ), - bloomberg, bloomberg, 