In [107]:
import warnings
import gzip
import ujson

import pandas as pd
import numpy as np

from glob import glob
from itertools import islice
from tqdm import tqdm
from boltons.iterutils import windowed
from collections import Counter

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

from news_vec import logger

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: apply matplotlib's 'seaborn-muted' style sheet, then
# seaborn's "whitegrid" theme.
# NOTE(review): recent matplotlib releases renamed the bundled seaborn style
# sheets (prefix 'seaborn-v0_8-') — confirm against the installed version.
mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [3]:
# Silence every warning for the rest of the kernel session.
# NOTE(review): this also hides library deprecation / solver warnings that
# may matter when interpreting the model below.
warnings.simplefilter('ignore')

In [57]:
class Line:
    """A sequence of tokens paired with a label."""

    def __init__(self, tokens, label, lower=True):
        """Store the tokens and the label.

        Args:
            tokens: Iterable of token strings.
            label: Label for the line, stored unchanged.
            lower: When true (the default), every token is lower-cased;
                otherwise `tokens` is stored as given.
        """
        self.tokens = [t.lower() for t in tokens] if lower else tokens
        self.label = label

    def __repr__(self):

        pattern = '{cls_name}<{token_count} tokens -> {label}>'

        return pattern.format(
            cls_name=self.__class__.__name__,
            token_count=len(self.tokens),
            label=self.label,
        )

    def ngrams_iter(self, n, vocab=None):
        """Generate the n-token windows of the line.

        Args:
            n: Window size passed to `windowed`.
            vocab: Optional container of allowed n-grams. `None` disables
                filtering; an empty container filters out every n-gram.

        Yields: one window per position, as produced by `windowed`.
        """
        for ng in windowed(self.tokens, n):
            # Compare against None instead of testing truthiness: an empty
            # vocab means "nothing is allowed", not "no filter requested".
            if vocab is None or ng in vocab:
                yield ng

    def features_iter(self, vocab):
        """Generate (unigram, count) pairs for the unigrams allowed by `vocab`."""
        yield from Counter(self.ngrams_iter(1, vocab)).items()

    def x(self, vocab):
        """Return the line's features as a {unigram: count} dict."""
        return dict(self.features_iter(vocab))

In [58]:
def read_json_lines(root, lower=True):
    """Generate lines from a directory of gzipped JSON-lines files.

    Every `*.gz` file directly under `root` is read one line at a time and
    each line is parsed as JSON. Records whose `tokens` value is missing or
    empty are skipped. A record that has tokens but no `label` key raises
    KeyError.

    Args:
        root: Directory holding the `*.gz` corpus files.
        lower: Forwarded to `Line`.

    Yields: Line
    """
    # glob() returns paths in arbitrary order; sort them so the order of the
    # yielded lines (and any prefix a caller takes) is the same on every run.
    for path in sorted(glob('%s/*.gz' % root)):
        with gzip.open(path) as fh:
            for line in fh:

                data = ujson.loads(line)

                tokens = data.get('tokens')

                if not tokens:
                    continue

                yield Line(tokens, data['label'], lower=lower)

In [90]:
class Corpus:
    """An in-memory list of lines with n-gram counting and feature helpers."""

    def __init__(self, root, skim=None, lower=True):
        """Load lines from `root`.

        `skim` caps how many lines are read (None reads everything) and
        `lower` is forwarded to the reader.
        """
        logger.info('Parsing line corpus.')

        reader = read_json_lines(root, lower)
        self.lines = list(tqdm(islice(reader, skim)))

    def __len__(self):
        return len(self.lines)

    def __iter__(self):
        return iter(self.lines)

    def __repr__(self):
        return '{cls_name}<{line_count} lines>'.format(
            cls_name=type(self).__name__,
            line_count=len(self),
        )

    def ngram_counts(self, n):
        """Count every n-gram across all lines and return the Counter.
        """
        logger.info('Gathering %d-gram counts.' % n)

        return Counter(
            ngram
            for entry in tqdm(self)
            for ngram in entry.ngrams_iter(n)
        )

    def topk_ngrams(self, n, k):
        """Return the `k` most frequent n-grams, most frequent first.
        """
        ranked = self.ngram_counts(n).most_common(k)
        return [ngram for ngram, _count in ranked]

    def x_iter(self, vocab):
        """Generate one feature mapping per line, restricted to `vocab`.
        """
        yield from (entry.x(vocab) for entry in tqdm(self))

In [91]:
# Load the whole corpus into memory (no `skim` cap, tokens lower-cased by default).
c = Corpus('../data/titles-50k.json/')

2018-11-28 13:38:31,839 | INFO : Parsing line corpus.
2123078it [00:26, 80958.91it/s] 


In [92]:
# Vocabulary = the 1000 most frequent unigrams, held in a set for membership tests.
vocab = set(c.topk_ngrams(1, 1000))

2018-11-28 13:40:08,235 | INFO : Gathering 1-gram counts.
100%|██████████| 2123078/2123078 [00:19<00:00, 109424.38it/s]


In [93]:
# One {unigram: count} dict per line, keeping only unigrams found in `vocab`.
xs = list(c.x_iter(vocab))

100%|██████████| 2123078/2123078 [00:29<00:00, 71026.99it/s]


In [94]:
# Turn the per-line feature dicts into a sparse matrix. `dv` is kept because
# its `feature_names_` is used later to map columns back to unigrams.
dv = DictVectorizer()
X = dv.fit_transform(xs)

In [95]:
# Display the matrix summary (shape, dtype, stored elements).
X

<2123078x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 15162987 stored elements in Compressed Sparse Row format>

In [96]:
# Labels in corpus order; `xs` was built by iterating `c` the same way, so
# row i of X corresponds to y[i].
y = [line.label for line in c]

In [97]:
# Hold out 20% of the lines for evaluation. The split is seeded so that
# re-running the notebook reproduces the same partition and the same metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0,
)

In [98]:
# Fit a logistic regression on the training split; verbose=True lets the
# solver write its progress marker to the cell output.
# NOTE(review): sklearn's fit() returns the estimator, so `fit` should be the
# same object as `model` — confirm. Later cells use the name `fit`.
model = LogisticRegression(verbose=True)
fit = model.fit(X_train, y_train)

[LibLinear]

In [99]:
# Predicted labels for the held-out split.
y_pred = fit.predict(X_test)

In [100]:
# Per-class precision / recall / F1 on the held-out split.
print(metrics.classification_report(y_test, y_pred))

                        precision    recall  f1-score   support

            apnews.com       0.19      0.23      0.21     10158
             bbc.co.uk       0.60      0.65      0.62     10034
         bloomberg.com       0.24      0.26      0.25     10176
         breitbart.com       0.75      0.59      0.66     10087
   businessinsider.com       0.35      0.35      0.35     10151
          buzzfeed.com       0.38      0.58      0.46     10015
                cbc.ca       0.41      0.44      0.43     10098
           cbsnews.com       0.25      0.23      0.24     10194
              cnbc.com       0.29      0.24      0.26     10135
               cnn.com       0.57      0.26      0.36     10118
       dailycaller.com       0.99      0.86      0.92      9995
          dailykos.com       0.30      0.36      0.33     10221
            forbes.com       0.18      0.23      0.20     10133
           foxnews.com       0.14      0.07      0.10      9906
                ft.com       0.18      

In [118]:
def mdw(domain, n=50):
    """Return the `n` highest-weighted features for `domain` as one string.

    Finds the row of `fit.coef_` that belongs to `domain`, ranks the feature
    indices by descending coefficient and maps them to names through
    `dv.feature_names_`. The parts of each name are joined with '_' and the
    names with ', '. Reads the notebook-level `fit` and `dv`; raises
    ValueError when `domain` is not in `fit.classes_`.
    """
    row = list(fit.classes_).index(domain)
    # argsort() is ascending, so reverse it to put the largest weights first.
    top = fit.coef_[row].argsort()[::-1][:n]
    return ', '.join('_'.join(dv.feature_names_[j]) for j in top)

In [127]:
# For every class: print its label, then its top-weighted features
# (mdw's default n=50) followed by an extra blank line.
for d in fit.classes_:
    print(d)
    print(mdw(d), '\n')

apnews.com
ap, latest, know, press, vegas, beat, us, cup, caller, analysis, governor, news, many, n, look, coach, year-old, lawmakers, california, ;, 2, leader, seeks, year, died, mexico, un, past, judge, players, town, games, 3, sen, tour, wins, abuse, win, game, lawsuit, lead, trip, summit, hope, official, debate, 4, epa, aid, try 

bbc.co.uk

bloomberg.com
said, k, sees, china's, billion, markets, banks, ), fund, bank, says, africa, re, fed, brexit, ceo, bitcoin, india, stocks, debt, oil, five, investors, economy, sale, seeks, start, isn't, push, may, macron, risk, biggest, bid, cars, u, market, since, cash, stock, growth, saudi, can't, japan, less, million, probe, know, sell, china 

breitbart.com
breitbart, illegal, israel, texas, border, hollywood, muslim, u, fake, terror, exclusive, state, fashion, percent, germany, donald, mass, :, iran, german, european, eu, hillary, sen, europe, climate, weinstein, sex, 13, rep, *, human, chicago, rape, nfl, gun, peace, christmas, french, six