In [26]:
from news_vec.corpus import Corpus

from tqdm import tqdm
from boltons.iterutils import windowed
from collections import Counter

In [2]:
# Build the corpus from the links and headlines JSON directories. Both paths are
# relative ('../data/...'), so they resolve against the kernel's working directory.
corpus = Corpus('../data/clf-links.json/', '../data/clf-headlines.json/')

2018-12-24 21:57:43,795 | INFO : Reading links.
1225511it [00:03, 385465.78it/s]
2018-12-24 21:57:48,810 | INFO : Reading headlines.
1127502it [00:21, 53021.33it/s]


In [3]:
# Draw the working dataset from the corpus.
# NOTE(review): no random seed is set anywhere in the visible notebook; if
# sample_all_vs_all() samples randomly, downstream outputs will differ between
# runs -- confirm against news_vec.corpus.
ds = corpus.sample_all_vs_all()

In [5]:
# Shrink the dataset for fast iteration (the repr shown afterwards reports 8000/1000/1000).
# This rebinds `ds` to the result of calling skim() on itself, so re-running the
# cell operates on the already-skimmed dataset rather than the original sample.
ds = ds.skim(10000)

In [6]:
# Display the dataset repr to check its sizes after skimming
# (presumably train/val/test -- confirm against HeadlineDataset).
ds

HeadlineDataset<8000/1000/1000>

In [22]:
def ngrams_iter(hl, n, vocab=None):
    """Yield the n-grams of a headline, optionally restricted to a vocabulary.

    Args:
        hl: Object indexable by the key 'clf_tokens', whose value is the
            headline's token sequence.
        n: Window size handed to `windowed`.
        vocab: Optional container of allowed n-grams. `None` (the default)
            disables filtering. Any other value -- including an empty
            container -- keeps only the n-grams that are members of it.

    Yields:
        Each window produced by `windowed(hl['clf_tokens'], n)` that passes
        the vocabulary filter, in order of appearance.
    """
    for ng in windowed(hl['clf_tokens'], n):
        # Test against None, not truthiness: an empty vocab must filter
        # everything out instead of silently disabling the filter.
        if vocab is None or ng in vocab:
            yield ng

In [32]:
def ngram_counts(ds, n):
    """Tally every n-gram of size `n` across the items of `ds`.

    `ds` is iterated as pairs; the first element of each pair is passed to
    `ngrams_iter` and the second is ignored. Iteration is wrapped in `tqdm`
    so a progress bar is shown.

    Returns:
        A `Counter` mapping each n-gram to its number of occurrences.
    """
    return Counter(
        gram
        for headline, _unused in tqdm(ds)
        for gram in ngrams_iter(headline, n)
    )

In [35]:
def topk_ngrams(ds, n, k):
    """Return the `k` most frequent n-grams of size `n` in `ds`.

    The n-grams come back as a list ordered by `Counter.most_common`,
    with the counts themselves dropped.
    """
    ranked = ngram_counts(ds, n).most_common(k)
    return [pair[0] for pair in ranked]

In [40]:
def features_iter(hl, vocab, orders=(1, 2, 3)):
    """Yield (n-gram, count) feature pairs for one headline.

    For each n-gram order in `orders`, the headline's n-grams are filtered
    through `vocab` by `ngrams_iter`, counted, and emitted as
    `(ngram, count)` pairs. Orders are processed in the sequence given.

    Args:
        hl: Headline object accepted by `ngrams_iter`.
        vocab: Vocabulary forwarded to `ngrams_iter` as its filter.
        orders: Iterable of n-gram sizes to extract. Defaults to
            `(1, 2, 3)`, matching the previously hard-coded
            unigram/bigram/trigram behaviour.

    Yields:
        `(ngram, count)` tuples.
    """
    # A single loop replaces three copy-pasted lines that differed only in n.
    for order in orders:
        yield from Counter(ngrams_iter(hl, order, vocab)).items()

In [48]:
def make_x(hl, vocab):
    """Build the feature dict for one headline.

    Collects the `(ngram, count)` pairs emitted by `features_iter` into a
    dict keyed by n-gram.
    """
    return {gram: count for gram, count in features_iter(hl, vocab)}

In [49]:
# Vocabulary = union of the 5000 most common unigrams, bigrams and trigrams.
# A set comprehension keeps the 1 -> 2 -> 3 evaluation order and does not leak
# a loop variable into the notebook namespace.
vocab = {
    gram
    for order in (1, 2, 3)
    for gram in topk_ngrams(ds, order, 5000)
}

10000it [00:00, 63699.08it/s]
10000it [00:00, 64856.73it/s]
10000it [00:00, 66766.38it/s]


In [52]:
# One feature dict per item of the training split; the second element of each
# pair is not used here.
xs = [
    make_x(headline, vocab)
    for headline, _unused in ds.train
]

In [53]:
# Spot-check the feature dict built for the first training item.
xs[0]

{('detained',): 1,
 ('migrants',): 1,
 ('describe',): 1,
 ('being',): 1,
 ('of',): 1,
 ('food',): 1,
 ('water',): 1,
 ('and',): 1,
 ('sleep',): 1,
 ('in',): 1,
 ('lawsuit',): 1,
 ('of', 'food'): 1,
 ('sleep', 'in'): 1,
 ('in', 'lawsuit'): 1,
 ('detained', 'migrants', 'describe'): 1,
 ('migrants', 'describe', 'being'): 1,
 ('describe', 'being', 'deprived'): 1,
 ('being', 'deprived', 'of'): 1,
 ('deprived', 'of', 'food'): 1,
 ('of', 'food', 'water'): 1,
 ('food', 'water', 'and'): 1,
 ('water', 'and', 'sleep'): 1,
 ('and', 'sleep', 'in'): 1,
 ('sleep', 'in', 'lawsuit'): 1}