In [1]:
import random
import gzip
import ujson

import pandas as pd
import numpy as np

from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2
from itertools import islice
from collections import Counter

from headline_parser import parse_headline

In [None]:
def read_json_gz_lines(root):
    """Read a JSON-lines corpus stored as gzip files directly under ``root``.

    Every file matching ``<root>/*.gz`` is read in sorted path order, so the
    sequence of yielded records is the same on every run. Bare ``glob``
    results come back in arbitrary, filesystem-dependent order, which matters
    as soon as a caller truncates the stream (e.g. with ``islice``).

    Args:
        root: Directory containing the ``*.gz`` files. A trailing slash is
            harmless.

    Yields:
        dict: One parsed JSON record per non-blank line.
    """
    for path in sorted(glob('%s/*.gz' % root)):
        with gzip.open(path) as fh:
            for line in fh:
                # Blank lines (e.g. a stray newline at end of file) are not
                # JSON documents; skip them instead of letting the parser raise.
                if not line.strip():
                    continue
                yield ujson.loads(line)

In [None]:
# Parse every headline and keep each distinct (spans, domain) pair once.
# The spans value goes into a set, so it has to be hashable.
# `islice(..., None)` applies no limit; replace None with an int to run on
# a prefix of the corpus.
rows = set()
corpus = read_json_gz_lines('data/cleaning-titles.json/')
for record in tqdm(islice(corpus, None)):
    spans = parse_headline(record['title'])._.span_clf_texts
    rows.add((spans, record['domain']))

399620it [04:14, 993.76it/s]

In [None]:
# One row per distinct (spans, domain) pair.
records = list(rows)
df = pd.DataFrame(records, columns=['spans', 'domain'])

In [None]:
# Balance the classes: downsample every domain to the size of the smallest
# domain group.
min_count = df.groupby('domain').size().min()

# A single seeded generator is shared across groups so the draw is repeatable
# for a given `df`, while each domain still gets its own draw from the stream.
# NOTE(review): `df` is built from a set, so its row order may itself vary
# between kernel sessions — confirm whether that also needs pinning.
sample_rng = np.random.RandomState(0)
df_sampled = df.groupby('domain').apply(
    lambda x: x.sample(min_count, random_state=sample_rng)
)

In [None]:
# Number of distinct (spans, domain) rows before balancing.
df.shape[0]

In [None]:
# Number of rows left after downsampling every domain to the same size.
df_sampled.shape[0]

In [10]:
# Turn each sampled row into (span counts, domain label), then unzip the
# pairs into parallel tuples of features and targets.
labelled = ((Counter(rec.spans), rec.domain) for rec in df_sampled.itertuples())
X, y = zip(*labelled)

In [11]:
# Vectorizer that maps each row's {span: count} mapping to one row of a
# sparse feature matrix (one column per distinct span).
dv = DictVectorizer(sparse=True)

In [12]:
# Learn the span vocabulary and vectorize the rows in one pass.
# NOTE: this rebinds `X` from the tuple of Counters to the vectorized matrix,
# so re-running this cell without first re-running the `X, y = ...` cell
# feeds the matrix back into the vectorizer.
X = dv.fit_transform(X)

In [13]:
# Column index -> span text, in the same order as the columns of X.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older releases.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = np.asarray(dv.get_feature_names_out())
else:
    feature_names = np.array(dv.get_feature_names())

In [14]:
# Chi-squared test of every span feature against the domain label; yields
# one statistic and one p-value per feature column of X.
scores, p = chi2(X, y)

In [15]:
# One row per span feature with its chi-squared statistic and p-value.
score_records = zip(feature_names, scores, p)
sdf = pd.DataFrame(list(score_records), columns=['span', 'chi', 'p'])

In [25]:
# Spans with p < 1e-6, strongest chi-squared statistic first.
sdf.loc[sdf['p'] < 1e-6].sort_values(by='chi', ascending=False)

Unnamed: 0,span,chi,p
109366,dailycaller,310310.000000,0.000000e+00
71903,breitbart,219750.021012,0.000000e+00
96504,cnn video,101220.000000,0.000000e+00
405481,the daily caller,45584.000000,0.000000e+00
255299,listen now,31808.000000,0.000000e+00
41150,analysis,30806.459716,0.000000e+00
481835,video,29101.234184,0.000000e+00
314064,opinion,24712.996295,0.000000e+00
324531,perspective,16856.000000,0.000000e+00
410952,the latest,16577.459069,0.000000e+00


In [18]:
# Persist every span with p < 0.01, strongest chi-squared statistic first,
# one span per line.
# The selection is computed before the file is opened so that a failure here
# cannot leave behind a truncated blocklist.txt.
blocklist_spans = sdf[sdf.p < 0.01].sort_values('chi', ascending=False).span

# UTF-8 is requested explicitly: the spans come from headline text, and the
# platform default encoding (e.g. cp1252 on Windows) can fail on non-ASCII
# characters.
with open('blocklist.txt', 'w', encoding='utf-8') as fh:
    for span in blocklist_spans:
        print(span, file=fh)

In [23]:
# Comma-separated preview of the 50 strongest spans among those with p < 1e-4.
top_spans = sdf.loc[sdf['p'] < 1e-4].sort_values(by='chi', ascending=False)['span'].iloc[:50]
', '.join(top_spans)

'dailycaller, breitbart, cnn video, the daily caller, listen now, analysis, video, opinion, perspective, the latest, report, ap news, cnn, the huffington post, cartoon, fmr, markets wrap, d, open thread for night owls, morning digest, exclusive, midday open thread, matthews, abbreviated pundit round up, bloomberg, watch, poll, joe, review, cnnpolitics, lawrence, first listen, delingpole, trump, episode #, breaking, chart, paid program, r, top stories, bloomberg professional services, tiny desk concert, slideshow, abbreviated pundit roundup, police, the morning download, ap fact check, sign the petition, flashback, add your name'