In [1]:
import random
import gzip
import ujson

import pandas as pd
import numpy as np

from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2
from itertools import islice
from collections import Counter

from headline_parser import parse_headline

In [2]:
def read_json_gz_lines(root):
    """Stream records from every gzipped JSON-lines file in a directory.

    Args:
        root: Directory that is scanned (non-recursively) for ``*.gz`` files.

    Yields:
        dict: The parsed JSON document of each non-blank line, file by file
        in sorted path order.
    """
    # glob() returns matches in arbitrary, filesystem-dependent order;
    # sorting makes the record stream repeatable from run to run.
    for path in sorted(glob('%s/*.gz' % root)):
        with gzip.open(path) as fh:
            for line in fh:
                # Skip blank lines (e.g. a stray trailing newline) instead
                # of letting the JSON parser raise on them.
                if not line.strip():
                    continue
                yield ujson.loads(line)

In [3]:
# Collect the unique (span texts, domain) pairs of the corpus; using a set
# collapses pairs that occur more than once.
rows = set()
# islice(..., None) imposes no limit; replace None with an int to process
# only a prefix of the corpus.
corpus = islice(read_json_gz_lines('data/cleaning-titles.json/'), None)
for row in tqdm(corpus):
    doc = parse_headline(row['title'])
    rows.add((doc._.span_clf_texts, row['domain']))

1941041it [20:49, 1553.97it/s]


In [8]:
# Materialise the de-duplicated pairs as a two-column frame.
df = pd.DataFrame(
    data=list(rows),
    columns=('spans', 'domain'),
)

In [9]:
# Balance the classes: down-sample every domain to the size of the smallest
# one. The seed is fixed so the draw is repeatable for a given row order of
# `df`; without it every run selects different rows and yields a different
# block-list.
SAMPLE_SEED = 1
min_count = df.groupby('domain').size().min()
df_sampled = df.groupby('domain').apply(
    lambda x: x.sample(min_count, random_state=SAMPLE_SEED)
)

In [10]:
# Row count of the full, unbalanced frame.
len(df.index)

1910216

In [11]:
# Row count after per-domain down-sampling.
len(df_sampled.index)

448635

In [12]:
# Pair a span-count Counter (the features) with the domain (the label) for
# every sampled row, then split the pairs into two parallel tuples.
X, y = zip(*(
    (Counter(spans), domain)
    for spans, domain in zip(df_sampled.spans, df_sampled.domain)
))

In [13]:
# Vectorizer used below to turn the per-row span Counters into a sparse
# feature matrix (one column per distinct span).
dv = DictVectorizer(sparse=True)

In [14]:
# Fit the vocabulary and encode the Counters in one pass.
# NOTE: this rebinds X from the tuple of Counters to the encoded matrix, so
# the cell is not idempotent -- re-run the cell that builds X before
# re-running this one.
X = dv.fit_transform(X)

In [15]:
# Column index -> span text, in the same order as the columns of X.
# scikit-learn 1.0 deprecated DictVectorizer.get_feature_names() and 1.2
# removed it in favour of get_feature_names_out(); prefer the new accessor
# and fall back to the old one so the cell runs on either side of the change.
if hasattr(dv, 'get_feature_names_out'):
    # list(...) keeps the resulting array a plain string array, matching
    # what the legacy list-returning accessor produced.
    feature_names = np.array(list(dv.get_feature_names_out()))
else:
    feature_names = np.array(dv.get_feature_names())

In [16]:
# Chi-squared statistic and p-value of every span feature against the
# domain labels.
scores, p = chi2(X, y)

In [17]:
# Line each span up with its chi-squared statistic and p-value.
sdf = pd.DataFrame(
    data=list(zip(feature_names, scores, p)),
    columns=('span', 'chi', 'p'),
)

In [23]:
# Keep only spans whose p-value is below 0.0001, ordered from the largest
# chi-squared statistic to the smallest.
is_significant = sdf.p < 0.0001
blocklist = sdf.loc[is_significant].sort_values(by='chi', ascending=False)

In [24]:
# Display the selected spans together with their chi-squared statistic and
# p-value.
blocklist

Unnamed: 0,span,chi,p
100902,dailycaller,311192.000000,0.000000
66384,breitbart,219096.005749,0.000000
89069,cnn video,99750.000000,0.000000
380047,the daily caller,45178.000000,0.000000
238674,listen now,31430.000000,0.000000
39020,analysis,30245.365617,0.000000
456738,video,29095.698324,0.000000
293867,opinion,24799.477636,0.000000
303314,perspective,17976.000000,0.000000
385169,the latest,16256.206738,0.000000


In [27]:
# Persist the block-list, one span per line, in the row order of `blocklist`.
# The encoding is pinned so spans containing non-ASCII characters are written
# identically on every platform instead of depending on the locale's default
# codec.
with open('blocklist.txt', 'w', encoding='utf-8') as fh:
    for span in blocklist.span:
        print(span, file=fh)

In [26]:
# Comma-separated preview of the first 50 spans in the block-list.
', '.join(blocklist['span'].iloc[:50])

'dailycaller, breitbart, cnn video, the daily caller, listen now, analysis, video, opinion, perspective, the latest, report, ap news, cnn, cnncom, cartoon, the huffington post, d, markets wrap, exclusive, open thread for night owls, morning digest, matthews, abbreviated pundit round up, midday open thread, watch, review, joe, poll, cnnpolitics, lawrence, first listen, episode #, delingpole, bloomberg, trump, bloomberg professional services, r, sign the petition, breaking, tiny desk concert, the morning download, top stories, chart, slideshow, police, the morning risk report, abbreviated pundit roundup, paid program, add your name, ap fact check'