In [1]:
import random
import gzip
import ujson

import pandas as pd
import numpy as np

from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2
from itertools import islice
from collections import Counter

from headline_parser import parse_headline

In [2]:
def read_json_gz_lines(root):
    """Stream decoded JSON records from the gzipped line files under ``root``.

    Every file matching ``<root>/*.gz`` is opened with ``gzip.open`` and read
    line by line; each non-blank line is decoded with ``ujson.loads``.
    Files are visited in sorted path order so repeated runs see the records
    in the same sequence.

    Args:
        root (str): Directory that holds the ``*.gz`` files.

    Yields: dict
    """
    # glob() makes no ordering promise; sorting keeps runs repeatable.
    for path in sorted(glob('%s/*.gz' % root)):
        # Default mode is binary, so each line is a bytes object.
        with gzip.open(path) as fh:
            for line in fh:
                # Tolerate blank separator / trailing lines instead of
                # letting the JSON decoder raise on them.
                if not line.strip():
                    continue
                yield ujson.loads(line)

In [3]:
# Collect each distinct (span texts, domain) pair; storing them in a set
# collapses headlines that parse to the same spans for the same domain.
corpus = read_json_gz_lines('data/cleaning-titles.json/')
rows = set()
for row in tqdm(islice(corpus, None)):  # stop=None -> read the whole corpus
    doc = parse_headline(row['title'])
    pair = (doc._.span_clf_texts, row['domain'])
    rows.add(pair)

1941041it [19:08, 1690.31it/s]


In [4]:
# One frame row per unique (spans, domain) pair held in `rows`.
df = pd.DataFrame(data=[*rows], columns=('spans', 'domain'))

In [5]:
# Balance the classes: down-sample every domain to the size of the smallest one.
# A fixed seed makes the draw repeatable for a given row order of `df`, so
# the chi-squared scores computed from this sample no longer change on each re-run.
SAMPLE_SEED = 42
min_count = df.groupby('domain').size().min()
df_sampled = df.groupby('domain').apply(
    lambda x: x.sample(min_count, random_state=SAMPLE_SEED)
)

In [6]:
# Row count of the full, de-duplicated frame.
df.shape[0]

1910216

In [7]:
# Row count after per-domain down-sampling.
df_sampled.shape[0]

448635

In [8]:
# Pair each sampled headline's span counts with its domain label, then
# transpose the pairs into parallel tuples: X (Counters) and y (domains).
labelled_counts = (
    (Counter(record.spans), record.domain)
    for record in df_sampled.itertuples()
)
X, y = zip(*labelled_counts)

In [9]:
# Vectorizer used to turn the per-headline span Counters into a sparse matrix.
dv = DictVectorizer(sparse=True)

In [10]:
# Fit the vectorizer on the span Counters and replace them with the resulting matrix.
# NOTE(review): X is rebound here, so re-running this cell on its own passes the
# matrix (not the Counters) back into fit_transform — re-run the cell that builds X first.
X = dv.fit_transform(X)

In [11]:
# Column index -> span text for the fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = np.asarray(dv.get_feature_names_out())
else:
    feature_names = np.array(dv.get_feature_names())

In [12]:
# Chi-squared test of every span feature against the domain labels:
# `scores` holds the per-feature statistic, `p` the matching p-values.
scores, p = chi2(X, y)

In [13]:
# One row per span feature: its text, chi-squared statistic and p-value.
score_rows = [*zip(feature_names, scores, p)]
sdf = pd.DataFrame(data=score_rows, columns=('span', 'chi', 'p'))

In [14]:
# Spans with p < 0.001, strongest chi-squared score first.
significant = sdf.p < 0.001
sdf.loc[significant].sort_values(by='chi', ascending=False)

Unnamed: 0,span,chi,p
101538,dailycaller,310800.000000,0.000000
66879,breitbart,220548.020936,0.000000
89525,cnn video,99862.000000,0.000000
379831,the daily caller,44450.000000,0.000000
39135,analysis,32182.280230,0.000000
238937,listen now,31808.000000,0.000000
456880,video,29152.507726,0.000000
293881,opinion,25384.203832,0.000000
303401,perspective,16856.000000,0.000000
385210,the latest,15963.829775,0.000000


In [15]:
# Write every span with p < 0.01 to blocklist.txt, strongest chi-squared score
# first, one span per line.
# NOTE(review): the table displayed earlier filters at p < 0.001 — confirm the
# looser 0.01 threshold here is intentional.
# The selection is computed before the file is opened so a failure in the
# filter/sort cannot leave a truncated blocklist behind.
blocklisted = sdf[sdf.p < 0.01].sort_values('chi', ascending=False).span
# Explicit UTF-8: spans containing non-ASCII text must not depend on the
# platform's default encoding.
with open('blocklist.txt', 'w', encoding='utf-8') as fh:
    for span in blocklisted:
        print(span, file=fh)

In [20]:
# Spot check: print the span tuple of every headline that contains this
# boilerplate span.
needle = '# things i wish someone told me before i launched my start up'
# Map over the `spans` column only; DataFrame.apply(axis=1) would build a
# Series for each row of the whole frame just to run the same membership test.
has_needle = df.spans.map(lambda spans: needle in spans)
for s in df[has_needle].spans:
    print(s)

('# things i wish someone told me before i launched my start up', 'josh tetrick ceo and co founder of hampton creek')
('# things i wish someone told me before i launched my start up', 'robert tallack ceo of conversionpoint technologies')
('# things i wish someone told me before i launched my start up', 'randy and angie stocklin founders of readerscom')
('# things i wish someone told me before i launched my start up', 'ashish rangnekar ceo and co founder of benchprep')
('# things i wish someone told me before i launched my start up', 'ran sarig co founder and ceo of datorama')
('# things i wish someone told me before i launched my start up', 'jason baudendistel founder of wibbets inc')
('# things i wish someone told me before i launched my start up', 'andy monfried founder and ceo lotame')
('# things i wish someone told me before i launched my start up', 'phoebe hugh co founder and ceo of brolly')
('# things i wish someone told me before i launched my start up', 'ayah bdeir founder and 