In [1]:
import random

import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2
from itertools import islice
from collections import Counter

from span_clf import read_json_gz_lines
from headline_parser import parse_headline

In [2]:
# Parse the first 100,000 records and keep each distinct (spans, domain) pair once.
# NOTE(review): doc._.span_clf_texts must be hashable to be stored in a set —
# presumably a tuple of strings; confirm in headline_parser.
titles = islice(read_json_gz_lines('data/cleaning-titles.json/'), 100000)
rows = {
    (parse_headline(record['title'])._.span_clf_texts, record['domain'])
    for record in tqdm(titles)
}

100000it [00:55, 1810.97it/s]


In [3]:
# One row per unique (spans, domain) pair collected above.
unique_pairs = [*rows]
df = pd.DataFrame(unique_pairs, columns=('spans', 'domain'))

In [4]:
# Balance the classes: downsample every domain to the size of the smallest one.
# The draw is seeded so that re-running the notebook yields the same sample (and
# therefore the same chi2 scores) for a given row order.
# NOTE(review): df is built from a set, so its row order may itself differ
# between interpreter sessions; sort the rows first if bit-for-bit
# reproducibility across kernels is required.
SAMPLE_SEED = 42
min_count = df.groupby('domain').size().min()
df_sampled = df.groupby('domain').apply(
    lambda x: x.sample(min_count, random_state=SAMPLE_SEED)
)

In [5]:
# Number of unique (spans, domain) rows before balancing.
df.shape[0]

99852

In [6]:
# Number of rows left after downsampling every domain to the same size.
df_sampled.shape[0]

22425

In [7]:
# Build one (span-count mapping, domain label) pair per sampled headline,
# then split the pairs into parallel feature and label tuples.
feature_label_pairs = [
    (Counter(record.spans), record.domain)
    for record in df_sampled.itertuples()
]
X, y = zip(*feature_label_pairs)

In [8]:
# Vectorizer for the span -> count mappings; sparse=True keeps the encoded
# matrix in scipy sparse form rather than a dense array.
dv = DictVectorizer(sparse=True)

In [9]:
# Learn the feature vocabulary and encode the mappings in a single pass.
# NOTE(review): X is rebound from the tuple of Counters to the encoded matrix,
# so this cell is not idempotent — re-run the cell that builds X before
# running this one again.
X = dv.fit_transform(X)

In [10]:
# Column index -> span text, aligned with the columns of the encoded matrix.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when the installed version
# provides it and fall back to the old accessor otherwise.
get_names = getattr(dv, 'get_feature_names_out', None) or dv.get_feature_names
# Going through list() keeps the same string-typed array as before, whichever
# accessor was used.
feature_names = np.array(list(get_names()))

In [11]:
# Chi-squared test of every feature column against the domain labels.
# chi2 returns the per-feature statistics and the matching p-values, in that order.
scores, p = chi2(X, y)

In [12]:
# One row per feature: the span text with its chi2 statistic and p-value.
# NOTE(review): this rebinds `df`, replacing the headline frame built earlier,
# so the sampling cell cannot be re-run afterwards without rebuilding it.
chi2_records = list(zip(feature_names, scores, p))
df = pd.DataFrame(chi2_records, columns=('span', 'chi', 'p'))

In [13]:
# Spans whose chi2 p-value is below 0.01, strongest association first.
is_significant = df['p'] < 0.01
df.loc[is_significant].sort_values(by='chi', ascending=False)

Unnamed: 0,span,chi,p
5413,dailycaller,15764.000000,0.000000e+00
3591,breitbart,11130.000000,0.000000e+00
4760,cnn video,4802.000000,0.000000e+00
20224,the daily caller,2086.000000,0.000000e+00
12705,listen now,1582.000000,0.000000e+00
2055,analysis,1547.627907,0.000000e+00
24246,video,1302.102564,1.907191e-269
15642,opinion,1109.537118,4.776708e-228
16168,perspective,882.000000,3.100046e-179
20487,the latest,879.921348,8.641862e-179


In [15]:
# Write the significant spans (p < 0.01) to the blocklist, one per line, in
# descending chi2 order.
blocklist_spans = df[df.p < 0.01].sort_values('chi', ascending=False).span
# An explicit encoding keeps the output independent of the platform default,
# which may be unable to encode non-ASCII span text.
with open('blocklist.txt', 'w', encoding='utf-8') as fh:
    for span in blocklist_spans:
        print(span, file=fh)