In [1]:
import pandas as pd

from tqdm import tqdm

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, train_test_split

from news_vec.utils import read_json_gz_lines

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [3]:
df = pd.DataFrame(list(tqdm(read_json_gz_lines('../data/clf-articles.json/'))))

1130569it [00:17, 64642.42it/s]


In [10]:
df['tokens_key'] = df.clf_tokens.apply(lambda x: tuple(x))
df = df.drop_duplicates(['tokens_key', 'domain'])

In [14]:
min_count = df.groupby('domain').size().min()

In [15]:
def imp_acc(domain, cv=10):
    
    pdf = df[df.domain==domain].sample(min_count).copy()
    pdf['imp_rank'] = pdf.impressions.rank(pct=True)
    pdf['imp_high'] = pdf.imp_rank > 0.5
    
    X = pdf.clf_tokens
    y = pdf.imp_high
    
    tv = TfidfVectorizer(
        analyzer='word',
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        ngram_range=(1,3),
        token_pattern=None,
    )
    
    X = tv.fit_transform(X)
    
    clf = LinearSVC()
    return cross_val_score(clf, X, y, cv=cv)

In [17]:
for d in df.domain.unique():
    print(d, imp_acc(d))

wsj.com [0.54742345 0.53622106 0.52800597 0.5373413  0.54518297 0.53099328
 0.54314531 0.53811659 0.51980568 0.53213752]
dailycaller.com [0.55041075 0.55489171 0.54854369 0.5489171  0.54779686 0.5489171
 0.52297348 0.55754858 0.53736921 0.53961136]
huffingtonpost.com [0.58177745 0.58961912 0.58663181 0.58289768 0.60567588 0.58961912
 0.58759806 0.58557549 0.58669656 0.58109118]
foxnews.com [0.54929052 0.55601195 0.55339806 0.55003734 0.53584765 0.54630321
 0.55061636 0.53961136 0.55007474 0.55119581]
washingtonpost.com [0.58289768 0.57356236 0.58700523 0.57692308 0.57132188 0.57094847
 0.57639148 0.57361734 0.56801196 0.58669656]
sputniknews.com [0.51605676 0.49477222 0.50373413 0.52800597 0.50261389 0.50709485
 0.51624953 0.51644245 0.50186846 0.49215247]
nytimes.com [0.56497386 0.55526512 0.56572069 0.55974608 0.56721434 0.55787901
 0.58685095 0.55044843 0.56651719 0.56838565]
thehill.com [0.57281553 0.56011949 0.57094847 0.56646751 0.55675878 0.55003734
 0.57713859 0.57399103 0.5556