In [28]:
import pandas as pd
import numpy as np

from itertools import combinations
from scipy import stats
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from news_vec.corpus import HeadlineDataset, Corpus
from news_vec.encoder import read_preds

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases removed the old spellings, so choose whichever name this
# installation actually provides instead of hard-coding 'seaborn-muted'.
_MUTED_STYLE = next(
    (
        name
        for name in ('seaborn-v0_8-muted', 'seaborn-muted')
        if name in mpl.style.available
    ),
    'seaborn-muted',  # fall back to the original name if neither is listed
)
mpl.style.use(_MUTED_STYLE)

# Runs after the matplotlib style sheet, so any rcParams seaborn sets for the
# "whitegrid" theme are the ones left in effect.
sns.set(style="whitegrid")

In [3]:
# Load the pre-built headline dataset from disk (path is relative to the notebook).
# NOTE(review): the ".p" suffix suggests a pickle file — confirm, and only load
# such files from a trusted source.
ds = HeadlineDataset.load('../data/ava.p')

In [4]:
# Show the dataset's repr as the cell output.
# NOTE(review): the three numbers in the repr are presumably split sizes — confirm.
ds

HeadlineDataset<225696/28212/28212>

In [5]:
def _rows_to_frame(pairs):
    """Build a DataFrame from the first element of every 2-item entry in `pairs`.

    Each entry is unpacked into exactly two values; the second one is discarded.
    """
    return pd.DataFrame([row for row, _unused in pairs])


# One frame per split; only the row part of each (row, <other>) pair is kept.
train_df = _rows_to_frame(ds.train)
test_df = _rows_to_frame(ds.test)

In [14]:
# Training inputs are the per-headline token column; targets are the source domain.
X_train = train_df['clf_tokens']
y_train = train_df['domain']

In [15]:
# Same input/target columns as the training split, taken from the test frame.
X_test = test_df['clf_tokens']
y_test = test_df['domain']

In [16]:
def _as_is(tokens):
    """Return the input unchanged (used as a no-op text-processing hook)."""
    return tokens


# TF-IDF over word 1- to 3-grams. Both the preprocessor and tokenizer hooks are
# pass-throughs, so whatever sequence each row holds is used as the token list
# directly (the clf_tokens column appears to be pre-tokenized lists).
# token_pattern is not consulted once a tokenizer is supplied; None makes that explicit.
tv = TfidfVectorizer(
    ngram_range=(1, 3),
    analyzer='word',
    preprocessor=_as_is,
    tokenizer=_as_is,
    token_pattern=None,
)

In [17]:
# Vectorize straight from the DataFrame columns rather than from X_train/X_test
# themselves. The previous form (X = f(X)) was not idempotent: re-running the
# cell fed the already-vectorized sparse matrices back into the vectorizer.
# On a fresh top-to-bottom run the result is unchanged, because X_train/X_test
# were bound to exactly these columns in the cells above.

# Learn the vocabulary and IDF weights on the training headlines only ...
X_train = tv.fit_transform(train_df.clf_tokens)
# ... and reuse that fitted vocabulary for the test headlines.
X_test = tv.transform(test_df.clf_tokens)

In [18]:
# Pin the seed: LinearSVC's solver uses a random number generator when fitting,
# so without random_state the reported accuracy can shift between runs.
clf = LinearSVC(random_state=0)

# scikit-learn's fit() returns the estimator itself, so `fit` and `clf` refer to
# the same fitted model; later cells predict through `fit`.
fit = clf.fit(X_train, y_train)

In [19]:
# Predicted domain for every held-out headline, in test-set row order.
y_test_pred = fit.predict(X_test)

# Fraction of held-out headlines whose predicted domain matches the true one.
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [20]:
# Display the held-out accuracy of the domain classifier.
acc

0.3655536651070466

In [22]:
# Attach the predictions as a 'pred' column so they can be sliced by true domain.
# The predictions were computed from test_df's own token column, so they line up
# with the frame's rows in order. NOTE: this mutates test_df in place.
test_df['pred'] = y_test_pred

In [25]:
# Tally which outlets the classifier predicted for headlines whose true domain
# is huffingtonpost.com (i.e. one row of the confusion matrix).
is_huffpost = test_df['domain'] == 'huffingtonpost.com'
Counter(test_df.loc[is_huffpost, 'pred'])

Counter({'dailykos.com': 184,
         'dailycaller.com': 88,
         'huffingtonpost.com': 437,
         'washingtonpost.com': 128,
         'buzzfeed.com': 285,
         'thehill.com': 74,
         'wsj.com': 60,
         'breitbart.com': 94,
         'npr.org': 80,
         'cnn.com': 45,
         'bloomberg.com': 50,
         'msnbc.com': 75,
         'foxnews.com': 114,
         'nytimes.com': 74,
         'apnews.com': 42})

In [29]:
# Load the article corpus that the next cell samples from.
# The recorded log shows roughly 1.08M headlines being read in about 15 seconds,
# so expect this cell to take a moment on a full re-run.
corpus = Corpus('../data/clf-articles.json/')

2019-01-06 11:33:51,987 | INFO : Reading headlines.
1081790it [00:15, 68720.69it/s] 


In [30]:
# Draw a HuffPost-vs-BuzzFeed sample from the corpus. BuzzFeed was the most
# frequent wrong prediction for HuffPost headlines in the Counter output above
# (285 of them), which makes this pair worth inspecting.
# NOTE(review): sample_ab's sampling scheme and seeding are not visible here —
# confirm whether the draw is reproducible across runs.
ab = corpus.sample_ab('huffingtonpost.com', 'buzzfeed.com')

Unnamed: 0_level_0,Unnamed: 1_level_0,article_id,clf_tokens,domain,impressions,tokens,window,windows
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
buzzfeed.com,38533,936302894288,"[these, three, pictures, of, jon, hamm, will, ...",buzzfeed.com,25012,"[These, Three, Pictures, Of, Jon, Hamm, Will, ...",15,"[15, 12, 19, 16, 13, 20, 17, 18, 14, 11]"
buzzfeed.com,1046525,506806151522,"[the, #, best, #, worst, and, #, most, underra...",buzzfeed.com,125757,"[The, 11, Best, ,, 11, Worst, ,, And, 11, Most...",88,"[84, 81, 88, 85, 82, 86, 83, 87, 79, 80]"
buzzfeed.com,583319,1005022349917,"[choose, your, ideal, golden, globe, winners, ...",buzzfeed.com,19653,"[Choose, Your, Ideal, Golden, Globe, Winners, ...",59,"[51, 52, 56, 53, 50, 57, 54, 58, 55, 59]"
buzzfeed.com,967071,197568496982,"[this, #, year, old, grandma, did, a, keg, sta...",buzzfeed.com,54622,"[This, 79, Year, Old, Grandma, Did, A, Keg, St...",24,"[15, 19, 16, 20, 17, 24, 21, 18, 25, 22, 23]"
buzzfeed.com,1035523,1340029822594,"[yes, malcolm, turnbull, took, the, japanese, ...",buzzfeed.com,48393,"[Yes, ,, Malcolm, Turnbull, Took, The, Japanes...",0,"[0, 1, 2]"
buzzfeed.com,535755,1271310338883,"[want, to, be, best, friends, with]",buzzfeed.com,11573,"[19, Women, You, 'd, 100, %, Want, To, Be, Bes...",12,"[15, 12, 9, 13, 10, 7, 14, 11, 8]"
buzzfeed.com,865704,25769834866,"[the, stranger, things, kids, looked, so, fric...",buzzfeed.com,249835,"[The, "", Stranger, Things, "", Kids, Looked, So...",1,"[0, 1]"
buzzfeed.com,443278,738734389524,"[travis, reinking, is, on, suicide, watch, and...",buzzfeed.com,27624,"[Travis, Reinking, Is, On, Suicide, Watch, And...",73,"[70, 67, 74, 71, 68, 75, 72, 69, 76, 73]"
buzzfeed.com,768534,1365799643392,"[why, are, we, obsessed, with, how, and, when,...",buzzfeed.com,90166,"[Why, Are, We, Obsessed, With, How, And, When,...",37,"[45, 37, 38, 42, 39, 36, 43, 40, 44, 41]"
buzzfeed.com,381858,369367204834,"[if, these, first, lines, in, songs, do, nt, t...",buzzfeed.com,11743,"[If, These, First, Lines, In, Songs, Do, n't, ...",49,"[51, 48, 52, 49, 46, 53, 50, 54, 55, 47]"
