In [1]:
import pandas as pd
import numpy as np

from itertools import combinations
from scipy import stats
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.feature_selection import chi2

from news_vec.corpus import HeadlineDataset
from news_vec.encoder import read_preds

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever one is installed.
_muted_style = (
    'seaborn-v0_8-muted'
    if 'seaborn-v0_8-muted' in mpl.style.available
    else 'seaborn-muted'
)
mpl.style.use(_muted_style)
sns.set(style="whitegrid")

In [3]:
# Location of the serialized dataset, relative to this notebook.
DATASET_PATH = '../../data/ava.p'
ds = HeadlineDataset.load(DATASET_PATH)

In [4]:
# Iterating the dataset yields pairs; only the first element of each pair
# is kept and becomes one row of the frame.
records = [record for record, _ in ds]
df = pd.DataFrame(records)

In [5]:
def _lowercase_tokens(tokens):
    """Return a new list with every token in `tokens` lower-cased."""
    return [token.lower() for token in tokens]


# Case-folded copy of each row's token list.
df['tokens_lower'] = df.tokens.map(_lowercase_tokens)

In [6]:
# Partition the frame using the `split` label stored on each row.
is_train = df.split == 'train'
is_test = df.split == 'test'
df_train = df[is_train]
df_test = df[is_test]

In [7]:
def _passthrough(doc):
    """Return the input unchanged (used to bypass preprocessing/tokenizing)."""
    return doc


# TF-IDF over word 1- to 3-grams. Both the preprocessor and the tokenizer are
# identity functions, so each document is consumed exactly as it is passed in.
tv = TfidfVectorizer(
    tokenizer=_passthrough,
    preprocessor=_passthrough,
    analyzer='word',
    ngram_range=(1, 3),
    token_pattern=None,
)

In [8]:
# Fit the vectorizer on the training documents only, then reuse the fitted
# vectorizer to transform the test documents.
train_docs, test_docs = df_train.tokens_lower, df_test.tokens_lower
X_train = tv.fit_transform(train_docs)
X_test = tv.transform(test_docs)

In [9]:
# The 'sag' solver shuffles the training data, so pin the seed to make the
# fitted coefficients (and every metric derived from them) reproducible.
SEED = 0
clf = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    verbose=True,
    n_jobs=-1,
    random_state=SEED,
)
# `fit` is bound to whatever clf.fit() returns; the name is kept unchanged
# so code that refers to `fit` continues to work.
fit = clf.fit(X_train, df_train.domain)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 18 epochs took 27 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   28.3s finished


In [10]:
# Predict a domain for every test row and score against the true labels.
y_true_test = df_test.domain
y_test_pred = fit.predict(X_test)
acc = metrics.accuracy_score(y_true_test, y_test_pred)

In [11]:
# Display the value bound to `acc` as this cell's output.
acc

0.5163758152652917

In [12]:
# Per-class precision / recall / F1 for the test predictions.
report = metrics.classification_report(df_test.domain, y_test_pred)
print(report)

                    precision    recall  f1-score   support

        apnews.com       0.47      0.48      0.47      1909
     bloomberg.com       0.51      0.58      0.54      1942
     breitbart.com       0.77      0.75      0.76      1910
      buzzfeed.com       0.57      0.79      0.67      1949
           cnn.com       0.69      0.26      0.37      1854
   dailycaller.com       1.00      0.86      0.92      1959
      dailykos.com       0.47      0.69      0.56      1990
       foxnews.com       0.35      0.37      0.36      1822
huffingtonpost.com       0.35      0.30      0.32      1802
         msnbc.com       0.46      0.57      0.51      1833
           npr.org       0.38      0.37      0.37      1874
       nytimes.com       0.45      0.34      0.39      1923
       thehill.com       0.42      0.56      0.48      1884
washingtonpost.com       0.59      0.45      0.51      1835
           wsj.com       0.44      0.33      0.37      1879

         micro avg       0.52      0.5

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installs.
feature_names = (
    tv.get_feature_names_out()
    if hasattr(tv, 'get_feature_names_out')
    else tv.get_feature_names()
)

In [14]:
# For each domain, score every feature with a chi-squared test against the
# binary "row belongs to this domain" label and print the top scorers.
TOP_K = 20  # number of highest-scoring features considered per domain

# Loop-invariant inputs, built once instead of once per domain.
feature_arr = np.asarray(feature_names)
train_domains = df.loc[df.split == 'train', 'domain']

for d in df.domain.unique():
    scores, _ = chi2(X_train, train_domains == d)
    # argsort is ascending, so the last TOP_K indices are the highest scores
    # (the strongest feature is printed last).
    top_names = feature_arr[np.argsort(scores)[-TOP_K:]]
    # Features containing a backtick are dropped after the top-K cut, so fewer
    # than TOP_K names may be shown (presumably to keep the Markdown-style
    # output from breaking -- confirm).
    names = ', '.join(n for n in top_names if '`' not in n)
    print(f'- **{d}** - {names}\n')

- **cnn.com** - the bell, know before the, : live, before the bell, premarket :, premarket, : live updates, fast facts, trump - cnn, - cnn.com, cnn.com, ? -, ' - cnn, ? - cnn, -, video, cnn, cnn video, - cnn video, - cnn

- **dailycaller.com** - ? via dailycaller, - the daily, ' [ video, caller, ' [, the daily caller, daily caller, ' via, ' via dailycaller, video ] via, [ video, video ], [ video ], ] via, ] via dailycaller, ], [, via, dailycaller, via dailycaller

- **breitbart.com** - illegal, - ', nolte, illegal aliens, delingpole :, amnesty, delingpole, report :, : ', cartel, |, ', ' |, ' | breitbart, ' -, ' - breitbart, -, | breitbart, - breitbart, breitbart

- **dailykos.com** - for night, round up :, night owls, for night owls, open thread for, thread for night, thread for, daily kos elections, kos elections, pundit, thread, cartoon :, abbreviated pundit, open thread, abbreviated, trumpcare, digest :, daily kos, kos, digest

- **npr.org** - top stories, top stories :, on mountain

In [34]:
# Rebuild a single headline string by space-joining each row's token list.
df['title'] = df.tokens.apply(' '.join)

In [75]:
# Print up to ten titles that contain the pattern 'dems'.
has_dems = df.title.str.contains('dems')
for headline in df.loc[has_dems, 'title'].head(10):
    print(headline)

GOP Senator blasts dems for releasing ' sensitive ' EPA documents on Pruitt security
Ad in Georgia election ties dems to Scalise shooting
THIS is why we sometimes ( grudgingly ) support " conservadems "


In [41]:
# Print up to ten titles that contain the pattern 'APNewsBreak'.
apnewsbreak_titles = df.title[df.title.str.contains('APNewsBreak')]
for headline in apnewsbreak_titles.iloc[:10]:
    print(headline)

APNewsBreak : Indians removing Chief Wahoo logo from uniforms
APNewsBreak : $ 4 M for tiny Wisconsin airport near golf course
APNewsBreak : US suspects cellphone spying devices in DC
APNewsBreak : White nationalist to drop Ohio State lawsuit
APNewsBreak : Border wall models thwart US commandos in tests
APNewsBreak : Witness says he lied about casino gang killing
APNewsBreak : New governor toured Iowa on casino tycoon 's jet
APNewsBreak : Senator who freed Holt urges Venezuela dialogue
APNewsBreak : US yanks funds from unbuilt windmill farm
APNewsBreak : Kansas mental hospital fails federal review


In [105]:
# For up to ten rows whose title contains 'Perspective', print the domain,
# the title, and the space-joined clf_tokens on separate lines.
perspective_rows = df[df.title.str.contains('Perspective')].head(10)
for row in perspective_rows.itertuples():
    print(row.domain, row.title, ' '.join(row.clf_tokens), sep='\n')

washingtonpost.com
Perspective | What to do with an ugly symbol of racial violence ? Accession it into the Smithsonian .
what to do with an ugly symbol of racial violence accession it into the smithsonian
washingtonpost.com
Perspective | What Google and Facebook must do about one of their biggest problems
what google and facebook must do about one of their biggest problems
washingtonpost.com
Perspective | Are big poultry companies abusing SBA loans ?
are big poultry companies abusing sba loans
washingtonpost.com
Perspective | Where sweat and blood are not an uncommon occurrence : Calla Kessler on venturing into boxing 's red corner .
calla kessler on venturing into boxing s red corner
washingtonpost.com
Perspective | ' We all have the right to defend freedom ' : Transgender veterans speak out against Trump 's ban
we all have the right to defend freedom
washingtonpost.com
Perspective | Survey says veterans strongly back legalizing medical marijuana
survey says veterans strongly back leg

In [102]:
def _has_dem_token(tokens):
    """Return True when the exact token 'dem' is present in `tokens`."""
    return 'dem' in tokens


# Print domain and title for up to thirty rows whose clf_tokens include 'dem'.
mentions_dem = df.clf_tokens.apply(_has_dem_token)
for row in df[mentions_dem].head(30).itertuples():
    print(row.domain, row.title)

thehill.com Cuomo leads Nixon 2 to 1 in first poll of likely NY Dem primary voters
dailykos.com Dem Senators OFFICIALLY introduce bill to fix the single most obvious problem w / the ACA
apnews.com Trump opposition inspires Dem focus on statehouse wins
dailycaller.com Conyers Accusers Not To Be Believed Because They 're ' All White Women , ' According To Dem Congressman Via dailycaller
thehill.com Dem House candidate in West Virginia voted for Trump
dailycaller.com White House Reporters Correct Erroneous Report About Trump , John Kelly And Dem FISA Memo Via dailycaller
foxnews.com Trump plays hardball in tax reform kickoff , singles out Dem senator
thehill.com Top Senate Homeland Security Dem calls for select committee to investigate Russia influence in U.S. politics
thehill.com The Memo : Trump tries to deepen Dem divisions
foxnews.com Kavanaugh avoids Dem traps as chaotic hearing winds down , confirmation vote looms
dailycaller.com Dem PAC Encouraging Leftists To ' Take Out ' Scalise 

In [74]:
# Print the first 200 titles whose domain is foxnews.com.
fox_titles = df.loc[df.domain == 'foxnews.com', 'title'].head(200)
for headline in fox_titles:
    print(headline)

Pope Francis : Keep ' status quo ' in Jerusalem to avoid conflict
As North Korea threat looms , US Navy tests ballistic missile shootdown capability
' National disgrace ' : Community fights back as California overrun by homelessness , human waste , needles
Maxine Waters supporters burn American flag outside California rep 's office
The world 's most expensive taco is ridiculously expensive
Federal funding for Public Broadcasting faces elimination under Trump 's budget
Toddler suffers second degree burns from pacifier clip , mom claims
Florida inmate held on child porn charges allegedly tried to hire ' Rabbi ' to kill judge
WWII Army veteran honors wife 's memory by giving away flowers at retirement home
Letter opened at Virginia 's Joint Base Myer Henderson Hall triggers hazmat situation ; 11 fall ill
Ocasio Cortez : The Upper Middle Class ' Does n't Exist Anymore in America '
' They Should Be Walking With Blindfolds ' : Schumer Blasts GOP for ' TrumpCare II ' Bill
Ford Mustang found i