In [1]:
import pandas as pd
import numpy as np

from itertools import combinations
from scipy import stats
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from news_vec.corpus import HeadlineDataset, Corpus
from news_vec.encoder import read_preds

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so the bare 'seaborn-muted' raises on current
# installs. Use whichever spelling this matplotlib knows about.
_muted_style = (
    'seaborn-v0_8-muted'
    if 'seaborn-v0_8-muted' in mpl.style.available
    else 'seaborn-muted'
)
mpl.style.use(_muted_style)

# Applied after the matplotlib style so seaborn's whitegrid theme takes effect.
sns.set(style="whitegrid")

In [3]:
# Location of the serialized headline dataset, relative to this notebook.
DATASET_PATH = '../data/ava.p'

# Restore the dataset object; its train / test splits are consumed below.
ds = HeadlineDataset.load(DATASET_PATH)

In [4]:
ds

HeadlineDataset<225696/28212/28212>

In [5]:
# Every item of a split unpacks into a pair; only the first element (the row)
# is kept and the second is discarded. One DataFrame is built per split, in
# the order (train, test).
train_df, test_df = (
    pd.DataFrame([row for row, _unused in split])
    for split in (ds.train, ds.test)
)

In [6]:
# Training inputs are the token column; the target is the source domain.
X_train = train_df['clf_tokens']
y_train = train_df['domain']

In [7]:
# Held-out inputs and targets, taken from the same columns as the training split.
X_test = test_df['clf_tokens']
y_test = test_df['domain']

In [8]:
def _identity(tokens):
    """Return the argument unchanged.

    Used as both preprocessor and tokenizer so the vectorizer consumes each
    document exactly as supplied (documents are expected to already be token
    sequences). A named function is used instead of a lambda so the
    vectorizer can be pickled; lambdas cannot.
    """
    return tokens


# TF-IDF over word 1- to 3-grams of the supplied tokens.
# token_pattern=None because a custom tokenizer is given, so the regex
# pattern would be unused.
tv = TfidfVectorizer(
    analyzer='word',
    tokenizer=_identity,
    preprocessor=_identity,
    ngram_range=(1,3),
    token_pattern=None,
)

In [9]:
# Vectorize straight from the source frames instead of from X_train / X_test.
# The original form overwrote its own inputs, so re-running this cell fed the
# already-vectorized sparse matrices back into the vectorizer. Reading from
# train_df / test_df keeps the cell idempotent and the output names the same.
# The vocabulary and IDF weights are learned on the training split only and
# then reused for the test split.
X_train = tv.fit_transform(train_df.clf_tokens)
X_test = tv.transform(test_df.clf_tokens)

In [10]:
# Multinomial logistic regression over the TF-IDF features.
# The 'sag' solver is stochastic, so without a fixed random_state the learned
# coefficients (and the accuracy reported below) change from run to run.
# Pinning the seed makes Restart & Run All reproducible.
clf = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    random_state=0,
    verbose=True,
    n_jobs=-1,
)

# `fit` is the fitted estimator returned by clf.fit; later cells use this name.
fit = clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 20 epochs took 27 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   28.4s finished


In [11]:
# Predicted labels for the held-out split, then the share that match the
# true domains.
y_test_pred = fit.predict(X_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [12]:
acc

0.35463632496809866

In [13]:
# Per-class probabilities for every held-out row: one row per test example,
# one column per entry of fit.classes_.
preds = fit.predict_proba(X=X_test)

In [14]:
preds

array([[0.02478921, 0.02878849, 0.03348038, ..., 0.0170325 , 0.07497745,
        0.01816498],
       [0.04781053, 0.37706885, 0.0456596 , ..., 0.01488209, 0.03939159,
        0.07429986],
       [0.13820373, 0.04148529, 0.06777497, ..., 0.01915928, 0.04815307,
        0.03420666],
       ...,
       [0.19643635, 0.03019237, 0.03837875, ..., 0.03660435, 0.21132459,
        0.0281072 ],
       [0.00322233, 0.02538073, 0.01163462, ..., 0.02521029, 0.20927343,
        0.00333474],
       [0.00514983, 0.01577472, 0.01578081, ..., 0.0040776 , 0.06088447,
        0.00855814]])

In [15]:
fit.classes_

array(['apnews.com', 'bloomberg.com', 'breitbart.com', 'buzzfeed.com',
       'cnn.com', 'dailycaller.com', 'dailykos.com', 'foxnews.com',
       'huffingtonpost.com', 'msnbc.com', 'npr.org', 'nytimes.com',
       'thehill.com', 'washingtonpost.com', 'wsj.com'], dtype=object)

In [18]:
# For every unordered pair of domains, correlate their predicted-probability
# columns across the test set and print the two names with the pearsonr result.
for (col_a, domain_a), (col_b, domain_b) in combinations(enumerate(fit.classes_), 2):
    corr = stats.pearsonr(preds[:, col_a], preds[:, col_b])
    print(domain_a, domain_b, corr)

apnews.com bloomberg.com (-0.046729391097314714, 4.0681606443376236e-15)
apnews.com breitbart.com (0.05278007334865178, 7.251969352167192e-19)
apnews.com buzzfeed.com (-0.23143861143720848, 0.0)
apnews.com cnn.com (0.1390076153604599, 1.0145494406834358e-121)
apnews.com dailycaller.com (-0.07700190781812016, 2.2822701644480736e-38)
apnews.com dailykos.com (-0.23301377475770513, 0.0)
apnews.com foxnews.com (0.3532960466991273, 0.0)
apnews.com huffingtonpost.com (-0.16959673446117504, 4.6631593728870214e-181)
apnews.com msnbc.com (-0.1449978612688328, 2.248940404931962e-132)
apnews.com npr.org (0.06521520104300214, 5.629295727724336e-28)
apnews.com nytimes.com (-0.025393206383857392, 1.993206289257466e-05)
apnews.com thehill.com (-0.018269963590636824, 0.0021490498388104834)
apnews.com washingtonpost.com (-0.17180172207918687, 8.191452089396561e-186)
apnews.com wsj.com (0.014841160428090226, 0.012673604481253221)
bloomberg.com breitbart.com (-0.12105169102471722, 1.4628492587519549e-92)
