In [1]:
import pandas as pd
import numpy as np

from itertools import combinations
from scipy import stats
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from news_vec.corpus import HeadlineDataset
from news_vec.encoder import read_preds

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# Global plotting defaults: matplotlib's 'seaborn-muted' style sheet first,
# then seaborn's "whitegrid" theme on top of it.
# NOTE(review): newer matplotlib releases renamed this style sheet to
# 'seaborn-v0_8-muted' — confirm against the pinned matplotlib version.
mpl.style.use('seaborn-muted')
# NOTE(review): sns.set() also installs seaborn's own colour palette, so it
# presumably overrides the muted palette chosen above — confirm the intent.
sns.set(style="whitegrid")

In [3]:
# Location of the serialized headline dataset, relative to this notebook.
DATASET_PATH = '../../data/ava.p'

# Load the dataset object used by every cell below.
ds = HeadlineDataset.load(DATASET_PATH)

In [4]:
# Display the dataset's repr as a quick sanity check of what was loaded.
# NOTE(review): the three counts shown are presumably the split sizes — confirm.
ds

HeadlineDataset<226920/28365/28365>

In [5]:
def _records_frame(pairs):
    """Build a DataFrame from the first element of each 2-tuple in ``pairs``."""
    return pd.DataFrame([record for record, _ in pairs])


# One row per item of the train / test splits.
train_df = _records_frame(ds.train)
test_df = _records_frame(ds.test)

In [6]:
# Training inputs come from the `clf_tokens` column; labels from `domain`.
X_train = train_df['clf_tokens']
y_train = train_df['domain']

In [7]:
# Test inputs come from the `clf_tokens` column; labels from `domain`.
X_test = test_df['clf_tokens']
y_test = test_df['domain']

In [9]:
def _passthrough(doc):
    """Return ``doc`` unchanged; used as both preprocessor and tokenizer."""
    return doc


# TF-IDF over word n-grams. The pass-through preprocessor/tokenizer hands each
# input document to the n-gram step exactly as it is stored.
# NOTE(review): this assumes `clf_tokens` entries are already token sequences —
# confirm against HeadlineDataset.
tv = TfidfVectorizer(
    analyzer='word',
    preprocessor=_passthrough,
    tokenizer=_passthrough,
    token_pattern=None,  # the regex pattern is not used with a custom tokenizer
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
)

In [10]:
# Learn the TF-IDF vocabulary and weights from the training tokens only, then
# apply the same fitted transform to the test tokens.
#
# The inputs are read from train_df / test_df rather than from X_train / X_test
# themselves: this cell rebinds those names to the vectorized matrices, so
# feeding them back in would break the cell the second time it is executed.
X_train = tv.fit_transform(train_df.clf_tokens)
X_test = tv.transform(test_df.clf_tokens)

# Linear SVC

In [11]:
# LinearSVC shuffles the data internally while optimizing, so pin the seed to
# make the reported accuracy reproducible across kernel restarts.
clf = LinearSVC(random_state=0)

# fit() returns the estimator itself, so `fit` is the trained classifier.
fit = clf.fit(X_train, y_train)

In [12]:
# Predict a domain for every test row, then report the fraction predicted correctly.
y_test_pred = fit.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.3860391327340032

# Logistic regression

In [14]:
# Multinomial logistic regression trained with the stochastic 'sag' solver.
# The solver shuffles the data, so pin the seed to make the reported accuracy
# reproducible across kernel restarts.
clf = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    random_state=0,
    verbose=True,  # print solver progress / convergence information
    n_jobs=-1,
)

# fit() returns the estimator itself, so `fit` is the trained classifier.
fit = clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 19 epochs took 24 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   25.5s finished


In [15]:
# Predict a domain for every test row, then report the fraction predicted correctly.
y_test_pred = fit.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.36583818085668957

# CBOW

In [20]:
# Unique domain labels found in the dataset, in a stable (sorted) order.
# A plain list(set(...)) yields a different order on each kernel start because
# string hashing is randomized; sorting keeps index -> domain lookups (and
# argmax tie-breaking) reproducible between sessions.
DOMAINS = sorted({domain for _, domain in ds})

In [21]:
def read_test_df(pred_root):
    """Load predictions from ``pred_root`` and return the test rows with a ``pred`` column.

    Parameters
    ----------
    pred_root : str
        Directory handed to ``read_preds``.

    Returns
    -------
    pandas.DataFrame
        The rows of the loaded frame whose ``split`` equals ``'test'``, with
        an added ``pred`` column holding the entry of ``DOMAINS`` whose
        ``p_<domain>`` column is largest for that row (first entry wins ties).

    Notes
    -----
    The ``p_<domain>`` columns are presumably per-domain model scores or
    probabilities — confirm against ``read_preds``.
    """
    # read_preds returns a pair; only the first element (the frame) is used here.
    df, _ = read_preds(pred_root)

    # Only test rows are returned, so filter before computing predictions.
    # .copy() makes the result an independent frame, so adding a column is safe.
    test_df = df[df.split == 'test'].copy()

    # One score column per domain, in DOMAINS order, so that a column index
    # maps straight back to a domain label.
    scores = test_df[[f'p_{d}' for d in DOMAINS]].to_numpy()

    # Vectorized row-wise argmax instead of a Python-level apply over every row.
    test_df['pred'] = [DOMAINS[i] for i in scores.argmax(axis=1)]
    return test_df

In [22]:
# Test-split predictions written by the CBOW model run.
df = read_test_df(pred_root='../../data/ava-cbow/')

100%|██████████| 284/284 [00:14<00:00, 19.52it/s]


In [24]:
# Fraction of test rows whose predicted domain equals the true domain.
metrics.accuracy_score(y_true=df.domain, y_pred=df.pred)

0.33597743698219634

# CNN

In [27]:
# Test-split predictions written by the CNN model run.
df = read_test_df(pred_root='../../data/ava-cnn/')

100%|██████████| 284/284 [00:15<00:00, 18.78it/s]


In [28]:
# Fraction of test rows whose predicted domain equals the true domain.
metrics.accuracy_score(y_true=df.domain, y_pred=df.pred)

0.34856337035078444

# LSTM

In [29]:
# Test-split predictions written by the LSTM model run.
df = read_test_df(pred_root='../../data/ava-lstm/')

100%|██████████| 284/284 [00:13<00:00, 21.71it/s]


In [30]:
# Fraction of test rows whose predicted domain equals the true domain.
metrics.accuracy_score(y_true=df.domain, y_pred=df.pred)

0.40359598096245375

In [31]:
# Per-domain precision / recall / F1 for the predictions currently held in `df`.
# The report is a multi-line string, so print() is needed to render its layout.
report = metrics.classification_report(y_true=df.domain, y_pred=df.pred)
print(report)

                    precision    recall  f1-score   support

        apnews.com       0.44      0.51      0.47      1909
     bloomberg.com       0.53      0.56      0.54      1942
     breitbart.com       0.44      0.43      0.43      1910
      buzzfeed.com       0.73      0.66      0.69      1949
           cnn.com       0.28      0.20      0.23      1854
   dailycaller.com       0.29      0.38      0.33      1959
      dailykos.com       0.56      0.57      0.56      1990
       foxnews.com       0.32      0.24      0.28      1822
huffingtonpost.com       0.34      0.26      0.30      1802
         msnbc.com       0.38      0.56      0.45      1833
           npr.org       0.39      0.26      0.31      1874
       nytimes.com       0.34      0.33      0.33      1923
       thehill.com       0.32      0.42      0.36      1884
washingtonpost.com       0.30      0.30      0.30      1835
           wsj.com       0.38      0.36      0.37      1879

         micro avg       0.40      0.4

# LSTM + attention

In [32]:
# Test-split predictions written by the LSTM + attention model run.
df = read_test_df(pred_root='../../data/ava-lstm-attn/')

100%|██████████| 284/284 [00:13<00:00, 21.05it/s]


In [33]:
# Fraction of test rows whose predicted domain equals the true domain.
metrics.accuracy_score(y_true=df.domain, y_pred=df.pred)

0.41357306539749694

# LSTM + CNN

In [34]:
# Test-split predictions written by the LSTM + CNN model run.
df = read_test_df(pred_root='../../data/ava-lstm-cnn/')

100%|██████████| 284/284 [00:14<00:00, 25.65it/s]


In [35]:
# Fraction of test rows whose predicted domain equals the true domain.
metrics.accuracy_score(y_true=df.domain, y_pred=df.pred)

0.4095540278512251