In [2]:
import pandas as pd
import numpy as np

from itertools import combinations
from scipy import stats
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from news_vec.corpus import HeadlineDataset
from news_vec.encoder import read_preds

In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer accept the old names, so pick whichever
# spelling this installation actually ships instead of failing on upgrade.
_muted_style = next(
    (name for name in ('seaborn-muted', 'seaborn-v0_8-muted')
     if name in mpl.style.available),
    None,
)
if _muted_style is not None:
    mpl.style.use(_muted_style)

# The seaborn theme is applied last, same order as before.
sns.set(style="whitegrid")

In [3]:
# Load the saved headline dataset from disk.
# NOTE(review): the '.p' suffix suggests a pickle file -- only load trusted data.
dataset_path = '../data/ava.p'
ds = HeadlineDataset.load(dataset_path)

In [4]:
# Display the dataset repr; it prints three slash-separated counts
# (presumably the train/val/test split sizes -- confirm in HeadlineDataset).
ds

HeadlineDataset<225696/28212/28212>

In [5]:
def _split_to_frame(split):
    """Build a DataFrame from the first element of every pair in `split`.

    Each item of the split is unpacked as a 2-tuple; the second element is
    discarded.
    """
    return pd.DataFrame([row for row, _ in split])


train_df = _split_to_frame(ds.train)
test_df = _split_to_frame(ds.test)

In [6]:
# Training inputs are the token column; training labels are the domain column.
X_train = train_df['clf_tokens']
y_train = train_df['domain']

In [7]:
# Test inputs are the token column; test labels are the domain column.
X_test = test_df['clf_tokens']
y_test = test_df['domain']

In [8]:
def _identity(value):
    """Return the argument untouched."""
    return value


# The tokenizer and preprocessor are both pass-throughs, so the vectorizer
# consumes each document exactly as it is stored (presumably a pre-tokenized
# list -- confirm against how clf_tokens is built). token_pattern=None
# because a custom tokenizer is supplied and the pattern would go unused.
tfidf_options = dict(
    analyzer='word',
    token_pattern=None,
    preprocessor=_identity,
    tokenizer=_identity,
    ngram_range=(1, 4),  # unigrams through 4-grams
)
tv = TfidfVectorizer(**tfidf_options)

In [9]:
# Vectorize straight from the source frames rather than from X_train / X_test.
# This cell rebinds those two names to the TF-IDF matrices, so using them as
# input would make a second run of the cell operate on its own output.
X_train = tv.fit_transform(train_df.clf_tokens)  # learn vocabulary + IDF on train only
X_test = tv.transform(test_df.clf_tokens)        # reuse the fitted vocabulary for test

# Linear SVC

In [10]:
# Linear SVM baseline on the TF-IDF features. random_state is pinned so the
# solver's internal data shuffling is repeatable from one run to the next;
# the value itself is arbitrary.
clf = LinearSVC(random_state=0)
# fit() hands back the estimator itself; later cells call fit.predict(...).
fit = clf.fit(X_train, y_train)

In [11]:
# Predict domains for the test matrix, then score against the true labels.
y_test_pred = fit.predict(X_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [12]:
# Show the test accuracy of the LinearSVC model computed in the previous cell.
acc

0.36566000283567274

# Logistic regression

In [30]:
# Multinomial logistic regression on the TF-IDF features. The 'sag' solver
# is stochastic, so random_state is pinned to make the fitted model (and the
# accuracy reported below) repeatable; the value itself is arbitrary.
# NOTE(review): newer scikit-learn releases deprecate `multi_class` --
# confirm against the installed version before upgrading.
clf = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    verbose=True,
    n_jobs=-1,
    random_state=0,
)
# fit() hands back the estimator itself; later cells call fit.predict(...).
fit = clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 21 epochs took 28 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   29.0s finished


In [31]:
# Predict domains for the test matrix, then score against the true labels.
y_test_pred = fit.predict(X_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [32]:
# Show the test accuracy of the logistic-regression model computed in the previous cell.
acc

0.35463632496809866

# CBOW

In [106]:
# Read the CBOW run's output directory: a prediction frame plus the object
# bound to `embeds`.
preds_dir = '../data/ava-cbow/'
df, embeds = read_preds(preds_dir)

100%|██████████| 283/283 [00:22<00:00, 12.48it/s]


In [107]:
# Derive the label list from this frame so the cell does not depend on a
# DOMAINS value left over from another section; on a top-to-bottom run no
# earlier cell defines it and the lookup below would raise NameError.
DOMAINS = list(df.domain.unique())

# Probability stored under each row's own p_<domain> column.
df['p_true'] = df.apply(lambda r: r[f'p_{r.domain}'], axis=1)
# Predicted domain: the label whose p_<domain> value is largest in the row.
df['pred'] = df.apply(lambda r: DOMAINS[np.argmax([r[f'p_{d}'] for d in DOMAINS])], axis=1)

In [108]:
# Keep only the rows whose split column is 'test'.
test_mask = df['split'] == 'test'
df_test = df.loc[test_mask]

In [109]:
# Share of test rows whose predicted domain matches the true domain.
metrics.accuracy_score(y_true=df_test['domain'], y_pred=df_test['pred'])

0.3156103785623139

# ATTN

In [14]:
# Read the attention run's output directory: a prediction frame plus the
# object bound to `embeds`.
preds_dir = '../data/ava-attn/'
df, embeds = read_preds(preds_dir)

100%|██████████| 283/283 [00:15<00:00, 18.07it/s]


In [15]:
# Derive the label list from this frame so the cell does not depend on a
# DOMAINS value left over from another section; on a top-to-bottom run no
# earlier cell defines it and the lookup below would raise NameError.
DOMAINS = list(df.domain.unique())

# Probability stored under each row's own p_<domain> column.
df['p_true'] = df.apply(lambda r: r[f'p_{r.domain}'], axis=1)
# Predicted domain: the label whose p_<domain> value is largest in the row.
df['pred'] = df.apply(lambda r: DOMAINS[np.argmax([r[f'p_{d}'] for d in DOMAINS])], axis=1)

In [16]:
# Keep only the rows whose split column is 'test'.
test_mask = df['split'] == 'test'
df_test = df.loc[test_mask]

In [17]:
# Share of test rows whose predicted domain matches the true domain.
metrics.accuracy_score(y_true=df_test['domain'], y_pred=df_test['pred'])

0.3144761094569687

# CNN

In [18]:
# Read the CNN run's output directory: a prediction frame plus the object
# bound to `embeds`.
preds_dir = '../data/ava-cnn-big/'
df, embeds = read_preds(preds_dir)

100%|██████████| 283/283 [00:16<00:00, 16.71it/s]


In [21]:
# Derive the label list from this frame so the cell does not depend on a
# DOMAINS value left over from another section; on a top-to-bottom run no
# earlier cell defines it and the lookup below would raise NameError.
DOMAINS = list(df.domain.unique())

# Probability stored under each row's own p_<domain> column.
df['p_true'] = df.apply(lambda r: r[f'p_{r.domain}'], axis=1)
# Predicted domain: the label whose p_<domain> value is largest in the row.
df['pred'] = df.apply(lambda r: DOMAINS[np.argmax([r[f'p_{d}'] for d in DOMAINS])], axis=1)

In [22]:
# Keep only the rows whose split column is 'test'.
test_mask = df['split'] == 'test'
df_test = df.loc[test_mask]

In [23]:
# Share of test rows whose predicted domain matches the true domain.
metrics.accuracy_score(y_true=df_test['domain'], y_pred=df_test['pred'])

0.3509499503757266

# LSTM

In [9]:
# Read the LSTM run's output directory: a prediction frame plus the object
# bound to `embeds`.
preds_dir = '../data/ava-lstm/'
df, embeds = read_preds(preds_dir)

100%|██████████| 283/283 [00:14<00:00, 19.15it/s]


In [19]:
# Distinct domain labels, in order of first appearance in the frame.
unique_domains = df['domain'].unique()
DOMAINS = list(unique_domains)

In [None]:
# Probability stored under each row's own p_<domain> column.
df['p_true'] = df.apply(
    lambda row: row[f'p_{row.domain}'],
    axis=1,
)
# Predicted domain: the DOMAINS entry whose p_<domain> value is largest.
df['pred'] = df.apply(
    lambda row: DOMAINS[np.argmax([row[f'p_{label}'] for label in DOMAINS])],
    axis=1,
)

In [None]:
# Keep only the rows whose split column is 'test'.
test_mask = df['split'] == 'test'
df_test = df.loc[test_mask]

In [13]:
# Share of test rows whose predicted domain matches the true domain.
metrics.accuracy_score(y_true=df_test['domain'], y_pred=df_test['pred'])

0.38554515808875656

# LSTM + attn

In [84]:
# Read the LSTM + attention run's output directory: a prediction frame plus
# the object bound to `embeds`.
preds_dir = '../data/ava-lstm-attn/'
df, embeds = read_preds(preds_dir)

100%|██████████| 283/283 [00:19<00:00, 14.62it/s]


In [85]:
# Distinct domain labels, in order of first appearance in the frame.
unique_domains = df['domain'].unique()
DOMAINS = list(unique_domains)

In [98]:
# Probability stored under each row's own p_<domain> column.
df['p_true'] = df.apply(
    lambda row: row[f'p_{row.domain}'],
    axis=1,
)
# Predicted domain: the DOMAINS entry whose p_<domain> value is largest.
df['pred'] = df.apply(
    lambda row: DOMAINS[np.argmax([row[f'p_{label}'] for label in DOMAINS])],
    axis=1,
)

In [100]:
# Keep only the rows whose split column is 'test'.
test_mask = df['split'] == 'test'
df_test = df.loc[test_mask]

In [101]:
# Share of test rows whose predicted domain matches the true domain.
metrics.accuracy_score(y_true=df_test['domain'], y_pred=df_test['pred'])

0.38338295760669217

# LSTM + CNN

In [4]:
# Read the LSTM + CNN run's output directory: a prediction frame plus the
# object bound to `embeds`.
preds_dir = '../data/ava-lstm-cnn/'
df, embeds = read_preds(preds_dir)

100%|██████████| 283/283 [00:14<00:00, 19.30it/s]


In [5]:
# Distinct domain labels, in order of first appearance in the frame.
unique_domains = df['domain'].unique()
DOMAINS = list(unique_domains)

In [6]:
# Probability stored under each row's own p_<domain> column.
df['p_true'] = df.apply(
    lambda row: row[f'p_{row.domain}'],
    axis=1,
)
# Predicted domain: the DOMAINS entry whose p_<domain> value is largest.
df['pred'] = df.apply(
    lambda row: DOMAINS[np.argmax([row[f'p_{label}'] for label in DOMAINS])],
    axis=1,
)

In [7]:
# Keep only the rows whose split column is 'test'.
test_mask = df['split'] == 'test'
df_test = df.loc[test_mask]

In [8]:
# Share of test rows whose predicted domain matches the true domain.
metrics.accuracy_score(y_true=df_test['domain'], y_pred=df_test['pred'])

0.3831348362398979