In [60]:
import pandas as pd
import numpy as np

from scipy import stats
from itertools import combinations

from news_vec.encoder import read_preds

In [61]:
# Load the predictions stored under the ava-lstm-attn data directory.
# `read_preds` is unpacked into two values; only the first (`df`) is used by
# the cells below, the second is discarded.
# NOTE(review): `df` is treated as a pandas DataFrame further down
# (`df.apply`, `df.domain.unique()`); the exact column schema comes from
# `news_vec.encoder.read_preds` — confirm there.
df, _ = read_preds('../../data/ava-lstm-attn/')

100%|██████████| 283/283 [00:13<00:00, 21.15it/s]


In [62]:
# Distinct values of the `domain` column, in ascending order.
DOMAINS = sorted(df['domain'].unique().tolist())

# Every unordered pair of distinct domains, each kept as a sorted
# two-element list.
PAIRS = list(map(sorted, combinations(DOMAINS, 2)))

In [63]:
# Derive two per-row columns from the per-domain probability columns
# (`p_<domain>`):
#   * p_true      - the probability found in the column of the row's own domain
#   * domain_pred - the domain whose probability column is largest in the row
#
# Both are computed with vectorized NumPy operations rather than a Python-level
# row-wise `df.apply(..., axis=1)`, which builds a Series per row and is very
# slow on large frames. Ties in the argmax resolve to the first domain in
# DOMAINS order, exactly as `np.argmax` over a per-row list did.

# Probability columns in DOMAINS order, extracted once as a 2-D array
# (one row per prediction, one column per domain).
_prob_cols = [f'p_{d}' for d in DOMAINS]
_probs = df[_prob_cols].to_numpy()

# Position of each row's own domain within DOMAINS.
_domain_pos = {d: i for i, d in enumerate(DOMAINS)}
_true_idx = df['domain'].map(_domain_pos).to_numpy(dtype=int)

# Pick, for every row, the probability in the column of its own domain.
df['p_true'] = _probs[np.arange(len(df)), _true_idx]

# Map the per-row argmax column index back to the domain name.
df['domain_pred'] = np.asarray(DOMAINS, dtype=object)[_probs.argmax(axis=1)]

In [64]:
# Keep only the rows whose `split` column equals 'test'.
is_test_row = df['split'] == 'test'
df_test = df.loc[is_test_row]

In [65]:
# Accumulator for the per-pair correlation records built by the loop below.
rows = list()

In [66]:
# For every pair of domains, measure how strongly their predicted-probability
# columns agree across the test rows, using three correlation measures.
#
# `rows` is (re)initialised here so that this cell is idempotent: previously
# the list was created in a separate cell, and re-running the loop on its own
# appended a second copy of every pair to the results.
rows = []

for d1, d2 in PAIRS:

    # Probability columns of the two domains over the test split.
    p1 = df_test[f'p_{d1}']
    p2 = df_test[f'p_{d2}']

    # Each scipy call is unpacked as (correlation statistic, p-value).
    sp_corr, sp_p = stats.spearmanr(p1, p2)
    kt_corr, kt_p = stats.kendalltau(p1, p2)
    pr_corr, pr_p = stats.pearsonr(p1, p2)

    # Field order must match the column names used to build `corr_df`.
    rows.append((d1, d2, sp_corr, sp_p, kt_corr, kt_p, pr_corr, pr_p))

In [67]:
# Tabulate the per-pair records: the two domain names followed by the
# (statistic, p-value) pairs for Spearman, Kendall tau and Pearson.
corr_df = pd.DataFrame(
    data=rows,
    columns=(
        'd1', 'd2',
        'sp', 'sp_p',
        'kt', 'kt_p',
        'pr', 'pr_p',
    ),
)

In [70]:
# Show the ten domain pairs with the highest Kendall tau.
corr_df.sort_values(by='kt', ascending=False).iloc[:10]

Unnamed: 0,d1,d2,sp,sp_p,kt,kt_p,pr,pr_p
26,bloomberg.com,wsj.com,0.730285,0.0,0.544591,0.0,0.2449,0.0
29,breitbart.com,dailycaller.com,0.688357,0.0,0.504639,0.0,0.235545,0.0
101,nytimes.com,wsj.com,0.672381,0.0,0.489553,0.0,0.158381,7.152194e-158
92,msnbc.com,thehill.com,0.660237,0.0,0.478135,0.0,0.121547,2.61182e-93
95,npr.org,nytimes.com,0.645672,0.0,0.468058,0.0,0.189297,7.303897e-226
43,buzzfeed.com,huffingtonpost.com,0.593501,0.0,0.440796,0.0,0.055717,7.584669e-21
66,dailycaller.com,thehill.com,0.597465,0.0,0.426649,0.0,0.138389,1.2025410000000001e-120
54,cnn.com,msnbc.com,0.573168,0.0,0.409234,0.0,0.152077,1.437426e-145
31,breitbart.com,foxnews.com,0.564974,0.0,0.404034,0.0,0.090747,1.1541570000000001e-52
98,npr.org,wsj.com,0.51779,0.0,0.371312,0.0,0.040728,7.736989e-12


In [71]:
# Persist the correlation table as JSON Lines (one record object per line).
corr_df.to_json(
    path_or_buf='data/hl-graph-lstm-pcorr.json',
    orient='records',
    lines=True,
)