In [1]:
import pandas as pd
import numpy as np

from scipy import stats
from itertools import combinations

from news_vec.encoder import read_preds

In [2]:
# Read the data under ../../data/ava-cbow/ via the project helper (presumably
# model predictions, going by the helper's name). The call returns a two-item
# result; only the first item is kept, as `df`, and the second is discarded.
# Later cells read the `domain`, `split` and per-domain `p_<domain>` columns.
df, _ = read_preds('../../data/ava-cbow/')

100%|██████████| 283/283 [00:13<00:00, 20.32it/s]


In [3]:
# Distinct domain labels found in the frame, in ascending order.
DOMAINS = df.domain.unique().tolist()
DOMAINS.sort()

# Every unordered pair of domains, each stored as a sorted two-item list.
PAIRS = list(map(sorted, combinations(DOMAINS, 2)))

In [4]:
def _own_domain_prob(row):
    """Return the value stored in the row's own `p_<domain>` column."""
    return row[f'p_{row.domain}']


def _argmax_domain(row):
    """Return the domain whose `p_<domain>` value is largest in this row.

    Ties resolve to the first domain in DOMAINS order, as np.argmax does.
    """
    scores = [row[f'p_{name}'] for name in DOMAINS]
    return DOMAINS[np.argmax(scores)]


# Both columns are computed row by row (axis=1).
df['p_true'] = df.apply(_own_domain_prob, axis=1)
df['domain_pred'] = df.apply(_argmax_domain, axis=1)

In [5]:
# Keep only the rows whose `split` column equals 'test'.
is_test_row = df['split'] == 'test'
df_test = df[is_test_row]

In [6]:
# Accumulator for one tuple of correlation results per domain pair.
rows = list()

In [7]:
# Start from an empty list every time this cell runs. The list used to be
# created only in a separate cell, so re-running this loop on a live kernel
# appended a second copy of every pair and silently duplicated rows in the
# frame built from `rows` afterwards.
rows = []

for d1, d2 in PAIRS:

    # `p_<domain>` column values on the test split for each domain of the pair.
    p1 = df_test[f'p_{d1}']
    p2 = df_test[f'p_{d2}']

    # Each scipy call yields a (correlation statistic, p-value) pair.
    sp_corr, sp_p = stats.spearmanr(p1, p2)
    kt_corr, kt_p = stats.kendalltau(p1, p2)
    pr_corr, pr_p = stats.pearsonr(p1, p2)

    # One record per pair: both domain names, then Spearman, Kendall and
    # Pearson results in that order.
    rows.append((d1, d2, sp_corr, sp_p, kt_corr, kt_p, pr_corr, pr_p))

In [8]:
# Column labels follow the field order of the tuples collected in `rows`:
# the two domains, then (statistic, p-value) for Spearman, Kendall and Pearson.
corr_columns = ('d1', 'd2', 'sp', 'sp_p', 'kt', 'kt_p', 'pr', 'pr_p')
corr_df = pd.DataFrame(data=rows, columns=corr_columns)

In [9]:
# Show the ten pairs with the highest Kendall tau.
(
    corr_df
    .sort_values(by='kt', ascending=False)
    .head(n=10)
)

Unnamed: 0,d1,d2,sp,sp_p,kt,kt_p,pr,pr_p
26,bloomberg.com,wsj.com,0.838633,0.0,0.649152,0.0,0.581247,0.0
95,npr.org,nytimes.com,0.710882,0.0,0.524353,0.0,0.423397,0.0
43,buzzfeed.com,huffingtonpost.com,0.709984,0.0,0.524248,0.0,0.232072,0.0
29,breitbart.com,dailycaller.com,0.696504,0.0,0.510685,0.0,0.392066,0.0
92,msnbc.com,thehill.com,0.670869,0.0,0.483489,0.0,0.303908,0.0
71,dailykos.com,msnbc.com,0.652106,0.0,0.470737,0.0,0.211059,1.626165e-281
6,apnews.com,foxnews.com,0.608815,0.0,0.434015,0.0,0.386803,0.0
101,nytimes.com,wsj.com,0.575702,0.0,0.415452,0.0,0.197164,3.1698919999999997e-245
66,dailycaller.com,thehill.com,0.583952,0.0,0.414587,0.0,0.258344,0.0
97,npr.org,washingtonpost.com,0.50631,0.0,0.354214,0.0,0.216034,3.6538149999999997e-295


In [10]:
# Write the pairwise correlations to disk, one JSON record per line.
corr_df.to_json(
    path_or_buf='data/hl-graph-cbow-pcorr.json',
    orient='records',
    lines=True,
)