In [38]:
import numpy as np
import torch

from itertools import islice, combinations, chain
from tqdm import tqdm
from collections import defaultdict, Counter
from cached_property import cached_property
from scipy import random

from news_vec.utils import read_json_gz_lines
from news_vec import logger

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# Global plotting look-and-feel: first select matplotlib's 'seaborn-muted'
# style sheet, then let seaborn apply its "whitegrid" theme on top of it.
# NOTE(review): newer matplotlib releases appear to have renamed the bundled
# seaborn style sheets (e.g. 'seaborn-v0_8-muted') -- confirm against the
# installed matplotlib version before re-running on a fresh environment.
mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [21]:
class Actors:
    """Re-iterable, progress-reporting stream over a gzipped JSON-lines source.

    Parameters
    ----------
    root
        Location handed unchanged to ``read_json_gz_lines``.
    skim
        Upper bound on the number of items yielded per pass; ``None``
        (the default) means no limit.
    """

    def __init__(self, root, skim=None):
        self.root, self.skim = root, skim

    def __iter__(self):
        """Yield up to ``skim`` items from the reader, wrapped in a tqdm bar."""
        # A fresh reader is built on every pass, so the object can be
        # iterated more than once.
        limited = islice(read_json_gz_lines(self.root), self.skim)
        for item in tqdm(limited):
            yield item

In [22]:
# Actor stream capped at the first 10,000,000 items yielded by the reader.
ac = Actors(root='../data/clf-actors.json/', skim=10000000)

In [23]:
# actor_id -> set of every distinct domain seen for that actor.
ACTOR_DOMAINS = defaultdict(set)

for record in ac:
    seen_domains = ACTOR_DOMAINS[record['actor_id']]
    seen_domains.add(record['domain'])

10000000it [00:35, 277958.37it/s]


In [24]:
# Spot-check: the set of domains recorded for a single actor id.
ACTOR_DOMAINS['id:twitter.com:2793391472']

{'apnews.com',
 'buzzfeed.com',
 'cnn.com',
 'dailykos.com',
 'foxnews.com',
 'huffingtonpost.com',
 'npr.org',
 'nytimes.com',
 'thehill.com',
 'washingtonpost.com',
 'wsj.com'}

In [25]:
# One alphabetically sorted (domain_a, domain_b) tuple per actor that was
# seen with exactly two distinct domains; all other actors are skipped.
DOMAIN_PAIRS = [
    tuple(sorted(domain_set))
    for domain_set in tqdm(ACTOR_DOMAINS.values())
    if len(domain_set) == 2
]

100%|██████████| 2662161/2662161 [00:01<00:00, 1746306.90it/s]


In [27]:
# Number of actors with exactly two distinct domains (one pair per actor).
len(DOMAIN_PAIRS)

377959

In [28]:
# Observed tally: how many actors produced each sorted two-domain pair.
DOMAIN_PAIR_COUNTS = Counter()
DOMAIN_PAIR_COUNTS.update(DOMAIN_PAIRS)

In [29]:
# Count how often each domain occurs across all pairs, then split the
# (domain, count) rows -- most frequent first -- into two aligned tuples.
domain_frequency = Counter(chain.from_iterable(DOMAIN_PAIRS))
DOMAINS, DOMAIN_COUNTS = zip(*domain_frequency.most_common())

In [30]:
# Domains ordered by how often they occur in DOMAIN_PAIRS, most frequent first.
DOMAINS

('cnn.com',
 'nytimes.com',
 'huffingtonpost.com',
 'foxnews.com',
 'buzzfeed.com',
 'bloomberg.com',
 'washingtonpost.com',
 'apnews.com',
 'breitbart.com',
 'thehill.com',
 'npr.org',
 'dailycaller.com',
 'rt.com',
 'wsj.com',
 'dailykos.com',
 'sputniknews.com')

In [73]:
# Per-domain occurrence counts, aligned index-for-index with DOMAINS.
DOMAIN_COUNTS

(186687,
 71015,
 69620,
 69077,
 68779,
 52480,
 39316,
 38942,
 37928,
 37121,
 35194,
 21978,
 9961,
 9447,
 6797,
 1576)

In [91]:
# Empirical domain prior: each domain's share of all pair memberships,
# stored as a single-row (1 x number-of-domains) torch tensor.
domain_totals = np.asarray(DOMAIN_COUNTS)
domain_shares = domain_totals / domain_totals.sum()
DOMAIN_PRIOR = torch.from_numpy(domain_shares).unsqueeze(0)

In [92]:
# Inspect the normalized prior (one row, one probability per entry of DOMAINS).
DOMAIN_PRIOR

tensor([[0.2470, 0.0939, 0.0921, 0.0914, 0.0910, 0.0694, 0.0520, 0.0515, 0.0502,
         0.0491, 0.0466, 0.0291, 0.0132, 0.0125, 0.0090, 0.0021]],
       dtype=torch.float64)

In [103]:
# Null model for domain co-occurrence.
#
# In every round we draw, for each observed actor pair, two *distinct* domains
# from DOMAIN_PRIOR (torch.multinomial samples without replacement by default)
# and tally how often each unordered pair comes up. The per-round tallies are
# collected in SAMPLED_DOMAIN_PAIR_COUNTS: pair -> list with one count per round.
N_SAMPLING_ROUNDS = 100

# Dedicated generator so the simulation is reproducible across kernel restarts
# without touching torch's global RNG state.
SAMPLING_SEED = 0
sampling_rng = torch.Generator().manual_seed(SAMPLING_SEED)

SAMPLED_DOMAIN_PAIR_COUNTS = defaultdict(list)

# Every possible unordered pair, keyed exactly like DOMAIN_PAIRS
# (alphabetically sorted tuple of two domain names).
ALL_DOMAIN_PAIRS = [tuple(sorted(pair)) for pair in combinations(DOMAINS, 2)]

# Loop-invariant: one copy of the prior per observed pair. Built once instead
# of on every round.
prior_per_actor = DOMAIN_PRIOR.repeat(len(DOMAIN_PAIRS), 1)

for _ in tqdm(range(N_SAMPLING_ROUNDS)):

    samples = torch.multinomial(prior_per_actor, 2, generator=sampling_rng)

    counts = Counter(
        tuple(sorted((DOMAINS[i1], DOMAINS[i2])))
        for i1, i2 in samples.tolist()
    )

    # Record a value for *every* pair, including 0 when a pair was not drawn
    # this round. Skipping the zeros would leave rare pairs with fewer than
    # N_SAMPLING_ROUNDS entries and bias their mean upward.
    for pair in ALL_DOMAIN_PAIRS:
        SAMPLED_DOMAIN_PAIR_COUNTS[pair].append(counts.get(pair, 0))

100%|██████████| 100/100 [02:18<00:00,  1.45s/it]


In [104]:
# Simulated (null-model) counts recorded for this pair across the sampling rounds.
SAMPLED_DOMAIN_PAIR_COUNTS['breitbart.com', 'foxnews.com']

[3727,
 3690,
 3669,
 3761,
 3769,
 3697,
 3743,
 3737,
 3841,
 3686,
 3852,
 3707,
 3704,
 3720,
 3757,
 3813,
 3700,
 3656,
 3773,
 3749,
 3633,
 3631,
 3780,
 3644,
 3832,
 3753,
 3858,
 3836,
 3742,
 3711,
 3655,
 3762,
 3693,
 3707,
 3786,
 3907,
 3781,
 3683,
 3640,
 3676,
 3740,
 3800,
 3689,
 3809,
 3804,
 3716,
 3633,
 3679,
 3766,
 3796,
 3682,
 3628,
 3729,
 3681,
 3856,
 3682,
 3701,
 3711,
 3712,
 3659,
 3686,
 3806,
 3635,
 3716,
 3811,
 3703,
 3705,
 3618,
 3756,
 3875,
 3725,
 3765,
 3685,
 3789,
 3786,
 3818,
 3816,
 3747,
 3745,
 3647,
 3727,
 3623,
 3808,
 3700,
 3767,
 3764,
 3640,
 3777,
 3773,
 3808,
 3648,
 3851,
 3806,
 3729,
 3676,
 3772,
 3729,
 3732,
 3702,
 3692]

In [105]:
# Observed number of actors whose two domains are exactly this pair.
DOMAIN_PAIR_COUNTS['breitbart.com', 'foxnews.com']

20215

In [109]:
# Z-score of each observed pair count against its simulated distribution:
# (observed - mean of sampled counts) / std of sampled counts.
for pair, count in DOMAIN_PAIR_COUNTS.items():
    # .get() rather than [...]: indexing the defaultdict would silently insert
    # an empty list for any pair that was never sampled.
    sampled_counts = SAMPLED_DOMAIN_PAIR_COUNTS.get(pair, [])

    # The score is undefined when there are no samples or they have no
    # spread; report NaN instead of dividing by zero / averaging nothing.
    spread = np.std(sampled_counts) if len(sampled_counts) else 0.0
    if spread > 0:
        zscore = (count - np.mean(sampled_counts)) / spread
    else:
        zscore = float('nan')

    print(pair, zscore)

('apnews.com', 'nytimes.com') 2.6618847081225394
('apnews.com', 'cnn.com') 37.308548315628755
('apnews.com', 'thehill.com') -6.344127336372311
('apnews.com', 'dailykos.com') -3.2761910719680576
('apnews.com', 'buzzfeed.com') -12.73124760672358
('apnews.com', 'foxnews.com') -16.632610722226154
('apnews.com', 'breitbart.com') -25.94957606268743
('apnews.com', 'huffingtonpost.com') -15.668123783459658
('apnews.com', 'washingtonpost.com') 0.6527807398223817
('apnews.com', 'bloomberg.com') -5.936291744666507
('apnews.com', 'npr.org') 5.434196810541422
('apnews.com', 'wsj.com') -3.0875708964750515
('apnews.com', 'dailycaller.com') -18.245038623009776
('apnews.com', 'rt.com') -5.9173764651981795
('apnews.com', 'sputniknews.com') -1.533514736012296
('bloomberg.com', 'huffingtonpost.com') -8.754877843643472
('bloomberg.com', 'npr.org') -7.767916968017553
('bloomberg.com', 'cnn.com') 20.506044478539373
('bloomberg.com', 'foxnews.com') -28.30670778629702
('bloomberg.com', 'buzzfeed.com') -6.43988