In [31]:
import numpy as np

from itertools import islice
from tqdm import tqdm
from cached_property import cached_property
from collections import defaultdict

from news_vec.utils import read_json_gz_lines
from news_vec import logger

In [27]:
class Actors:
    
    def __init__(self, root, skim=None):
        self.root = root
        self.skim = skim
        
    def rows_iter(self):
        reader = islice(read_json_gz_lines(self.root), self.skim)
        yield from tqdm(reader)
        
    @cached_property
    def domains(self):
        """List of unique domains.
        """
        logger.info('Gathering domains.')

        ds = set()
        for row in self.rows_iter():
            ds.add(row['domain'])
        return list(ds)
    
    @cached_property
    def dtoi(self):
        return {d: i for i, d in enumerate(self.domains)}
    
    @cached_property
    def domain_counts(self):
        """Raw domain counts.
        """
        logger.info('Gathering domain counts.')
        
        counts = np.zeros(len(self.dtoi))
        for row in self.rows_iter():
            counts[self.dtoi[row['domain']]] += 1
            
        return counts
    
    @cached_property
    def domain_count_prior(self):
        return self.domain_counts / self.domain_counts.sum()

In [28]:
ac = Actors('../data/clf-actors.json', 1000000)

In [29]:
ac.domains

2018-12-27 15:17:13,263 | INFO : Gathering domains.
1000000it [00:03, 269301.65it/s]


['huffingtonpost.com',
 'rt.com',
 'dailykos.com',
 'npr.org',
 'apnews.com',
 'nytimes.com',
 'buzzfeed.com',
 'bloomberg.com',
 'cnn.com',
 'thehill.com',
 'sputniknews.com',
 'dailycaller.com',
 'washingtonpost.com',
 'breitbart.com',
 'wsj.com',
 'foxnews.com']

In [30]:
ac.domain_count_prior

2018-12-27 15:17:21,521 | INFO : Gathering domain counts.
1000000it [00:03, 277115.79it/s]


array([0.066948, 0.00608 , 0.019093, 0.041495, 0.054748, 0.092408,
       0.054731, 0.073328, 0.208534, 0.042455, 0.001863, 0.049823,
       0.046672, 0.107206, 0.013431, 0.121185])

In [32]:
actor_domain = defaultdict(set)
for row in ac.rows_iter():
    actor_domain[row['actor_id']].add(row['domain'])

1000000it [00:04, 243078.92it/s]


In [33]:
actor_domain

defaultdict(set,
            {'id:twitter.com:2793391472': {'apnews.com',
              'cnn.com',
              'dailykos.com',
              'huffingtonpost.com',
              'npr.org',
              'nytimes.com'},
             'id:twitter.com:1957266158': {'apnews.com',
              'cnn.com',
              'nytimes.com',
              'washingtonpost.com'},
             'id:twitter.com:1617146054': {'apnews.com', 'nytimes.com'},
             'id:twitter.com:3170408384': {'apnews.com'},
             'id:twitter.com:48200575': {'apnews.com',
              'cnn.com',
              'huffingtonpost.com',
              'nytimes.com',
              'thehill.com',
              'washingtonpost.com',
              'wsj.com'},
             'id:twitter.com:155640818': {'apnews.com',
              'cnn.com',
              'dailykos.com',
              'foxnews.com',
              'nytimes.com'},
             'id:twitter.com:1928290020': {'apnews.com',
              'bloomberg.com',
       