In [1]:
import pandas as pd

from cached_property import cached_property
from collections import Counter, UserList, UserDict
from tqdm import tqdm

from news_vec.utils import read_json_gz_lines
from news_vec import logger

In [2]:
class Headline(UserDict):
    """Dict-style record holding the fields of a single headline row.
    """

    def __repr__(self):
        """Render as `ClassName<N tokens -> domain>`.

        Reads the `clf_tokens` and `domain` keys; a missing key raises
        KeyError.
        """
        fields = {
            'cls_name': type(self).__name__,
            'token_count': len(self['clf_tokens']),
            'domain': self['domain'],
        }

        return '{cls_name}<{token_count} tokens -> {domain}>'.format(**fields)

In [3]:
class HeadlineDataset(UserList):
    """List of (headline, label) pairs with simple counting helpers.
    """

    def token_counts(self):
        """Tally how often each token occurs across every headline.

        Returns:
            Counter mapping token -> number of occurrences.
        """
        logger.info('Gathering token counts.')

        # NOTE(review): this reads the 'tokens' key, while Headline.__repr__
        # reads 'clf_tokens' -- confirm which key the rows actually carry.
        tally = Counter()
        for headline, _label in tqdm(self):
            tally.update(headline['tokens'])

        return tally

    def label_counts(self):
        """Tally how many pairs carry each label.

        Returns:
            Counter mapping label -> number of pairs.
        """
        logger.info('Gathering label counts.')

        return Counter(label for _headline, label in tqdm(self))

    def labels(self):
        """List the distinct labels, most frequent first.
        """
        ranked = self.label_counts().most_common()
        return [name for name, _count in ranked]

In [18]:
class Corpus:
    """Links table plus an article_id -> Headline index, with balanced
    per-domain sampling helpers.
    """

    def __init__(self, links_root, headlines_root):
        """Read links df, article index.

        Args:
            links_root: Location passed to `read_json_gz_lines` for link rows.
            headlines_root: Location passed to `read_json_gz_lines` for
                headline rows; every row must carry an `article_id` key.
        """
        logger.info('Reading links.')

        rows = list(tqdm(read_json_gz_lines(links_root)))
        self.links = pd.DataFrame(rows)

        logger.info('Reading headlines.')

        self.headlines = {
            row['article_id']: Headline(row)
            for row in tqdm(read_json_gz_lines(headlines_root))
        }

    def make_dataset(self, df):
        """Index out a list of (Headline, domain) pairs.

        Args:
            df: DataFrame with `article_id` and `domain` columns.

        Returns:
            HeadlineDataset of (Headline, domain) tuples, in row order.

        Raises:
            KeyError: If a column is missing, or an `article_id` has no
                entry in `self.headlines`.
        """
        pairs = df[['article_id', 'domain']].values.tolist()

        return HeadlineDataset([
            (self.headlines[aid], domain)
            for aid, domain in pairs
        ])

    @cached_property
    def unique_articles(self):
        """Distinct (domain, article_id) rows of the links table.
        """
        return self.links[['domain', 'article_id']].drop_duplicates()

    @cached_property
    def min_domain_count(self):
        """Smallest number of unique articles per domain.
        """
        return self.unique_articles.groupby('domain').size().min()

    def _sample_per_domain(self, articles, random_state=None):
        """Draw `min_domain_count` rows from each domain in `articles`.

        Returns a frame with the same columns as `articles`, domains in
        sorted order; empty if `articles` has no rows.
        """
        per_domain = self.min_domain_count

        # Iterate the groups instead of `groupby(...).apply(...)`: iterated
        # group frames always keep the `domain` column that make_dataset
        # needs, whereas newer pandas deprecates `apply` operating on the
        # grouping column.
        samples = [
            group.sample(per_domain, random_state=random_state)
            for _, group in articles.groupby('domain')
        ]

        # pd.concat rejects an empty list; hand back an empty frame that
        # still has the expected columns.
        if not samples:
            return articles.iloc[:0]

        return pd.concat(samples)

    def sample_all_vs_all(self, random_state=None):
        """Sample evenly from all domains.

        Args:
            random_state: Optional seed (or numpy RandomState) forwarded to
                each per-domain `DataFrame.sample` call. An int seeds every
                domain's draw identically; None keeps the draw unseeded.

        Returns:
            HeadlineDataset with `min_domain_count` pairs per domain.
        """
        rows = self._sample_per_domain(self.unique_articles, random_state)

        return self.make_dataset(rows)

    def sample_a_vs_b(self, a, b, random_state=None):
        """Sample evenly from two domains.

        Args:
            a: First domain.
            b: Second domain.
            random_state: Optional seed (or numpy RandomState) forwarded to
                each per-domain `DataFrame.sample` call.

        Returns:
            HeadlineDataset with `min_domain_count` pairs for each of the
            two domains that is present in the corpus. The count is the
            corpus-wide minimum, not the minimum of just `a` and `b`.
        """
        articles = self.unique_articles
        subset = articles[articles.domain.isin([a, b])]

        rows = self._sample_per_domain(subset, random_state)

        return self.make_dataset(rows)

In [19]:
# Load the links table and build the article_id -> Headline index.
corpus = Corpus('../data/clf-links.json/', '../data/clf-headlines.json/')

2018-12-23 12:53:28,994 | INFO : Reading links.
1225511it [00:03, 373703.63it/s]
2018-12-23 12:53:34,370 | INFO : Reading headlines.
1127502it [00:24, 45425.64it/s]


In [20]:
# Unique-article count of the smallest domain; the samplers draw this many per domain.
corpus.min_domain_count

29185

In [21]:
# Balanced sample: min_domain_count articles from every domain.
ava = corpus.sample_all_vs_all()

In [22]:
# Total number of (headline, domain) pairs in the all-domain sample.
len(ava)

466960

In [23]:
# Check that every domain contributes the same number of examples.
ava.label_counts()

2018-12-23 12:54:08,716 | INFO : Gathering label counts.
100%|██████████| 466960/466960 [00:00<00:00, 876466.11it/s]


Counter({'apnews.com': 29185,
         'bloomberg.com': 29185,
         'breitbart.com': 29185,
         'buzzfeed.com': 29185,
         'cnn.com': 29185,
         'dailycaller.com': 29185,
         'dailykos.com': 29185,
         'foxnews.com': 29185,
         'huffingtonpost.com': 29185,
         'npr.org': 29185,
         'nytimes.com': 29185,
         'rt.com': 29185,
         'sputniknews.com': 29185,
         'thehill.com': 29185,
         'washingtonpost.com': 29185,
         'wsj.com': 29185})

In [24]:
# Balanced two-class sample restricted to a single pair of domains.
avb = corpus.sample_a_vs_b('nytimes.com', 'rt.com')

In [25]:
# Total number of (headline, domain) pairs in the two-domain sample.
len(avb)

58370

In [27]:
# Check that both domains are equally represented.
avb.label_counts()

2018-12-23 12:54:20,121 | INFO : Gathering label counts.
100%|██████████| 58370/58370 [00:00<00:00, 732213.75it/s]


Counter({'nytimes.com': 29185, 'rt.com': 29185})

In [28]:
# Peek at the first ten (Headline, label) pairs.
avb[:10]

[(Headline<13 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<6 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<11 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<4 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<10 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<4 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<7 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<4 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<5 tokens -> nytimes.com>, 'nytimes.com'),
 (Headline<4 tokens -> nytimes.com>, 'nytimes.com')]