In [160]:
import pandas as pd

from cached_property import cached_property
from collections import Counter

from news_vec.utils import read_json_gz_lines

In [183]:
def sample_groups(df, groupby, n):
    """Draw `n` random rows from every group of `df`.

    Args:
        df: DataFrame to sample from.
        groupby: Column name (or anything else `DataFrame.groupby` accepts)
            that defines the groups.
        n: Number of rows to draw from each group, without replacement.

    Returns:
        The result of applying `DataFrame.sample(n)` to each group via
        `groupby(...).apply(...)`.

    Note:
        `DataFrame.sample` raises `ValueError` when a group holds fewer than
        `n` rows, because sampling is done without replacement.
    """
    def draw(group):
        # Same per-group draw for every group; no seed is fixed here.
        return group.sample(n)

    grouped = df.groupby(groupby)
    return grouped.apply(draw)

In [189]:
def make_pairs(df):
    """Turn a frame of articles into a list of `[article_id, domain]` pairs.

    Args:
        df: DataFrame that has `article_id` and `domain` columns; any other
            columns and the index are ignored.

    Returns:
        A list with one two-element list per row, in row order, with the
        article id first and the domain second.
    """
    pair_columns = ['article_id', 'domain']
    pair_frame = df[pair_columns]
    return pair_frame.values.tolist()

In [319]:
class Corpus:
    """Link records held in a DataFrame, with domain-balanced sampling.

    Every sampling method draws the same number of unique articles from each
    domain it covers, namely `min_domain_count`, and returns them as
    `[article_id, domain]` pairs.
    """

    def __init__(self, root):
        """Read every record found under `root` into `self.links`.

        Args:
            root: Location handed to `read_json_gz_lines`. The records must
                provide `domain` and `article_id` fields, since the rest of
                the class selects those two columns.
        """
        links_iter = read_json_gz_lines(root)
        self.links = pd.DataFrame(links_iter)

    @cached_property
    def unique_articles(self):
        """One row per distinct (domain, article_id) pair in `self.links`.
        """
        return self.links[['domain', 'article_id']].drop_duplicates()

    @cached_property
    def min_domain_count(self):
        """Smallest number of unique articles per domain.
        """
        return self.unique_articles.groupby('domain').size().min()

    def _sample_pairs(self, articles):
        """Draw `min_domain_count` rows per domain and return them as pairs.

        Args:
            articles: Subset of `unique_articles` to sample from.

        Returns:
            List of `[article_id, domain]` pairs, grouped by domain.
        """
        n = self.min_domain_count

        # Iterate the groups instead of using groupby.apply: each group frame
        # keeps its `domain` column, which make_pairs needs, independent of
        # how the installed pandas treats grouping columns inside apply.
        samples = [group.sample(n) for _, group in articles.groupby('domain')]

        # pd.concat rejects an empty list, so fall back to an empty slice.
        rows = pd.concat(samples) if samples else articles.iloc[:0]

        return make_pairs(rows)

    def sample_all_vs_all(self):
        """Sample evenly from all domains.

        Returns:
            List of `[article_id, domain]` pairs with `min_domain_count`
            entries for every domain.
        """
        return self._sample_pairs(self.unique_articles)

    def sample_a_vs_b(self, a, b):
        """Sample evenly from two domains.

        Args:
            a: First domain name.
            b: Second domain name.

        Returns:
            List of `[article_id, domain]` pairs with `min_domain_count`
            entries for each of the two domains. The count is the minimum
            over all domains in the corpus, not just over `a` and `b`.

        Raises:
            ValueError: If `a` or `b` has no articles in the corpus, since
                the sample would otherwise silently cover a single domain.
        """
        articles = self.unique_articles
        selected = articles[articles['domain'].isin([a, b])]

        missing = {a, b} - set(selected['domain'])
        if missing:
            raise ValueError(
                'Unknown domain(s): {}'.format(sorted(missing)))

        return self._sample_pairs(selected)

In [320]:
# Load the link records at this path into a Corpus (read via
# read_json_gz_lines inside Corpus.__init__).
corpus = Corpus('../data/clf-links.json/')

In [321]:
# Per-domain sample size: the smallest unique-article count of any domain.
corpus.min_domain_count

29185

In [323]:
# Balanced sample across every domain in the corpus.
ava = corpus.sample_all_vs_all()

In [324]:
# Total number of sampled [article_id, domain] pairs.
len(ava)

466960

In [325]:
# Tally the pairs by domain to confirm the sample is balanced.
Counter(d for _, d in ava)

Counter({'apnews.com': 29185,
         'bloomberg.com': 29185,
         'breitbart.com': 29185,
         'buzzfeed.com': 29185,
         'cnn.com': 29185,
         'dailycaller.com': 29185,
         'dailykos.com': 29185,
         'foxnews.com': 29185,
         'huffingtonpost.com': 29185,
         'npr.org': 29185,
         'nytimes.com': 29185,
         'rt.com': 29185,
         'sputniknews.com': 29185,
         'thehill.com': 29185,
         'washingtonpost.com': 29185,
         'wsj.com': 29185})

In [326]:
# Balanced sample restricted to two domains.
avb = corpus.sample_a_vs_b('nytimes.com', 'rt.com')

In [327]:
# Total number of sampled [article_id, domain] pairs for the two domains.
len(avb)

58370

In [328]:
# Tally the pairs by domain to confirm both domains are equally represented.
Counter(d for _, d in avb)

Counter({'nytimes.com': 29185, 'rt.com': 29185})