In [160]:
import pandas as pd

from cached_property import cached_property
from collections import Counter

from news_vec.utils import read_json_gz_lines

In [135]:
def sample_link_groups(df, groupby, n):
    """Sample `n` rows per group and return them as [article_id, domain] pairs.

    Args:
        df: DataFrame with at least `article_id` and `domain` columns.
        groupby: Anything `DataFrame.groupby` accepts as a grouping key.
        n: Number of rows to draw (without replacement) from every group.

    Returns:
        A list of two-element lists, one `[article_id, domain]` per sampled row.
    """
    def draw(group):
        return group.sample(n)

    sampled = df.groupby(groupby).apply(draw)
    pair_columns = ['article_id', 'domain']
    return sampled[pair_columns].values.tolist()

In [183]:
def sample_groups(df, groupby, n):
    """Return a frame holding `n` randomly drawn rows from each group of `df`.

    Args:
        df: DataFrame to sample from.
        groupby: Anything `DataFrame.groupby` accepts as a grouping key.
        n: Number of rows to draw (without replacement) from every group;
            `DataFrame.sample` raises if a group has fewer than `n` rows.

    Returns:
        The per-group samples combined by `GroupBy.apply`.
    """
    grouped = df.groupby(groupby)

    def take_n(group):
        return group.sample(n)

    return grouped.apply(take_n)

In [189]:
def make_pairs(df):
    """Project `df` onto its `article_id` and `domain` columns as nested lists.

    Returns:
        A list with one `[article_id, domain]` entry per row of `df`,
        in row order.
    """
    pair_columns = ['article_id', 'domain']
    pair_frame = df[pair_columns]
    return pair_frame.values.tolist()

In [205]:
class Links:
    """Balanced samplers over a table of links.

    The rows are loaded into `self.df`; the methods below rely on the
    columns `article_id`, `domain` and `ts_bucket`. Every sampler returns a
    list of `[article_id, domain]` pairs (see `make_pairs`) in which each
    class contributes the same number of rows.
    """

    def __init__(self, root):
        """Load every record found under `root` into a DataFrame.

        Args:
            root: Location handed to `read_json_gz_lines`.
                NOTE(review): presumably a directory of gzipped JSON-lines
                files -- confirm against `news_vec.utils`.
        """
        # Honour the caller-supplied path instead of a hard-coded location.
        lines_iter = read_json_gz_lines(root)
        self.df = pd.DataFrame(lines_iter)

    @cached_property
    def min_domain_count(self):
        """Row count (non-null `article_id`) of the smallest domain."""
        return self.df.groupby('domain').count()['article_id'].min()

    @cached_property
    def min_domain_bucket_count(self):
        """Row count (non-null `article_id`) of the smallest (domain, ts_bucket) cell."""
        return self.df.groupby(['domain', 'ts_bucket']).count()['article_id'].min()

    def _sample_one_vs_rest(self, rows, a, n):
        """Balance domain `a` against a pooled 'other' class, `n` rows each."""
        # First equalise the domains so that no single large domain
        # dominates the pooled 'other' class.
        rows = sample_groups(rows, 'domain', n)
        groupby = rows.domain.apply(lambda x: x if x == a else 'other')
        rows = sample_groups(rows, groupby, n)
        return make_pairs(rows)

    def sample_all_vs_all(self):
        """Sample `min_domain_count` rows from every domain."""
        rows = sample_groups(self.df, 'domain', self.min_domain_count)
        return make_pairs(rows)

    def sample_a_vs_b(self, a, b):
        """Sample `min_domain_count` rows from each of the domains `a` and `b`."""
        rows = self.df[self.df.domain.isin([a, b])]
        rows = sample_groups(rows, 'domain', self.min_domain_count)
        return make_pairs(rows)

    def sample_a_vs_b_ts(self, a, b, bucket):
        """Sample `min_domain_bucket_count` rows from `a` and from `b` within one `ts_bucket`."""
        # The comparison must be parenthesised: `&` binds tighter than `==`,
        # so `mask & ts_bucket == bucket` would evaluate
        # `(mask & ts_bucket) == bucket` and select the wrong rows.
        in_domains = self.df.domain.isin([a, b])
        in_bucket = (self.df.ts_bucket == bucket)
        rows = self.df[in_domains & in_bucket]
        rows = sample_groups(rows, 'domain', self.min_domain_bucket_count)
        return make_pairs(rows)

    def sample_one_vs_all(self, a):
        """Sample `min_domain_count` rows of domain `a` and as many from all other domains pooled."""
        return self._sample_one_vs_rest(self.df, a, self.min_domain_count)

    def sample_one_vs_all_ts(self, a, bucket):
        """Like `sample_one_vs_all`, restricted to one `ts_bucket` and sized by `min_domain_bucket_count`."""
        rows = self.df[self.df.ts_bucket == bucket]
        return self._sample_one_vs_rest(rows, a, self.min_domain_bucket_count)

In [206]:
# Build the sampler; Links loads the records into a DataFrame (links.df).
links = Links('../data/clf-links.json/')

In [192]:
# Size of the smallest domain; used as the per-class sample size by the non-bucketed samplers.
links.min_domain_count

29518

In [193]:
# Size of the smallest (domain, ts_bucket) cell; used as the per-class sample size by the *_ts samplers.
links.min_domain_bucket_count

2686

In [194]:
# Sanity check: expect (number of domains) * min_domain_count pairs.
len(links.sample_all_vs_all())

472288

In [195]:
# Sanity check: every domain should contribute exactly min_domain_count pairs.
Counter(d for _, d in links.sample_all_vs_all())

Counter({'apnews.com': 29518,
         'bloomberg.com': 29518,
         'breitbart.com': 29518,
         'buzzfeed.com': 29518,
         'cnn.com': 29518,
         'dailycaller.com': 29518,
         'dailykos.com': 29518,
         'foxnews.com': 29518,
         'huffingtonpost.com': 29518,
         'npr.org': 29518,
         'nytimes.com': 29518,
         'rt.com': 29518,
         'sputniknews.com': 29518,
         'thehill.com': 29518,
         'washingtonpost.com': 29518,
         'wsj.com': 29518})

In [196]:
# Sanity check: two domains, so expect 2 * min_domain_count pairs.
len(links.sample_a_vs_b('nytimes.com', 'apnews.com'))

59036

In [197]:
# Sanity check: only the two requested domains appear, min_domain_count pairs each.
Counter(d for _, d in links.sample_a_vs_b('nytimes.com', 'apnews.com'))

Counter({'apnews.com': 29518, 'nytimes.com': 29518})

In [198]:
# Sanity check: expect 2 * min_domain_bucket_count pairs for a single ts_bucket.
len(links.sample_a_vs_b_ts('nytimes.com', 'apnews.com', 1))

5372

In [199]:
# Sanity check: each of the two domains contributes min_domain_bucket_count pairs.
# NOTE: this only verifies class balance, not that the rows come from the requested ts_bucket.
Counter(d for _, d in links.sample_a_vs_b_ts('nytimes.com', 'apnews.com', 1))

Counter({'apnews.com': 2686, 'nytimes.com': 2686})

In [200]:
# Sanity check: target domain plus the pooled 'other' class, min_domain_count pairs each.
len(links.sample_one_vs_all('nytimes.com'))

59036

In [201]:
# Sanity check: the target domain gets min_domain_count pairs; the 'other' half
# should be spread roughly evenly over the remaining domains.
Counter(d for _, d in links.sample_one_vs_all('foxnews.com'))

Counter({'foxnews.com': 29518,
         'buzzfeed.com': 1836,
         'rt.com': 1936,
         'bloomberg.com': 1997,
         'thehill.com': 1956,
         'apnews.com': 1942,
         'dailycaller.com': 2025,
         'washingtonpost.com': 1975,
         'wsj.com': 2065,
         'sputniknews.com': 1940,
         'nytimes.com': 1963,
         'dailykos.com': 2065,
         'huffingtonpost.com': 1960,
         'npr.org': 1957,
         'cnn.com': 1915,
         'breitbart.com': 1986})

In [207]:
# Sanity check: within one ts_bucket, expect 2 * min_domain_bucket_count pairs.
len(links.sample_one_vs_all_ts('nytimes.com', 1))

5372

In [208]:
# Sanity check: the target domain gets min_domain_bucket_count pairs; the 'other'
# half should be spread roughly evenly over the remaining domains.
Counter(d for _, d in links.sample_one_vs_all_ts('foxnews.com', 1))

Counter({'foxnews.com': 2686,
         'rt.com': 193,
         'apnews.com': 186,
         'npr.org': 158,
         'sputniknews.com': 166,
         'breitbart.com': 172,
         'dailykos.com': 168,
         'nytimes.com': 193,
         'huffingtonpost.com': 176,
         'buzzfeed.com': 187,
         'bloomberg.com': 184,
         'washingtonpost.com': 194,
         'wsj.com': 178,
         'cnn.com': 175,
         'thehill.com': 171,
         'dailycaller.com': 185})