In [217]:
import pandas as pd

from collections import UserDict, UserList, Counter
from tqdm import tqdm
from cached_property import cached_property
from itertools import islice, chain
from functools import lru_cache
from torch.utils.data import random_split

from news_vec.utils import read_json_gz_lines
from news_vec import logger

In [40]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# (the old names were later removed), so use whichever name this install
# ships. Falling back to the original name keeps a genuinely missing style a
# loud error instead of a silent no-op.
muted_style = next(
    (name for name in ('seaborn-muted', 'seaborn-v0_8-muted')
     if name in mpl.style.available),
    'seaborn-muted',
)
mpl.style.use(muted_style)
sns.set(style="whitegrid")

%matplotlib inline

In [213]:
class Corpus:
    """In-memory headline corpus with count-balanced sampling helpers.

    The headlines live in ``self.df``; every sampling helper relies on its
    ``domain`` and ``ts_bucket`` columns and draws ``min_db_count`` rows from
    each group, so all groups contribute the same number of rows.
    """

    def __init__(self, headline_root, skim=None):
        """Read headline df.

        Args:
            headline_root: Location handed to ``read_json_gz_lines``.
            skim (int, optional): Read at most this many records; ``None``
                reads everything.
        """
        logger.info('Reading headlines.')

        lines = islice(read_json_gz_lines(headline_root), skim)
        self.df = pd.DataFrame(list(tqdm(lines)))

    def __repr__(self):

        pattern = '{cls_name}<{hl_count} headlines>'

        return pattern.format(
            cls_name=self.__class__.__name__,
            hl_count=len(self.df),
        )

    @cached_property
    def min_db_count(self):
        """Size of the smallest (domain, ts_bucket) group in ``self.df``."""
        return self.df.groupby(['domain', 'ts_bucket']).size().min()

    def _memo(self, key, build):
        """Return the value cached on this instance for `key`, building it once.

        The cache is stored on the instance rather than in a class-level
        ``lru_cache``: a cache keyed on ``self`` would keep every corpus (and
        its DataFrame) alive for as long as the class exists.
        """
        cache = vars(self).setdefault('_filter_cache', {})
        if key not in cache:
            cache[key] = build()
        return cache[key]

    def _sample_groups(self, groups, random_state=None):
        """Draw ``min_db_count`` rows from every group of a groupby object."""
        n = self.min_db_count
        return groups.apply(lambda x: x.sample(n, random_state=random_state))

    def sample_all_vs_all(self, random_state=None):
        """Sample ``min_db_count`` rows from every (domain, ts_bucket) group.

        Args:
            random_state: Forwarded to ``DataFrame.sample`` for each group;
                ``None`` (the default) gives a fresh random draw.
        """
        groups = self.df.groupby(['domain', 'ts_bucket'])
        return self._sample_groups(groups, random_state)

    def filter_ab(self, d1, d2):
        """Rows of domains `d1`/`d2`, grouped by (domain, ts_bucket).

        The groupby object is cached per (d1, d2) on this instance, so it
        reflects ``self.df`` as it was on the first call.
        """
        def build():
            rows = self.df[self.df.domain.isin([d1, d2])]
            return rows.groupby(['domain', 'ts_bucket'])

        return self._memo(('ab', d1, d2), build)

    def sample_ab(self, d1, d2, random_state=None):
        """Sample ``min_db_count`` rows per (domain, ts_bucket) for two domains.

        Args:
            d1, d2: Domain values to keep.
            random_state: Forwarded to ``DataFrame.sample`` for each group.
        """
        return self._sample_groups(self.filter_ab(d1, d2), random_state)

    def filter_ab_ts(self, d1, d2, bucket):
        """Rows of domains `d1`/`d2` inside one ``ts_bucket``, grouped by domain.

        The groupby object is cached per (d1, d2, bucket) on this instance.
        """
        def build():
            keep = self.df.domain.isin([d1, d2]) & (self.df.ts_bucket == bucket)
            return self.df[keep].groupby('domain')

        return self._memo(('ab_ts', d1, d2, bucket), build)

    def sample_ab_ts(self, d1, d2, bucket, random_state=None):
        """Sample ``min_db_count`` rows per domain within a single bucket.

        Args:
            d1, d2: Domain values to keep.
            bucket: ``ts_bucket`` value to keep.
            random_state: Forwarded to ``DataFrame.sample`` for each group.
        """
        return self._sample_groups(self.filter_ab_ts(d1, d2, bucket), random_state)

In [226]:
class HeadlineDataset(UserList):
    """List of (record, label) pairs partitioned into train/val/test splits.

    The list contents (``len``, indexing, iteration) are all pairs in
    train, val, test order; the individual splits are available as the
    ``train``, ``val`` and ``test`` attributes.
    """

    @classmethod
    def from_df(cls, df, label_col='domain', **kwargs):
        """Build a dataset from a DataFrame, one pair per row.

        Args:
            df: Frame whose rows become record dicts via ``to_dict('records')``.
            label_col (str): Column whose value is used as each pair's label.
            **kwargs: Forwarded to the constructor (e.g. ``test_frac``).
        """
        pairs = [(d, d[label_col]) for d in df.to_dict('records')]
        return cls(pairs, **kwargs)

    def __init__(self, pairs, test_frac=0.1):
        """Set train/val/test splits.

        Args:
            pairs: Sized, indexable collection of (record, label) pairs.
            test_frac (float): Fraction of pairs assigned to each of the
                val and test splits; the remainder is the train split.

        Raises:
            ValueError: If `test_frac` yields a negative split size.
        """
        test_size = round(len(pairs) * test_frac)
        train_size = len(pairs) - (test_size * 2)

        # A negative size still sums to len(pairs), so random_split would not
        # reject it; fail here instead of producing malformed splits.
        if test_size < 0 or train_size < 0:
            raise ValueError(
                'test_frac={!r} gives invalid split sizes {}/{}/{} for {} pairs'
                .format(test_frac, train_size, test_size, test_size, len(pairs))
            )

        sizes = (train_size, test_size, test_size)
        self.train, self.val, self.test = random_split(pairs, sizes)

        # Populate UserList.data so len(), indexing, `in` and == work, in the
        # same order that __iter__ yields.
        super().__init__(chain(self.train, self.val, self.test))

    def __iter__(self):
        """Iterate over the train, then val, then test pairs."""
        return chain(self.train, self.val, self.test)

    def __repr__(self):

        pattern = '{cls_name}<{train_size}/{val_size}/{test_size}>'

        return pattern.format(
            cls_name=self.__class__.__name__,
            train_size=len(self.train),
            val_size=len(self.val),
            test_size=len(self.test),
        )

In [224]:
# Load the full headline corpus (skim=None reads every record).
c = Corpus('../data/clf-headlines.json/', skim=None)

2018-12-27 12:20:11,566 | INFO : Reading headlines.
1225511it [00:44, 27273.37it/s] 


In [227]:
# Size of the smallest (domain, ts_bucket) group; every sampler draws this many rows per group.
c.min_db_count

2686

In [234]:
# Draw an equal number of rows per domain from time bucket 0, then wrap the
# sample as a dataset labelled by domain.
ab_sample = c.sample_ab_ts('nytimes.com', 'apnews.com', 0)
ds = HeadlineDataset.from_df(ab_sample)

In [235]:
# Display the dataset repr, which reports the train/val/test split sizes.
ds

HeadlineDataset<4298/537/537>

In [239]:
# Time a per-domain draw restricted to ts_bucket 0; only the first 10 rows are displayed.
%time c.sample_ab_ts('nytimes.com', 'apnews.com', 0).head(10)

CPU times: user 19.2 ms, sys: 1.83 ms, total: 21 ms
Wall time: 19.4 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,article_id,clf_tokens,domain,impressions,tokens,ts_bucket
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
apnews.com,932722,601295464600,"[airstrikes, killed, #, is, group, linked, mil...",apnews.com,12257,"[Philippines, :, Airstrikes, killed, 15, IS, g...",0
apnews.com,1125824,1365799605635,"[somaliland, asks, us, for, exemption, to, tru...",apnews.com,37083,"[Somaliland, asks, US, for, exemption, to, Tru...",0
apnews.com,1021123,1236950591462,"[the, most, unique, one, day, job, in, sports]",apnews.com,116313,"[Emergency, goalie, :, The, most, unique, one,...",0
apnews.com,673189,77309416169,"[baby, chimp, abandoned, by, mom, to, join, ka...",apnews.com,51349,"[Baby, chimp, abandoned, by, mom, to, join, Ka...",0
apnews.com,202236,283467877822,"[de, sacerdotes, australianos, acusados, de, a...",apnews.com,77516,"[7, %, de, sacerdotes, australianos, ,, acusad...",0
apnews.com,326673,867583417162,"[chris, brown, to, box, soulja, boy, over, soc...",apnews.com,13605,"[Chris, Brown, to, box, Soulja, Boy, over, soc...",0
apnews.com,1120554,1614907722435,"[travis, snider, agrees, to, minor, league, de...",apnews.com,108875,"[Travis, Snider, agrees, to, minor, league, de...",0
apnews.com,213621,60129561728,"[torres, taken, to, hospital, after, head, inj...",apnews.com,12135,"[Torres, taken, to, hospital, after, head, inj...",0
apnews.com,297547,1271310335695,"[japan, pm, pledges, unchanging, allegiance, t...",apnews.com,207968,"[Japan, PM, pledges, ', unchanging, ', allegia...",0
apnews.com,18645,111669154341,"[new, leader, orders, justice, to, do, our, sw...",apnews.com,36419,"[The, Latest, :, New, leader, orders, Justice,...",0


In [241]:
# Time a draw across every (domain, ts_bucket) group of the two domains; only the first 10 rows are displayed.
%time c.sample_ab('nytimes.com', 'apnews.com').head(10)

CPU times: user 170 ms, sys: 8.25 ms, total: 178 ms
Wall time: 177 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,article_id,clf_tokens,domain,impressions,tokens,ts_bucket
domain,ts_bucket,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
apnews.com,0,1153273,343597392855,"[lyon, s, defense, folds, again, in, #, #, def...",apnews.com,109828,"[Lyon, 's, defense, folds, again, in, 2, 1, de...",0
apnews.com,0,320551,824633721167,"[at, least, #, killed, in, ethnic, clashes, in...",apnews.com,23807,"[Govt, :, At, least, 13, killed, in, ethnic, c...",0
apnews.com,0,907780,472446440775,"[china, suspends, coal, imports, from, north, ...",apnews.com,21711,"[China, suspends, coal, imports, from, North, ...",0
apnews.com,0,440927,171798698465,"[walker, s, state, of, state, to, include, com...",apnews.com,23575,"[AP, News, :, Walker, 's, State, of, State, to...",0
apnews.com,0,130997,670014922942,"[senate, confirms, trump, s, nominee, for, us,...",apnews.com,168999,"[Senate, confirms, Trump, 's, nominee, for, US...",0
apnews.com,0,603477,1133871395779,"[bonds, clemens, making, slow, gains, with, ch...",apnews.com,30405,"[Bonds, ,, Clemens, making, slow, gains, with,...",0
apnews.com,0,1128338,335007495429,"[chimera, greiss, lead, islanders, to, #, #, w...",apnews.com,109803,"[Chimera, ,, Greiss, lead, Islanders, to, 3, 1...",0
apnews.com,0,197996,833223679361,"[the, audacity, of, hype]",apnews.com,11204,"[AP, FACT, CHECK, :, The, audacity, of, hype]",0
apnews.com,0,24626,51539645070,"[protests, erupt, after, kyrgyzstan, arrests, ...",apnews.com,46876,"[Protests, erupt, after, Kyrgyzstan, arrests, ...",0
apnews.com,0,1000069,549755816476,"[senators, want, materials, saved, for, russia...",apnews.com,26212,"[AP, source, :, Senators, want, materials, sav...",0
