In [33]:
import warnings
import gzip
import ujson

import pandas as pd
import numpy as np

from glob import glob
from itertools import islice
from tqdm import tqdm
from boltons.iterutils import pairwise
from functools import reduce

from news_vec import logger

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: select a matplotlib style sheet, then apply seaborn's
# "whitegrid" theme via sns.set().
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-muted') - confirm 'seaborn-muted' against the pinned
# matplotlib version.
mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

# IPython magic: render figures inline in the notebook output.
%matplotlib inline

In [3]:
# Install a blanket 'ignore' filter so warnings are not shown in cell output.
warnings.simplefilter('ignore')

In [4]:
class Line:
    """A token sequence paired with a label.

    Attributes:
        tokens (list<str>): The tokens, lowercased unless `lower` is false.
        label: The label attached to the tokens.
    """

    def __init__(self, tokens, label, lower=True):
        """Store the tokens and the label.

        Args:
            tokens (list<str>): Tokens of the line.
            label: Label for the line.
            lower (bool): When true, keep a lowercased copy of the tokens;
                otherwise the `tokens` object is stored as passed in.
        """
        if lower:
            tokens = [tok.lower() for tok in tokens]

        self.tokens = tokens
        self.label = label

    def __repr__(self):
        """Short summary: class name, token count and label."""
        fields = dict(
            cls_name=self.__class__.__name__,
            token_count=len(self.tokens),
            label=self.label,
        )

        return '{cls_name}<{token_count} tokens -> {label}>'.format(**fields)

In [5]:
def read_json_lines(root, lower=True):
    """Generate `Line` objects from a directory of gzipped JSON-lines files.

    Every `*.gz` file directly under `root` is read, one JSON document per
    line. Documents whose `tokens` field is missing or empty are skipped.

    Args:
        root (str): Directory that holds the `*.gz` files.
        lower (bool): Passed through to `Line`.

    Yields: Line

    Raises:
        KeyError: If a document with tokens has no `label` key.
    """
    # glob() gives no ordering guarantee. Sort the paths so the stream - and
    # any prefix taken from it, e.g. with islice - is the same on every run.
    for path in sorted(glob('%s/*.gz' % root)):
        with gzip.open(path) as fh:
            for line in fh:

                data = ujson.loads(line)

                tokens = data.get('tokens')

                if not tokens:
                    continue

                yield Line(tokens, data['label'], lower=lower)

In [6]:
class Corpus:
    """In-memory list of the lines produced by `read_json_lines`."""

    def __init__(self, root, skim=None, lower=True):
        """Read the corpus eagerly into `self.lines`.

        Args:
            root (str): Corpus directory, handed to `read_json_lines`.
            skim (int): Stop after this many lines; `None` reads everything.
            lower (bool): Handed to `read_json_lines`.
        """
        logger.info('Parsing line corpus.')

        stream = read_json_lines(root, lower)
        capped = islice(stream, skim)

        # tqdm wraps the iterator to show progress while it is consumed.
        self.lines = [line for line in tqdm(capped)]

    def __repr__(self):
        """Short summary: class name and number of lines."""
        fields = {
            'cls_name': self.__class__.__name__,
            'line_count': len(self),
        }

        return '{cls_name}<{line_count} lines>'.format(**fields)

    def __len__(self):
        """Number of lines held."""
        return len(self.lines)

    def __iter__(self):
        """Iterate over the held lines in order."""
        return iter(self.lines)

In [7]:
# Load the corpus, stopping after the first 10,000 lines that have tokens.
c = Corpus('../data/titles-50k.json/', 10000)

2018-11-28 19:24:24,415 | INFO : Parsing line corpus.
10000it [00:00, 92926.53it/s]


In [23]:
SEP_TOKENS = {':', '-', '–', '—', '|', 'via', '[', ']'}

def scrub_paratext(tokens):
    """Try to prune out "paratext" around headlines. Hacky.

    The tokens are split at every separator in `SEP_TOKENS` (matched
    case-insensitively) and only the longest separator-free run is kept.
    When several runs tie for longest, the first one wins.

    Args:
        tokens (list<str>): Headline tokens.

    Returns:
        list<str>: `tokens` itself when it holds no separator; otherwise a
        new list with the longest run, which is empty when every token is
        a separator.
    """
    sep_idxs = [
        i for i, t in enumerate(tokens)
        if t.lower() in SEP_TOKENS
    ]

    if not sep_idxs:
        return tokens

    # Treat the positions just before the first token and just past the last
    # one as separators too. Adding both unconditionally guarantees at least
    # one segment, so a lone separator token (e.g. [':']) yields [] instead
    # of raising IndexError.
    bounds = [-1] + sep_idxs + [len(tokens)]

    # Each adjacent pair of bounds delimits one run; max() returns the first
    # of several equally wide runs.
    start, end = max(
        zip(bounds, bounds[1:]),
        key=lambda seg: seg[1] - seg[0],
    )

    return tokens[start+1:end]

In [46]:
CURLY_STRAIGHT = (
    ('“', '"'),
    ('”', '"'),
    ('‘', "'"),
    ('’', "'"),
)

def uncurl_quotes(text):
    """Replace curly quote characters in `text` with straight ones.

    Each (curly, straight) pair in `CURLY_STRAIGHT` is applied in turn.
    """
    return reduce(
        lambda acc, pair: acc.replace(pair[0], pair[1]),
        CURLY_STRAIGHT,
        text,
    )

In [47]:
QUOTES = {'\'', '"'}

def scrub_quotes(tokens):
    """Drop tokens that are a lone quote mark, curly or straight.

    Returns a new list; the remaining tokens are kept unchanged.
    """
    kept = []

    for tok in tokens:
        # Normalize before the membership test so curly quotes match too.
        if uncurl_quotes(tok) in QUOTES:
            continue
        kept.append(tok)

    return kept

In [52]:
def clean_headline(tokens):
    """Clean headline tokens: prune paratext first, then drop quote tokens."""
    return scrub_quotes(scrub_paratext(tokens))

In [53]:
# Spot-check the cleaner on the first 500 corpus lines, printing one
# "<label> | <cleaned headline>" row per line.
for l in c.lines[:500]:
    
    tokens = clean_headline(l.tokens)
    
    print(l.label, '|', ' '.join(tokens))

wsj.com | the new moneyball , with lots and lots of money
nytimes.com | singer aaron carter , girlfriend arrested in georgia
washingtontimes.com | reinstate chelsea manning as fellow
rawstory.com | disease outbreaks begin in puerto rico even as trump attacks its citizens on twitter
cnn.com | trump has heated exchange with australian leader , sources say
theguardian.com | argentinian raid finds country's largest haul of nazi artefacts
bbc.co.uk | pasta recipes
rawstory.com | joe kennedy's shiny lips during sotu response mocked
thetimes.co.uk | the best silver jewellery
msn.com | scared but defiant , barcelona marches to reclaim city from terrorists
washingtonpost.com | trump called the news media an enemy of the american people . here s a history of the term .
foxnews.com | house intelligence committee nearing end of russia probe
breitbart.com | the process that elected trump was not legitimate
usatoday.com | bills sign cb lorenzo doss to practice squad after thanksgiving fallout with b