In [27]:
import attr
import os
import ujson
import bz2
import random

import pandas as pd

from glob import glob
from tqdm import tqdm
from multiprocessing import Pool
from itertools import islice
from functools import partial
from boltons.iterutils import pairwise
from collections import Counter
from scipy import stats
from sklearn.model_selection import train_test_split

from IPython.display import display, Markdown

In [2]:
def zip_offset(seq):
    """Pair each item of `seq` with its relative position.

    Yields `(item, offset)` tuples where `offset` runs from 0 for the
    first item to 1 for the last one. `seq` must support `len()`. A
    one-item sequence yields an offset of 0, and an empty sequence
    yields nothing.
    """
    last_index = len(seq) - 1

    for position, element in enumerate(seq):
        if last_index:
            yield element, position / last_index
        else:
            # Single-item input: avoid dividing by zero.
            yield element, 0

In [3]:
def map_segment(func, path):
    """Run `func` over every JSON record in one bz2 segment file.

    The file at `path` is opened with `bz2.open`; each line is decoded
    with `ujson.loads` and passed to `func`. Returns the list of `func`
    return values, in file order.
    """
    with bz2.open(path) as segment:
        return [func(ujson.loads(raw_line)) for raw_line in segment]

In [4]:
@attr.s
class Corpus:
    """A directory of `*.bz2` segment files that can be mapped in parallel.
    """

    # Directory that is searched for `*.bz2` segment files.
    root = attr.ib()

    def paths(self):
        """Return the paths of all `*.bz2` files directly under `root`.
        """
        pattern = os.path.join(self.root, '*.bz2')
        return glob(pattern)

    def map_novels(self, func, shuffle=True):
        """Lazily apply `func` to every record of every segment file.

        Each segment is processed by `map_segment` in a
        `multiprocessing.Pool` worker, and the per-record results are
        yielded one at a time as segments finish (`imap_unordered`, so
        segment order is not preserved). When `shuffle` is true the
        segment paths are shuffled in place before being dispatched.
        """
        segment_paths = self.paths()

        if shuffle:
            random.shuffle(segment_paths)

        per_segment = partial(map_segment, func)

        with Pool() as pool:
            for batch in pool.imap_unordered(per_segment, segment_paths):
                yield from batch

In [5]:
# Notebook-wide corpus handle; match_suffixes() reads this global.
corpus = Corpus(root='../../data/chicago-bins-tokens.json/')

In [113]:
def _match_suffixes(root, group, o1, o2, novel, width=5):
    """Collect the token windows that follow `root` inside an offset band.

    `novel` is a mapping with a `'tokens'` sequence and an `'identifier'`.
    A token matches when its lower-cased form equals `root` and its 0-1
    offset lies strictly between `o1` and `o2` (tokens sitting exactly on
    either bound are skipped).

    Returns a list of `(identifier, group, root, suffix)` tuples, where
    `suffix` holds up to `width` tokens after the match; it is shorter
    when the match is near the end of the token list.
    """
    tokens = novel['tokens']

    return [
        (novel['identifier'], group, root, tokens[pos + 1:pos + 1 + width])
        for pos, (token, offset) in enumerate(zip_offset(tokens))
        if o1 < offset < o2 and token.lower() == root
    ]

In [104]:
def match_suffixes(root, group, o1, o2, width=5, skim=None):
    """Gather suffix windows for `root` across the global `corpus`.

    Runs `_match_suffixes` over the corpus in parallel and flattens the
    per-record match lists into a single list of
    `(identifier, group, root, suffix)` tuples. `skim` caps how many
    records are consumed (`None` means all of them); progress is shown
    with `tqdm`.
    """
    per_novel = partial(_match_suffixes, root, group, o1, o2, width=width)

    # Each item is the list of matches for one record.
    batches = islice(corpus.map_novels(per_novel), skim)

    matches = []
    for batch in tqdm(batches):
        matches.extend(batch)

    return matches

In [117]:
# Group 0: windows after "a" with offset strictly between 0 and 0.05.
a0 = match_suffixes(root='a', group=0, o1=0, o2=0.05, skim=1000)

1000it [00:35, 28.34it/s]


In [118]:
# Group 1: windows after "a" with offset strictly between 0.95 and 1.
a1 = match_suffixes(root='a', group=1, o1=0.95, o2=1, skim=1000)

1000it [00:28, 34.67it/s]


In [119]:
# Group 0: windows after "the" with offset strictly between 0 and 0.05.
the0 = match_suffixes(root='the', group=0, o1=0, o2=0.05, skim=1000)

1000it [00:32, 30.76it/s]


In [120]:
# Group 1: windows after "the" with offset strictly between 0.95 and 1.
the1 = match_suffixes(root='the', group=1, o1=0.95, o2=1, skim=1000)

1000it [00:57, 17.37it/s]


In [121]:
# Stack the four samples into one frame, one row per matched window.
rows = [*a0, *a1, *the0, *the1]
df = pd.DataFrame(
    rows,
    columns=('chicago_id', 'group', 'root', 'suffix'),
)

In [122]:
# Preview only the first rows; the full frame has hundreds of thousands of rows.
df.head(10)

Unnamed: 0,chicago_id,group,root,suffix
0,3504,0,a,"[dispassionate, ,, though, sympathetic, ,]"
1,3504,0,a,"[whole, ;, each, has, taken]"
2,3504,0,a,"[composite, and, proportionate, presentation, of]"
3,3504,0,a,"[fairly, correct, idea, of, what]"
4,3504,0,a,"[long, time, been, and, is]"
5,3504,0,a,"[sphinx, to, the, whites, .]"
6,3504,0,a,"[veil, had, been, drawn, aside]"
7,3504,0,a,"[view, of, the, inner, life]"
8,3504,0,a,"[pressure, which, ,, in, New]"
9,3504,0,a,"[glimpse, behind, the, scenes, of]"


In [123]:
# Rows sampled from the first offset band (group 0).
df0 = df[df.group==0]

# Fixed seed so that re-running the notebook writes the same train/test
# files; without it the split is reshuffled differently on every run.
train0, test0 = train_test_split(df0, random_state=0)

# One JSON record per line.
train0.to_json('train0.json', orient='records', lines=True)
test0.to_json('test0.json', orient='records', lines=True)

In [124]:
# Rows sampled from the second offset band (group 1).
df1 = df[df.group==1]

# Fixed seed so that re-running the notebook writes the same train/test
# files; without it the split is reshuffled differently on every run.
train1, test1 = train_test_split(df1, random_state=0)

# One JSON record per line.
train1.to_json('train1.json', orient='records', lines=True)
test1.to_json('test1.json', orient='records', lines=True)

In [125]:
# Number of group-0 rows.
df0.shape[0]

429708

In [126]:
# Number of group-1 rows.
df1.shape[0]

393167