In [8]:
import ujson
import math

import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter
from scipy import stats
from textblob import TextBlob

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [9]:
def tokenize(text):
    """Split text into sentences of word strings.

    Returns a list with one entry per TextBlob sentence; each entry is the
    list of that sentence's words converted with ``str``.
    """
    sentences = []
    for sentence in TextBlob(text).sentences:
        sentences.append(list(map(str, sentence.words)))
    return sentences

In [3]:
def zip_offset(seq):
    """Yield (item, offset) pairs, where offset is the item's relative position.

    The first item gets 0 and the last item gets 1; a one-item sequence
    yields offset 0. ``seq`` must support ``len``.
    """
    last_index = len(seq) - 1
    for index, element in enumerate(seq):
        if last_index:
            position = index / last_index
        else:
            # Single-item sequence: avoid dividing by zero.
            position = 0
        yield element, position

In [4]:
def zip_bin(seq, bin_count):
    """Yield (item, bin index) pairs for ``seq`` split into ``bin_count`` bins.

    The bin index is the floor of the item's 0-1 offset times ``bin_count``;
    an offset of 1 or more is assigned to the last bin.
    """
    last_bin = bin_count - 1
    for element, position in zip_offset(seq):
        if position < 1:
            index = math.floor(position * bin_count)
        else:
            index = last_bin
        yield element, index

In [26]:
def read_storycorps(path, min_chars=1000):
    """Yield a Document for each archive entry with a long enough summary.

    Args:
        path: Path to a JSON file holding a mapping of url -> metadata dict.
        min_chars: An entry is kept only when its 'written_summary' is
            longer than this many characters.

    Yields:
        Document(url, sentences), where sentences is the tokenized
        'written_summary' of the entry.
    """
    # Read as UTF-8 rather than the platform default, and finish with the
    # file before yielding anything so the handle is not held open for as
    # long as the generator stays alive.
    with open(path, encoding='utf-8') as fh:
        data = ujson.load(fh)

    for url, md in data.items():
        # A missing key or an explicit null both count as "no summary".
        summary = md.get('written_summary') or ''
        if len(summary) > min_chars:
            yield Document(url, tokenize(summary))

In [27]:
class Document:
    """A text with an identifier, stored as a list of sentences of tokens."""

    def __init__(self, id, sents):
        self.sents = sents
        self.id = id

    def __repr__(self):
        class_name = type(self).__name__
        sentence_count = len(self.sents)
        return '%s<%d sentences>' % (class_name, sentence_count)

    def tokens(self):
        """Return all tokens of all sentences, lowercased, as one flat list."""
        flat = []
        for sentence in self.sents:
            for token in sentence:
                flat.append(token.lower())
        return flat

In [28]:
class Corpus:
    """A collection of documents."""

    def __init__(self, docs):
        self.docs = docs

    @classmethod
    def from_storycorps(cls, path):
        """Build a corpus from every document read_storycorps yields for path."""
        documents = list(read_storycorps(path))
        return cls(documents)

    def token_counts(self):
        """Return a Counter of token frequencies across all documents."""
        tally = Counter()
        for document in self.docs:
            tally.update(document.tokens())
        return tally

In [29]:
# Load the StoryCorps archive into a Corpus; read_storycorps only yields
# entries whose 'written_summary' passes its length filter.
c = Corpus.from_storycorps('../../data/story_corps_archive_content.json')

In [30]:
len(c.docs)

667

In [31]:
# Corpus-wide frequency of every (lowercased) token.
counts = c.token_counts()

In [32]:
counts.most_common(10)

[('the', 4645),
 ('and', 4094),
 ('to', 2979),
 ('in', 2960),
 ('of', 2618),
 ('a', 2289),
 ('her', 1783),
 ('i', 1501),
 ('was', 1454),
 ('he', 1432)]

In [33]:
# Vocabulary: the 1000 most frequent tokens, most frequent first.
vocab = [token for token, _count in counts.most_common(1000)]
# Token -> row index into the per-token count matrix.
vtoi = dict(zip(vocab, range(len(vocab))))

In [39]:
# Count matrix: one row per vocabulary token, one column per position bin
# (5 bins).
bin_counts = np.zeros((len(vocab), 5))

In [40]:
# Reset first: this cell accumulates into bin_counts, so without the reset a
# second run of the cell would add every count again.
bin_counts.fill(0)

# For each document, bin every token by its relative position in the document
# and count occurrences of vocabulary tokens per bin.
for doc in tqdm(c.docs):
    for token, bi in zip_bin(doc.tokens(), bin_counts.shape[1]):
        if token in vtoi:
            bin_counts[vtoi[token], bi] += 1

100%|██████████| 667/667 [00:00<00:00, 3500.54it/s]


In [41]:
# Score every (token, bin) pair with a log-likelihood (G) statistic comparing
# the token's observed count in the bin against the count expected if its
# occurrences were spread evenly over all bins.
n_bins = bin_counts.shape[1]

# Total vocabulary-token count per bin; it depends only on the bin, so it is
# computed once instead of once per (token, bin) pair.
bin_totals = bin_counts.sum(axis=0)

rows = []
for vi, token in enumerate(tqdm(vocab)):

    # Expected per-bin count for this token under an even spread; it depends
    # only on the token, so it is computed once per row.
    c_exp = bin_counts[vi].sum() / n_bins

    for bi in range(n_bins):

        b_total = bin_totals[bi]
        c_obs = bin_counts[vi][bi]

        # Two-cell comparison: [this token, every other token in the bin].
        # Both the observed and the expected list sum to b_total.
        s, _ = stats.power_divergence(
            [c_obs, b_total - c_obs],
            [c_exp, b_total - c_exp],
            lambda_='log-likelihood',
        )

        # delta > 0 means the token is over-represented in this bin.
        rows.append((token, bi, c_obs - c_exp, s))

100%|██████████| 1000/1000 [00:00<00:00, 1672.50it/s]


In [42]:
# Long-format table: one row per (token, bin) with the observed-minus-expected
# count ('delta') and the test statistic ('score').
df = pd.DataFrame(rows, columns=('token', 'bin', 'delta', 'score'))

In [43]:
# For each bin, print the 20 highest-scoring tokens whose observed count is
# above the expected count.
for bi in range(bin_counts.shape[1]):
    in_bin = df.bin == bi
    over_expected = df.delta > 0
    bdf = df[in_bin & over_expected].sort_values('score', ascending=False)
    top_tokens = bdf.head(20).token
    print(' '.join(top_tokens))

in interview november conducted interviews my this interviewed about 16 2016 2015 born was 15 grandfather i han 2017 his
the his and during aircraft talks humor to as comfortable he a smart transition ability amazing sense adults soon being
he also hardest together they the and to under she been which as nuckolls festival why given of has poor
end artists same strong strength bomb base loves forties for training 1940s presidential election ma think presented street gave minute
places organizations keywords participants n/a university school thegreatlisten2016 san texas new music marriage united war ca washington folk love il
