In [19]:
import ujson
import math

import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter
from scipy import stats

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so pick whichever one this
# installation provides (falling back to the historical name on old versions).
_muted_style = (
    'seaborn-v0_8-muted'
    if 'seaborn-v0_8-muted' in mpl.style.available
    else 'seaborn-muted'
)
mpl.style.use(_muted_style)

# Apply seaborn's theme with the white-grid axes style on top.
sns.set(style="whitegrid")

%matplotlib inline

In [3]:
def zip_offset(seq):
    """Pair each element of `seq` with its relative position.

    The position runs from 0 for the first element to 1 for the last one.
    A one-element sequence yields the integer 0 as its offset, and an empty
    sequence yields nothing.

    `seq` must support `len()`.
    """
    last_index = len(seq) - 1
    for index, element in enumerate(seq):
        if last_index:
            yield element, index / last_index
        else:
            # Single element: there is no span to divide by.
            yield element, 0

In [4]:
def zip_bin(seq, bin_count):
    """Pair each element of `seq` with the index of the bin it falls into.

    The element's 0-1 offset (from `zip_offset`) is scaled by `bin_count`
    and floored. An offset of 1 (the last element) is placed in the final
    bin, `bin_count - 1`, rather than one past it.
    """
    final_bin = bin_count - 1
    for item, offset in zip_offset(seq):
        if offset < 1:
            bin_index = math.floor(offset * bin_count)
        else:
            bin_index = final_bin
        yield item, bin_index

In [5]:
class Document:
    """A text identified by `id` and stored as a collection of sentences.

    `sents` is kept as given; `tokens()` iterates each sentence and calls
    `.lower()` on every item, so each sentence is expected to be an iterable
    of strings.
    """

    def __init__(self, id, sents):
        self.id, self.sents = id, sents

    def __repr__(self):
        label = type(self).__name__
        return '%s<%d sentences>' % (label, len(self.sents))

    def tokens(self):
        """Return all tokens of the document, lowercased, in reading order."""
        lowered = []
        for sent in self.sents:
            lowered.extend(tok.lower() for tok in sent)
        return lowered

In [6]:
def read_movie_summaries(path):
    """Lazily yield one `Document` per line of a line-delimited JSON file.

    Each non-blank line is parsed as a JSON object; its 'id' value becomes
    the document id and its 'tokens' value is passed as the document's
    sentences (presumably a list of token lists - confirm against the data).

    Progress is reported through `tqdm`. A `KeyError` is raised if a record
    lacks 'id' or 'tokens'.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding='utf-8') as fh:
        for line in tqdm(fh):
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make the JSON parser raise.
            if not line.strip():
                continue
            data = ujson.loads(line)
            yield Document(data['id'], data['tokens'])

In [7]:
class Corpus:
    """An in-memory collection of documents.

    Each document must expose a `tokens()` method returning an iterable of
    hashable tokens.
    """

    @classmethod
    def from_movie_summaries(cls, path):
        """Build a corpus from every document read by `read_movie_summaries`."""
        docs = list(read_movie_summaries(path))
        return cls(docs)

    def __init__(self, docs):
        self.docs = docs

    def token_counts(self):
        """Return a `Counter` mapping each token to its corpus-wide frequency."""
        tally = Counter()
        for doc in self.docs:
            tally.update(doc.tokens())
        return tally

In [8]:
# Read every document from the line-delimited JSON file into memory as a Corpus.
c = Corpus.from_movie_summaries('../../data/movies.json')

42304it [00:03, 11667.12it/s]


In [9]:
# Corpus-wide frequency of every lowercased token (a collections.Counter).
counts = c.token_counts()

In [10]:
# Quick look at the ten most frequent tokens and their counts.
counts.most_common(10)

[('the', 824480),
 ('to', 481288),
 ('and', 456748),
 ('a', 376026),
 ('of', 261445),
 ('is', 225446),
 ('in', 220059),
 ('his', 196672),
 ('he', 180322),
 ('her', 151180)]

In [42]:
# Vocabulary: the 5000 most frequent tokens, most frequent first.
vocab = [token for token, _freq in counts.most_common(5000)]

# Reverse lookup from token to its row index in `vocab`.
vtoi = dict(zip(vocab, range(len(vocab))))

In [91]:
# Count matrix with one row per vocabulary token and one column per bin.
# The column count (10) is the number of bins; other cells read it back
# through `bin_counts.shape[1]` rather than repeating the literal.
bin_counts = np.zeros((len(vocab), 10))

In [92]:
# Tally, for each in-vocabulary token, how often it occurs in each
# positional bin of the documents it appears in. This adds to whatever is
# already stored in `bin_counts`.
for doc in tqdm(c.docs):
    n_bins = bin_counts.shape[1]
    for token, bi in zip_bin(doc.tokens(), n_bins):
        vi = vtoi.get(token)
        if vi is None:
            # Out-of-vocabulary token: not tracked.
            continue
        bin_counts[vi, bi] += 1

100%|██████████| 42304/42304 [00:19<00:00, 2115.56it/s]


In [93]:
# Score every (token, bin) pair with a log-likelihood ratio (G) test that
# compares the token's observed count in the bin against the count expected
# if its occurrences were spread evenly over all bins.
#
# The per-bin and per-token totals do not change inside the loops, so they
# are computed once up front instead of once per (token, bin) pair.
n_bins = bin_counts.shape[1]
bin_totals = bin_counts.sum(axis=0)    # total count in each bin (all tokens)
token_totals = bin_counts.sum(axis=1)  # total count of each token (all bins)

rows = []
for vi, token in enumerate(tqdm(vocab)):

    # Expected count per bin under an even spread of this token.
    c_exp = token_totals[vi] / n_bins

    for bi in range(n_bins):

        b_total = bin_totals[bi]
        c_obs = bin_counts[vi, bi]

        # Two-category test: [this token, everything else in the bin].
        # Observed and expected both sum to `b_total`.
        s, _ = stats.power_divergence(
            [c_obs, b_total - c_obs],
            [c_exp, b_total - c_exp],
            lambda_='log-likelihood',
        )

        # `delta` > 0 means the token is over-represented in this bin.
        rows.append((token, bi, c_obs - c_exp, s))

100%|██████████| 5000/5000 [00:06<00:00, 750.00it/s]


In [94]:
# One row per (token, bin) pair: `delta` is the observed minus expected
# count and `score` is the test statistic computed for that pair.
df = pd.DataFrame(rows, columns=('token', 'bin', 'delta', 'score'))

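# 5 bins

Note: the listing below comes from an earlier run in which `bin_counts` was built with 5 columns (this cell's execution count is lower than that of the cells above, which now allocate 10). On a fresh top-to-bottom run this cell prints 10 rows unless the bin count is set to 5 first.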

In [90]:
# For each bin, print the 20 tokens that are most strongly over-represented
# there (observed > expected), ranked by descending test score.
#
# The bins are taken from `df` itself rather than from `bin_counts`, so the
# listing always matches the frame that was actually scored, even if
# `bin_counts` has since been rebuilt with a different number of bins.
overrepresented = df[df.delta > 0]
for bi in sorted(df.bin.unique()):
    bdf = overrepresented[overrepresented.bin == bi].sort_values('score', ascending=False)
    print(' '.join(bdf.head(20).token))

a film young story an is in who of named plot opens lives small school years revolves war living college
meets a who named local meet friend soon becomes next day initially night befriends job agrees her daughter attracted training
to that meanwhile they but 's she him her when becomes he help soon them find discovers next tries so
to that him but and kill she 's tries out realizes he then the confronts back them escape finally reveals
ends the end and final finally cite film credits web scene last movie back all shoots ending then reunited as


# 10 bins

In [95]:
# For each bin, print the 20 tokens that are most strongly over-represented
# there (observed > expected), ranked by descending test score.
#
# The bins are taken from `df` itself rather than from `bin_counts`, so the
# listing always matches the frame that was actually scored, even if
# `bin_counts` has since been rebuilt with a different number of bins.
overrepresented = df[df.delta > 0]
for bi in sorted(df.bin.unique()):
    bdf = overrepresented[overrepresented.bin == bi].sort_values('score', ascending=False)
    print(' '.join(bdf.head(20).token))

a film story young is in plot an opens of set revolves lives named who small movie war living american
a who named his an friend young years local meets school of daughter job called day work college family woman
a meets named who local job daughter an friend initially school meet his day first some hires befriends next strange
meets becomes soon to they her she training night next local meet meanwhile day when investigate friend agrees impressed pay
to that meanwhile they 's soon when becomes next she her find but later help night discover he increasingly steal
to that but she him meanwhile 's they her when he them finds tries becomes help so discovers out room
to that 's but him she them her and tries he plan out kill finds not then confronts when however
to him and that but kill the finally realizes fight out back she gun tries escape then reveals truth he
the finally and him kill then shoots final back end but gun save fight to kills realizes last fire manages
ends cite film end we