# A vs. the - next-word log-likelihood ratios

- Count up which words come after "a" and "the" in the first and last 5% of novels. This gives 4 sets of counts - `a_beginning`, `a_end`, `the_beginning`, `the_end`.
- Then, using Dunning's log-likelihood ratio, we can find the words that follow a/the distinctively in one of these contexts. E.g., words that distinctively follow "a" in the first 5%, relative to "the."

In [57]:
import attr
import os
import ujson
import bz2
import random

import pandas as pd

from glob import glob
from tqdm import tqdm
from multiprocessing import Pool
from itertools import islice
from functools import partial
from boltons.iterutils import pairwise
from collections import Counter
from scipy import stats

from IPython.display import display, Markdown

In [2]:
def zip_offset(seq):
    """Pair every item of a sized sequence with its relative position.

    The position runs from 0 for the first item to 1 for the last one.
    A single-item sequence yields offset 0. `seq` must support `len()`.
    """
    last_index = len(seq) - 1
    for position, element in enumerate(seq):
        # A one-item sequence has last_index == 0; avoid dividing by zero.
        if last_index:
            yield element, position / last_index
        else:
            yield element, 0

In [3]:
def map_segment(func, path):
    """Run `func` on every JSON record of a bz2-compressed segment file.

    Each line of the file at `path` is decoded with `ujson.loads` and
    passed to `func`; the return values are collected, in file order,
    into a list.
    """
    with bz2.open(path) as segment:
        return [func(ujson.loads(raw_line)) for raw_line in segment]

In [4]:
@attr.s
class Corpus:
    """A directory of `*.bz2` segment files that can be mapped over in parallel."""

    # Directory that directly contains the segment files.
    root = attr.ib()

    def paths(self):
        """Return the paths of all `*.bz2` files directly under `root`."""
        pattern = os.path.join(self.root, '*.bz2')
        return glob(pattern)

    def map_novels(self, func, shuffle=True):
        """Lazily yield `func` applied to every record of every segment.

        Segment files are handed to a process pool, each one processed by
        `map_segment`. Results are yielded one record at a time; because
        `imap_unordered` is used, segments arrive in completion order.
        When `shuffle` is true the segment paths are shuffled first.
        """
        segment_paths = self.paths()

        if shuffle:
            random.shuffle(segment_paths)

        process_segment = partial(map_segment, func)

        with Pool() as pool:
            segment_batches = pool.imap_unordered(process_segment, segment_paths)
            for batch in segment_batches:
                yield from batch

In [5]:
# Shared corpus handle used by `suffixes`; the root path is relative to the
# working directory and is searched for `*.bz2` segment files.
corpus = Corpus('../../data/chicago-bins-tokens.json/')

In [17]:
def suffixes_worker(q, o1, o2, n):
    """Collect the tokens that immediately follow `q` inside an offset window.

    `n` is a record with a 'tokens' sequence. Adjacent token pairs are
    positioned on a 0-1 scale via `zip_offset`; for each pair whose
    offset lies strictly between `o1` and `o2` and whose first token,
    lowercased, equals `q`, the lowercased second token is returned.

    Both bounds are exclusive, so pairs at offset exactly 0 or exactly 1
    are never matched. `q` is compared against lowercased text, so it
    should itself be lowercase.
    """
    bigrams = pairwise(n['tokens'])
    return [
        following.lower()
        for (current, following), position in zip_offset(bigrams)
        if o1 < position < o2 and current.lower() == q
    ]

In [37]:
def suffixes(q, o1, o2, num_novels=None):
    """Count, across the corpus, the words that follow `q` in an offset window.

    Runs `suffixes_worker` over the module-level `corpus` and tallies
    every returned token into a `Counter`. `num_novels` caps how many
    worker results are consumed (None means all of them). Progress is
    reported through `tqdm`.
    """
    find_followers = partial(suffixes_worker, q, o1, o2)
    per_record = islice(corpus.map_novels(find_followers), num_novels)

    counts = Counter()
    for followers in tqdm(per_record):
        counts.update(followers)
    return counts

In [38]:
# Words following "a" at relative offsets strictly between 0 and 0.05.
a0_ = suffixes('a', 0, 0.05)

6638it [03:50, 28.78it/s]


In [40]:
# Words following "a" at relative offsets strictly between 0.95 and 1.
a1_ = suffixes('a', 0.95, 1)

6638it [03:46, 29.26it/s]


In [41]:
# Words following "the" at relative offsets strictly between 0 and 0.05.
the0_ = suffixes('the', 0, 0.05)

6638it [03:50, 28.85it/s]


In [42]:
# Words following "the" at relative offsets strictly between 0.95 and 1.
the1_ = suffixes('the', 0.95, 1)

6638it [03:44, 29.59it/s]


In [65]:
def mdw(fg, bg, min_count=100, n=50):
    """Score tokens that are over-represented in `fg` relative to `bg`.

    `fg` and `bg` map tokens to counts. Only tokens whose count is
    strictly greater than `min_count` in BOTH mappings are considered.
    For each such token the expected fg/bg counts are derived from the
    pooled rate over that shared vocabulary, and tokens observed in `fg`
    more often than expected are scored with the log-likelihood
    (G-test) statistic from `scipy.stats.power_divergence`.

    Returns a DataFrame with columns 'token' and 'dll', in no particular
    order. The `n` parameter is accepted but not used by this function;
    callers truncate the result themselves.
    """
    frequent_fg = {tok for tok, count in fg.items() if count > min_count}
    frequent_bg = {tok for tok, count in bg.items() if count > min_count}
    shared = frequent_fg & frequent_bg

    total_fg = sum(fg[tok] for tok in shared)
    total_bg = sum(bg[tok] for tok in shared)

    scored = []
    for tok in shared:
        observed_fg, observed_bg = fg[tok], bg[tok]

        # Pooled probability of the token across both samples.
        pooled = (observed_fg + observed_bg) / (total_fg + total_bg)
        expected_fg = total_fg * pooled
        expected_bg = total_bg * pooled

        # Keep only tokens that lean towards the foreground sample.
        if not observed_fg > expected_fg:
            continue

        statistic, _ = stats.power_divergence(
            [observed_fg, observed_bg],
            [expected_fg, expected_bg],
            lambda_='log-likelihood',
        )
        scored.append((tok, statistic))

    return pd.DataFrame(scored, columns=('token', 'dll'))

# a > the (beginning)

In [76]:
# Top 100 tokens by descending 'dll' score: over-represented after "a" vs. "the" (a0_ vs. the0_).
' '.join(mdw(a0_, the0_).sort_values('dll', ascending=False).head(100).token)

'few little lot good bit moment couple long while hundred small dozen minute single large week pair half nice thousand year month piece different quick smile very short fine hand pretty bad drink deep chance series man quarter fool look sudden slight brief cigarette strange step child great special full new woman strong finger rather private mistake beautiful pleasant wonderful joke person loud big real note sense hint wave copy cup thin - mere simple thing sharp low huge lovely faint terrible handsome number part better row tall fresh damn complete tight curious breath vague visit happy professional slow flash'

# the > a (beginning)

In [77]:
# Top 100 tokens by descending 'dll' score: over-represented after "the" vs. "a" (the0_ vs. a0_).
' '.join(mdw(the0_, a0_).sort_values('dll', ascending=False).head(100).token)

'first door last back world floor house rest front sun room two way water street most kitchen road one top people window sky morning wind table city middle right night river wall phone side left time car bed sea fact rain captain town king land police whole beach hell war gate building village doorway crowd desk three color sound smell station lord roof bar subject blood bedroom church hill thought far bridge light body head lake work country ship sight yard fire train company corner power park name store university future pain hospital deck corridor forest boy scene queen hotel'

# a > the (end)

In [78]:
# Top 100 tokens by descending 'dll' score: over-represented after "a" vs. "the" (a1_ vs. the1_).
' '.join(mdw(a1_, the1_).sort_values('dll', ascending=False).head(100).token)

'few little lot moment good long while couple minute hundred small single week deep friend large half pair hand chance piece man sudden smile drink step very great quick short month year fine fool pretty new bad look strange mistake brief woman cigarette slight finger lie quarter wonderful terrible low child part better special loud sense dream beautiful big huge strong private damn - thing copy full joke breath cup kind simple stranger hero tight wave flash person word hard faint sharp slow thin lovely fresh miracle fair shot note break number kiss happy visit curious mere perfect clear gesture'

# the > a (end)

In [79]:
# Top 100 tokens by descending 'dll' score: over-represented after "the" vs. "a" (the1_ vs. a1_).
' '.join(mdw(the1_, a1_).sort_values('dll', ascending=False).head(100).token)

'door first room house back last two rest world one front water road way sky people kitchen night top street window morning wind most police bed middle side table right city wall car river phone rain whole sea crowd pain doorway fact three hospital beach fire blood moon church hell head king hill power bridge building land desk light future time body sound wrong screen dark town sight law country gun lake judge bar couch village thought far ship garden hotel work yard queen truck war four forest smell boat radio face boy scene name blade universe bank stage company'

# Notes

- "A" is clearly used in the context of description - many of the distinctive a>the words are adjectives, whereas the the>a lists consist mostly of nouns.
- Of the a>the adjectives, it is interesting that many of the most distinctive ones are quantifiers, markers of _degree_ - few, little, lot, bit, hundred, single, small, large.
- "A" also associated with time, questions of when / how long - while, minute, week, year.
- "The," meanwhile, is clearly marking what might be thought of as "physical rendering" - descriptions of physical settings and spatial relationships. "Sides" of things - front, back, middle, side; and literal objects and locations - door, floor, house, room, street, kitchen, road, etc.
- So basically, my gloss - "a" is description (and temporality?), "the" is physicality. At the beginning, both are common - things are getting introduced for the first time, the physical setting is getting established. Whereas, at the end, there's a return to the physical (away from the psychological / dialogic middle?), with "the" going back up; but, less need for "a," since the world has already been described.

---
Less useful - "a" (beginning) compared to "a" (end) and vice versa, and likewise "the" (beginning) vs. "the" (end). This basically just reproduces the overall frequency differences, though. (Murder at the end, etc.)

In [80]:
# Top 100 tokens by descending 'dll' score: after "a", over-represented in a0_ relative to a1_.
' '.join(mdw(a0_, a1_).sort_values('dll', ascending=False).head(100).token)

'year tall young large boy girl small well pair half handsome month student “ week two high narrow thin big town face lady white three city pleasant brown broad widow job name fat house four - slender six black decade blue thick private school beauty five habit natural day row gold summer square dollar first female wide successful country dark husband dozen constant century street teacher fine block pale local certain particularly slim genius frown model detective twenty slight famous delicate friendly slightly kid variety particular bar social shade middle cow quarter full vague land writer rather living rare desk'

In [81]:
# Top 100 tokens by descending 'dll' score: after "a", over-represented in a1_ relative to a0_.
' '.join(mdw(a1_, a0_).sort_values('dll', ascending=False).head(100).token)

'moment while long gun murderer minute chance step shot lie bullet terrible deal way hero last deep message lot hand letter little witness roar trap bitch voice dream final weapon sudden time nurse second kiss pistol fool part plan sound cry killer chair whisper scream great trial promise noise horrible choice new rifle fist traitor hug monster fake wonderful decision tear breath blanket flash shotgun guard silence will criminal tremendous word suicide life liar thing move sitting mistake grave split question death desperate few deadly crazy trick blessing prayer plane shadow court miracle saint brave search difference threat light fucking'

In [82]:
# Top 100 tokens by descending 'dll' score: after "the", over-represented in the0_ relative to the1_.
' '.join(mdw(the0_, the1_).sort_values('dll', ascending=False).head(100).token)

'town girl young age family boy year city summer university school stranger youngest older man local most color usual village “ shop younger oldest neighborhood name street big west early woman store girls wide driver high north english college teacher owner victim sort size tall more counter class bar - small thick kind largest weather students narrow irish war gentleman land large rich middle occasional subject county late broad job thin winter few houses guy office daughter boys boss corners smell cook impression third bartender wagon royal male waitress south average fact old habit bus spring art mirror meat coach'

In [83]:
# Top 100 tokens by descending 'dll' score: after "the", over-represented in the1_ relative to the0_.
' '.join(mdw(the1_, the0_).sort_values('dll', ascending=False).head(100).token)

'gun truth trigger pistol police knife door murder hospital rifle bullet jury murderer pain helicopter floor sword ground stairs revolver shotgun bed flames tears dragon cave end barrel others fire shot room blade baby darkness judge story grave tunnel tape weapon one night strength sheriff money muzzle future coffin cemetery evidence wound car battle final nurse light courtroom sky cabin key rope poison will blow couch bomb hammer altar guards airport trial words flood enemy force earth wedding power steps void attack circle universe whole charges explosion way bodies chapel ceremony funeral truck president soldiers fight rest inspector lord sound'