# Language Models

Let's start with a simple, Laplace-smoothed trigram model:

In [82]:
from collections import defaultdict
import numpy as np
import nltk

# Pseudo-count that every (history, word) entry starts from the first time it
# is touched; the inner defaultdict below hands it out as the default value.
smoothing = 0.001
# counts[(u, v)][w] -> smoothed number of times w followed the bigram (u, v).
counts = defaultdict(lambda: defaultdict(lambda: smoothing))

# Each line of the file is treated as one sentence and split on whitespace.
# The context manager closes the file handle as soon as the corpus is read.
with open('../data/moby_dick.txt') as corpus_file:
    corpus = [line.strip().split() for line in corpus_file]

for sentence in corpus:
    # Two start symbols give the first real word a full two-token history;
    # 'STOP' marks the end of the sentence.
    tokens = ['*', '*'] + sentence + ['STOP']
    for u, v, w in nltk.ngrams(tokens, 3):
        counts[(u, v)][w] += 1

def logP(u, v, w):
    """Return the natural-log probability of ``w`` following the bigram ``(u, v)``.

    The estimate is the smoothed count of ``(u, v, w)`` divided by the sum of
    the smoothed counts of everything recorded after ``(u, v)`` in the
    module-level ``counts`` table. A word never recorded after ``(u, v)`` is
    given the pseudo-count ``smoothing``, which is also added to the
    denominator.

    The lookup is read-only. Indexing the nested ``defaultdict`` directly
    would insert every queried history and word into ``counts``, which changes
    later denominators and makes merely-scored words available to sampling.

    Args:
        u: Token two positions before ``w``.
        v: Token directly before ``w``.
        w: Token being scored.

    Returns:
        The log-probability as a float. For a history with no recorded
        continuations the result is 0.0, because the only mass in the
        denominator is the pseudo-count of ``w`` itself.
    """
    # .get() does not trigger the defaultdict factory, so nothing is inserted.
    followers = counts.get((u, v), {})
    seen = w in followers
    numerator = followers[w] if seen else smoothing
    denominator = sum(followers.values())
    if not seen:
        # Account for the pseudo-count of the unseen word in the total mass.
        denominator += smoothing
    return np.log(numerator) - np.log(denominator)

def sentence_logP(S):
    """Return the log-probability of the tokenised sentence ``S`` under the trigram model.

    Args:
        S: List of tokens (it is concatenated with the padding lists).

    Returns:
        The sum of ``logP`` over every trigram of the padded sentence.
    """
    # Pad with two start symbols and the end marker, as done when counting.
    padded = ['*', '*'] + S + ['STOP']
    total = 0
    for first, second, third in nltk.ngrams(padded, 3):
        total += logP(first, second, third)
    return total

We can now score arbitrary sentences:

In [85]:
# Score a whitespace-tokenised sentence; the result is a natural-log probability.
sentence_logP('Captain Ahab is a white whale .'.split())

-29.31730693419735

## Generation

We can re-use the counts to generate language:

In [29]:
def sample_next_word(u, v):
    """Sample a word to follow the bigram ``(u, v)``, weighted by its smoothed count.

    Args:
        u: Token two positions before the word to sample.
        v: Token directly before the word to sample.

    Returns:
        One of the words recorded after ``(u, v)`` in the module-level
        ``counts`` table, drawn with probability proportional to its count.

    Raises:
        ValueError: If no continuation is recorded for ``(u, v)``, so there
            is nothing to sample from.
    """
    # Read-only lookup: .get() avoids inserting an empty entry for an unseen
    # history into the shared defaultdict.
    followers = counts.get((u, v))
    if not followers:
        raise ValueError('no continuations recorded for history {!r}'.format((u, v)))

    keys = tuple(followers)
    # Force a float array so normalisation also works for integer counts.
    weights = np.array([followers[key] for key in keys], dtype=float)
    probabilities = weights / weights.sum()
    # A single multinomial trial returns a one-hot vector; argmax is its index.
    sample = np.random.multinomial(1, probabilities)
    return keys[np.argmax(sample)]

def generate():
    """Sample one sentence from the trigram model and return it as a string.

    Starting from the two start symbols, words are drawn one at a time with
    ``sample_next_word`` (conditioned on the last two tokens) until 'STOP' is
    drawn.

    Returns:
        The sampled words joined by single spaces, without the start symbols
        and without the final 'STOP'.
    """
    history = ['*', '*']
    # The start symbol never equals 'STOP', so at least one word is drawn.
    while history[-1] != 'STOP':
        history.append(sample_next_word(history[-2], history[-1]))

    return ' '.join(history[2:-1])

In [92]:
# Toy data for illustrating the zip / unzip idiom used in sample_next_word.
l1 = ['a', 'b', 'c']
l2 = list(range(1, 7))

# zip() stops at the shorter input, so only the first three numbers are paired.
l3 = {letter: number for letter, number in zip(l1, l2)}
l3

{'a': 1, 'b': 2, 'c': 3}

In [96]:
# zip(*pairs) "unzips" the (key, value) pairs: `a` receives the tuple of keys
# and `b` the tuple of values.
a, b = zip(*l3.items())
a

('a', 'b', 'c')

In [122]:
# Step-by-step walk-through of the sampling logic in sample_next_word, run on
# the toy dictionary l3.
keys, values = zip(*l3.items())
values = np.array(values)
print(values)
# Normalise the counts so they sum to 1 and can serve as probabilities.
values = values / values.sum()
print(values)
# One multinomial trial yields a one-hot vector marking the drawn position.
sample = np.random.multinomial(1, values)
print(sample)
# argmax recovers the index of the single 1 in that vector.
selection = np.argmax(sample)
print(selection)
keys[selection]

[1 2 3]
[0.16666667 0.33333333 0.5       ]
[0 0 1]
2


'c'

In [133]:
# Draw one continuation for the history ('as', 'a') and display the smoothed
# counts it was drawn from.
sample_next_word('as', 'a'), counts[('as', 'a')]

('sailor',
 defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
             {'passenger': 4.0009999999999994,
              'general': 6.0009999999999994,
              'Commodore': 1.001,
              'simple': 1.001,
              'country': 1.001,
              'sailor': 3.001,
              'merchant': 1.001,
              'sort': 3.001,
              'looker': 1.001,
              'clam': 1.001,
              'wash': 1.001,
              'particular': 1.001,
              'pike': 1.001,
              'pilot': 3.001,
              'model': 1.001,
              'single': 2.001,
              'young': 1.001,
              'slave': 1.001,
              'substitute': 1.001,
              'rather': 2.001,
              'candidate': 1.001,
              'harpooneer': 1.001,
              'head': 2.001,
              'journeyman': 1.001,
              'whistling': 1.001,
              'picked': 1.001,
              'giraffe': 1.001,
              'set': 1.001,
              's

We can now generate nonsensical sentences:

In [81]:
# Sample one sentence from the model. No random seed is set in the cells
# shown here, so the output will presumably differ from run to run.
generate()

'For when three days he was so afraid of black eyes that he is never hunted .'

## Exercise

Extend the code above to arbitrary $n$-gram sizes. Use another corpus to try it with $n=4$.

It might be helpful to use a `class` for the LM, make the smoothing a parameter, `counts` a class property, and add a function `fit()`.

In [138]:
# Your code here


# Order of the model: each word is conditioned on the N-1 preceding tokens.
N = 4
START = '*'
STOP = 'STOP'
# Rebinds `counts`; keys are now tuples holding the N-1 history tokens.
counts = defaultdict(lambda: defaultdict(lambda: smoothing))

for sentence in corpus:
    # N-1 start symbols give the first real word a full-length history.
    tokens = [START] * (N - 1) + sentence + [STOP]
    for gram in nltk.ngrams(tokens, N):
        history, word = gram[:-1], gram[-1]
        counts[history][word] += 1

def logP(history, w):
    """Return the natural-log probability of ``w`` following ``history``.

    The estimate is the smoothed count of ``w`` after ``history`` divided by
    the sum of the smoothed counts of everything recorded after ``history``
    in the module-level ``counts`` table. A word never recorded after
    ``history`` is given the pseudo-count ``smoothing``, which is also added
    to the denominator.

    The lookup is read-only. Indexing the nested ``defaultdict`` directly
    would insert every queried history and word into ``counts``, which changes
    later denominators.

    Args:
        history: Tuple of the preceding tokens, used as the key into ``counts``.
        w: Token being scored.

    Returns:
        The log-probability as a float. For a history with no recorded
        continuations the result is 0.0, because the only mass in the
        denominator is the pseudo-count of ``w`` itself.
    """
    # .get() does not trigger the defaultdict factory, so nothing is inserted.
    followers = counts.get(history, {})
    seen = w in followers
    numerator = followers[w] if seen else smoothing
    denominator = sum(followers.values())
    if not seen:
        # Account for the pseudo-count of the unseen word in the total mass.
        denominator += smoothing
    return np.log(numerator) - np.log(denominator)

def sentence_logP(S):
    """Return the log-probability of the tokenised sentence ``S`` under the N-gram model.

    Args:
        S: List of tokens (it is concatenated with the padding lists).

    Returns:
        The sum of ``logP`` over every N-gram of the padded sentence.
    """
    # Pad with N-1 start symbols and the end marker, as done when counting.
    padded = [START] * (N - 1) + S + [STOP]
    total = 0
    for gram in nltk.ngrams(padded, N):
        total += logP(gram[:-1], gram[-1])
    return total


In [142]:
# Score the same sentence with the N-gram model defined in the previous cell.
# NOTE(review): logP evaluates to 0 for a history with no recorded counts,
# which may inflate this score relative to the trigram one — confirm.
sentence_logP('Captain Ahab is a white whale .'.split())

-23.822143780660614