# Language Models

In [272]:
import numpy as np

In [273]:
X = """
<s> a a b b c c </s>
<s> a c b c </s>
<s> b c c a b </s>
"""
print(X)


<s> a a b b c c </s>
<s> a c b c </s>
<s> b c c a b </s>



In [274]:
X.split("\n")

['', '<s> a a b b c c </s>', '<s> a c b c </s>', '<s> b c c a b </s>', '']

In [275]:
def parse_data(data):
    data_as_list = []
    for r in data.split("\n"):
        if len(r)!=0:
            
            data_as_list.append(r.split(" "))
            
    return data_as_list

In [276]:
dataset = parse_data(X)

In [277]:
dataset[0]

['<s>', 'a', 'a', 'b', 'b', 'c', 'c', '</s>']

In [278]:
def get_vocabulary(dataset):
    vocab = set()
    for r in dataset:
        for x in r:
            vocab.add(x)
    return vocab

In [279]:
vocabulary = get_vocabulary(dataset)

In [280]:
START = "<s>"
STOP  = "</s>"
vocabulary = vocabulary - set([START, STOP])

## Unigrams

In [281]:
def compute_probs_unigrams(dataset, vocab):
    w_counts = {w:0 for w in vocab}
    for r in dataset:
        for w in r:
            if w in w_counts:
                w_counts[w] +=1
    return w_counts

In [282]:
w_counts = compute_probs_unigrams(dataset, vocabulary)
w_counts

{'b': 5, 'a': 4, 'c': 6}

In [283]:
def compute_probs_unigrams(w_counts):
    n_words = np.sum(list(w_counts.values())) 
    w_probs = {w:0 for w in w_counts}
    for w,c in w_counts.items():
        w_probs[w] = c/n_words
    return w_probs

In [284]:
w_probs = compute_probs_unigrams(w_counts)

In [285]:
# a: 0.26666666666666666 b: 0.3333333333333333 c: 0.4 UNK: 0.0 
w_probs

{'b': 0.3333333333333333, 'a': 0.26666666666666666, 'c': 0.4}

## Corpus creation with `<s>` and `</s>`


Create corpus as a list of sentences

In [286]:
list_of_sequences = X.split("\n")[1:]
list_of_sequences = [l for l in list_of_sequences if len(l)!=0]

In [287]:
list_of_sequences

['<s> a a b b c c </s>', '<s> a c b c </s>', '<s> b c c a b </s>']

## Make a unigram model

Fill in the functions to have a working UniGrams class

In [288]:

from __future__ import division
from collections import defaultdict as ddict
import itertools
import math
import random

class UniGrams(object):
    
    def __init__(self):
        self.n_tokens = 0
        self._counts = ddict(lambda: 0)

    def partial_update(self, words):
        # FILL IN THIS
    
    def word_probability(self, word):
        if isinstance(word, str):
            return self._counts.get((word, ))/self.n_tokens
        if isinstance(word, tuple):
            return self._counts.get(word)/self.n_tokens
    
    def compute_probs(self):
        # FILL IN THIS
        return w_probs
    
    def update_counts_given_corpus(self, list_of_sequences, tokenizer=lambda x: x.split(" ")):
        # FILL IN THIS



We can iterate over sequences

In [289]:
unigram_model = UniGrams()

for seq in list_of_sequences:
    unigram_model.partial_update(seq.split(" "))

In [290]:
unigram_model._counts

defaultdict(<function __main__.UniGrams.__init__.<locals>.<lambda>()>,
            {('<s>',): 3, ('a',): 4, ('b',): 5, ('c',): 6, ('</s>',): 3})

In [291]:
unigram_model.word_probability("<s>")

0.14285714285714285

In [292]:
type("<s>")

str

In [293]:
unigram_model.word_probability(("<s>",))

0.14285714285714285

In [294]:
w_probs = unigram_model.compute_probs()
assert np.sum(list(w_probs.values())), "This should add up to one"

In [295]:
len(("<s>",)), len("<s>"), len(("<s>"))

(1, 3, 3)

## Unigram: add method to compute the probability of a sequence

In [297]:

from __future__ import division
from collections import defaultdict as ddict
import itertools
import math
import random

class UniGrams(object):
    
    def __init__(self):
        self.n_tokens = 0
        self._counts = ddict(lambda: 0)

    def partial_update(self, words):
        # FILL IN THIS
    
    def word_probability(self, word):
        if isinstance(word, str):
            return self._counts.get((word, ))/self.n_tokens
        if isinstance(word, tuple):
            return self._counts.get(word)/self.n_tokens
    
    def compute_probs(self):
        # FILL IN THIS
        return w_probs
    
    def update_counts_given_corpus(self, list_of_sequences, tokenizer=lambda x: x.split(" ")):
        # FILL IN THIS



    def sequence_probability(self, word_sequence):
        assert isinstance(word_sequence, list),\
                "type(sequence)={}, it should be a list".format(type(word_sequence))
       # FILL IN THIS

    def _probability(self, unigram):
        # FILL IN THIS
        #unigram_count =
        #prefix_count = 
               
        


In [298]:
unigram_model = UniGrams()

for seq in list_of_sequences:
    unigram_model.partial_update(seq.split(" "))

In [299]:
unigram_model.sequence_probability(["<s>","a","</s>"])

0.0038872691933916417

## NGram class creation 

Make an update function that for a given sequence of tokens computes the ngrams of the sequence and updates the counts in the internal `._counts` default dict

In [442]:

from __future__ import division
from collections import defaultdict as ddict
import itertools
import math
import random

class NGrams(object):
    def __init__(self, max_n, words=None):
        
        assert max_n>=1, "max_n={}, it should be >= 1".format(max_n)
        
        self._max_n   = max_n
        self._n_range = range(1, max_n + 1)
        self._counts  = ddict(lambda: 0)
        self.n_tokens = 0
        

    def partial_update(self, words):
        # FILL IN THIS




In [443]:
ngrams_builder = NGrams(max_n=1)
for seq in list_of_sequences:
    ngrams_builder.partial_update(seq.split(" "))

In [444]:
ngrams_builder._counts == ngrams_builder._counts

True

### Try bigrams
Let us inspect the bigram counts

In [445]:
print(X)

['<s> This is the malt </s>', '<s> That lay in the house that Jack build </s>']


In [446]:
bigram_model = NGrams(max_n=2)
for seq in list_of_sequences:
    bigram_model.partial_update(seq.split(" "))

In [462]:
bigram_model._counts

defaultdict(<function __main__.NGrams.__init__.<locals>.<lambda>()>,
            {('<s>',): 2,
             ('<s>', 'This'): 1,
             ('This',): 1,
             ('This', 'is'): 1,
             ('is',): 1,
             ('is', 'the'): 1,
             ('the',): 2,
             ('the', 'malt'): 1,
             ('malt',): 1,
             ('malt', '</s>'): 1,
             ('</s>',): 2,
             ('<s>', 'That'): 1,
             ('That',): 1,
             ('That', 'lay'): 1,
             ('lay',): 1,
             ('lay', 'in'): 1,
             ('in',): 1,
             ('in', 'the'): 1,
             ('the', 'house'): 1,
             ('house',): 1,
             ('house', 'that'): 1,
             ('that',): 1,
             ('that', 'Jack'): 1,
             ('Jack',): 1,
             ('Jack', 'build'): 1,
             ('build',): 1,
             ('build', '</s>'): 1})

## Update NGrams:  computing the probability of a sequence

how do you compute `p(('<s>', 'a')) = c('<s>', 'a')/('a')`


In [536]:

from __future__ import division
from collections import defaultdict as ddict
import itertools
import math
import random

class NGrams(object):
    def __init__(self, max_n, words=None):
        
        assert max_n>=1, "max_n={}, it should be >= 1".format(max_n)
        
        self._max_n   = max_n
        self._n_range = range(1, max_n + 1)
        self._counts  = ddict(lambda: 0)
        self.n_tokens = 0
        


    def partial_update(self, words):
        # FILL IN THIS


    
    def ngram_probability(self, ngram):
        assert isinstance(ngram, tuple),\
            "type(ngram)={}, it should be a tuple of strings".format(typle(ngram))

        # FILL IN THIS



    def sequence_probability(self, word_sequence):
        assert isinstance(word_sequence, list),\
                "type(sequence)={}, it should be a list".format(type(word_sequence))
            
        # FILL IN THIS



In [537]:
bigram_model = NGrams(max_n=2)
for seq in list_of_sequences:
    bigram_model.partial_update(seq.split(" "))

In [538]:
bigram_model.ngram_probability(('<s>','a'))

0.6666666666666666

In [539]:
bigram_model.ngram_probability(('<s>','<s>'))

0.0

Another example

In [540]:
X2 = ["<s> This is the malt </s>", "<s> That lay in the house that Jack build </s>"]

In [541]:
bigram_model = NGrams(max_n=2)
for seq in X2:
    bigram_model.partial_update(seq.split(" "))

In [542]:
bigram_model._counts

defaultdict(<function __main__.NGrams.__init__.<locals>.<lambda>()>,
            {('<s>',): 2,
             ('<s>', 'This'): 1,
             ('This',): 1,
             ('This', 'is'): 1,
             ('is',): 1,
             ('is', 'the'): 1,
             ('the',): 2,
             ('the', 'malt'): 1,
             ('malt',): 1,
             ('malt', '</s>'): 1,
             ('</s>',): 2,
             ('<s>', 'That'): 1,
             ('That',): 1,
             ('That', 'lay'): 1,
             ('lay',): 1,
             ('lay', 'in'): 1,
             ('in',): 1,
             ('in', 'the'): 1,
             ('the', 'house'): 1,
             ('house',): 1,
             ('house', 'that'): 1,
             ('that',): 1,
             ('that', 'Jack'): 1,
             ('Jack',): 1,
             ('Jack', 'build'): 1,
             ('build',): 1,
             ('build', '</s>'): 1})

In [543]:
bigram_model.ngram_probability(('<s>', 'This')) 

0.5

In [544]:
bigram_model.ngram_probability(('This', 'is')) 

1.0

In [545]:
bigram_model.sequence_probability(['<s>', 'This', 'is', 'the']) 

0.5

In [546]:
bigram_model.sequence_probability(['<s>', 'This', 'is', 'the','house']) 

0.25

In [547]:
bigram_model.sequence_probability(['<s>', 'This', 'is', 'the','house','</s>']) 

0.0

Another example


In [557]:
X3 ="""This is the house that Jack built
This is the malt
That lay in the house that Jack built
This is the rat
That ate the malt
That lay in the house that Jack built
This is the cat
That killed the rat
That ate the malt
That lay in the house that Jack build
"""

In [558]:
ngrams_model3 = NGrams(max_n=2)
X3_sequences = X3.split("\n")
for seq in X3_sequences:
    if seq != "":
        ngrams_model3.partial_update(seq.split(" "))

In [568]:
ngrams_model3._counts.get(("the", "house")), ngrams_model3._counts.get(("the",))

(4, 10)

In [566]:
ngrams_model.ngram_probability(("the", "house"), smoothing=None)

0.5

## Update  NGrams:   Laplace smoothing


Problem: give too much probability mass to unseen n-grams.

  For sparse sets of data over large vocabularies, such as n-grams, Laplace's law actually gives far too much of the probability space to unseen events.
  
Let N be the number of words seen in the corpus (adding start and stop at every example and taking them into account as words).

For 1-grams
$$
p_{\mathrm{lap}} (w_i) = \frac{1 + c(w_i)}{ \text{vocab_size} + \text{n_tokens} }
$$
For 2-grams
$$
p_{\mathrm{lap}}  (w_i \vert w_{i-1}) = \frac{1 + c(w_{i-1}, w_i)}{ \text{vocab_size}  + c(w_{i-1})}
$$

For n-grams
$$
p_{\mathrm{lap}}  (w_n \vert w_{1}, \dots ,w_{n-1}) = \frac{1 + c(w_{1}, \dots ,w_{n})}{ \text{vocab_size} + c(w_{1}, \dots ,w_{n-1})}
$$




In [496]:

class NGrams(object):
    def __init__(self, max_n, words=None):
        
        assert max_n>=1, "max_n={}, it should be >= 1".format(max_n)
        
        self._max_n   = max_n
        self._n_range = range(1, max_n + 1)
        self._counts  = ddict(lambda: 0)
        self.n_tokens = 0
        
    def partial_update(self, words):
        # FILL IN THIS



    def sequence_probability(self, word_sequence, smoothing="None"):
        # FILL IN THIS



    def ngram_probability(self, ngram, smoothing):
        # FILL IN THIS 




In [497]:
ngrams_model = NGrams(max_n=2)
for seq in X2:
    if seq != "":
        ngrams_model.partial_update(seq.split(" "))
        
ngrams_model._counts

defaultdict(<function __main__.NGrams.__init__.<locals>.<lambda>()>,
            {('<s>',): 2,
             ('<s>', 'This'): 1,
             ('This',): 1,
             ('This', 'is'): 1,
             ('is',): 1,
             ('is', 'the'): 1,
             ('the',): 2,
             ('the', 'malt'): 1,
             ('malt',): 1,
             ('malt', '</s>'): 1,
             ('</s>',): 2,
             ('<s>', 'That'): 1,
             ('That',): 1,
             ('That', 'lay'): 1,
             ('lay',): 1,
             ('lay', 'in'): 1,
             ('in',): 1,
             ('in', 'the'): 1,
             ('the', 'house'): 1,
             ('house',): 1,
             ('house', 'that'): 1,
             ('that',): 1,
             ('that', 'Jack'): 1,
             ('Jack',): 1,
             ('Jack', 'build'): 1,
             ('build',): 1,
             ('build', '</s>'): 1})

#### Computing probabilities of sequences of tokens

In [502]:
ngrams_model.sequence_probability(['<s>', 'This', 'is', 'the','house','</s>'],
                                  smoothing="None")

0.0

In [503]:
ngrams_model.sequence_probability(['<s>', 'This', 'is', 'the','house','</s>'],
                                  smoothing="Laplace")

1.0051438235168473e-05

#### Pay attention with defaultdicts!!

If d is a `defaultdict` then `d[w]` it will append `w` as a new key if `w` not in `d`.

Don't do this

```
 [ngrams_builder._probability(x,smoothing="Laplace") for x in ngrams_builder._counts]
 ```
If you want a value, use `.get` 

```
 [ngrams_builder._probability(x,smoothing="Laplace") for x in ngrams_builder._counts]
 ```

In [504]:
ngrams_model._counts[122]

0

In [505]:
ngrams_model._counts

defaultdict(<function __main__.NGrams.__init__.<locals>.<lambda>()>,
            {('<s>',): 2,
             ('<s>', 'This'): 1,
             ('This',): 1,
             ('This', 'is'): 1,
             ('is',): 1,
             ('is', 'the'): 1,
             ('the',): 2,
             ('the', 'malt'): 1,
             ('malt',): 1,
             ('malt', '</s>'): 1,
             ('</s>',): 2,
             ('<s>', 'That'): 1,
             ('That',): 1,
             ('That', 'lay'): 1,
             ('lay',): 1,
             ('lay', 'in'): 1,
             ('in',): 1,
             ('in', 'the'): 1,
             ('the', 'house'): 1,
             ('house',): 1,
             ('house', 'that'): 1,
             ('that',): 1,
             ('that', 'Jack'): 1,
             ('Jack',): 1,
             ('Jack', 'build'): 1,
             ('build',): 1,
             ('build', '</s>'): 1,
             ('house', '</s>'): 0,
             122: 0})

In [506]:
ngrams_model._counts.get(123)

In [507]:
ngrams_model._counts

defaultdict(<function __main__.NGrams.__init__.<locals>.<lambda>()>,
            {('<s>',): 2,
             ('<s>', 'This'): 1,
             ('This',): 1,
             ('This', 'is'): 1,
             ('is',): 1,
             ('is', 'the'): 1,
             ('the',): 2,
             ('the', 'malt'): 1,
             ('malt',): 1,
             ('malt', '</s>'): 1,
             ('</s>',): 2,
             ('<s>', 'That'): 1,
             ('That',): 1,
             ('That', 'lay'): 1,
             ('lay',): 1,
             ('lay', 'in'): 1,
             ('in',): 1,
             ('in', 'the'): 1,
             ('the', 'house'): 1,
             ('house',): 1,
             ('house', 'that'): 1,
             ('that',): 1,
             ('that', 'Jack'): 1,
             ('Jack',): 1,
             ('Jack', 'build'): 1,
             ('build',): 1,
             ('build', '</s>'): 1,
             ('house', '</s>'): 0,
             122: 0})

## Update NGrams:  Lidstone smoothing


Add an option0 `smoothing="Lidstone" ` to `sequence_probability` and `ngram_probability`
so that probabilities can be smoothed as follows:

$$
p_{\mathrm{Lid}}(w_i \vert w_{i-1}) = \frac{  c(w_i, w_{i-1}) + \epsilon}{c(w_i, w_{i-1}) + \epsilon*\text{vocab_size} }
$$

## Update NGrams: Computing the probability of a sequence with UNK

Add the option to use unkown words and still get probabilities