In [1]:
from hwr.lm.generate_lm import update_counter
from hwr.lm.lm import KneserNeyBackoff
from nltk.lm import NgramCounter, Vocabulary
from nltk.util import everygrams
from hwr.constants import ON

A small demo of the character-level language model that is used when decoding the RNN output.

In [2]:
# Build a 3-gram Kneser-Ney smoothed "stupid backoff" model
# whose only training text is the word 'dog'.
test_text = 'dog'
# Iterating the string yields characters, so this is every character
# 1-, 2- and 3-gram of 'dog'.
everygram = list(everygrams(test_text, max_len=3))
# A single fresh counter (the earlier duplicate instantiation was a dead store).
counter = NgramCounter()
# update() takes an iterable of n-gram sequences; here there is exactly one.
counter.update([everygram])
lm = KneserNeyBackoff(3, backoff=0.5, counter=counter, vocabulary=Vocabulary(ON.DATA.CHARS))
# Show the n-grams that were counted.
everygram

[('d',), ('o',), ('g',), ('d', 'o'), ('o', 'g'), ('d', 'o', 'g')]

In [3]:
# Unigram scores for 'g', 'd', 'o' and 'k' (in that order).
# With KN smoothing they all come out the same, including 'k',
# which never appears in the training text.
tuple(lm.score(char) for char in 'gdok')

(0.012048192771084338,
 0.012048192771084338,
 0.012048192771084338,
 0.012048192771084338)

In [4]:
# P(g | d, o) and P(g | o): both contexts were seen in the training text.
p_g_do, p_g_o = (lm.score('g', list(context)) for context in ('do', 'o'))
p_g_do, p_g_o

(0.9901204819277108, 0.9012048192771085)

In [5]:
# "fog" never occurs, so stupid backoff drops one context character:
#   P(g|fo) = 0.5 * P(g|o)
# Neither "bag" nor "ag" occurs, so the model backs off twice:
#   P(g|ba) = 0.5^2 * P(g)
p_g_fo = lm.score('g', list('fo'))
p_g_ba = lm.score('g', list('ba'))
(p_g_fo, p_g_ba)

(0.45060240963855425, 0.0030120481927710845)

In [6]:
# Start from an empty counter and fill it with n-grams up to order 9
# taken from the example text file.
counter = NgramCounter()
# update_counter is called for its effect on `counter`; its return value is unused.
update_counter(counter, 9, '../../data/1blm/lm_example.txt')
# Rebuild the language model as a 9-gram model with a backoff factor of 0.4.
lm = KneserNeyBackoff(
    9,
    backoff=0.4,
    counter=counter,
    vocabulary=Vocabulary(ON.DATA.CHARS),
)

  0%|          | 0/9 [00:00<?, ?it/s]

Updating counter with file:
../../data/1blm/lm_example.txt
Updating ngrams:


100%|██████████| 9/9 [00:00<00:00,  7.00it/s]


In [7]:
# P(e | t, h) and P(k | t, h) under the 9-gram model.
tuple(lm.score(next_char, list('th')) for next_char in 'ek')

(0.7029019364108138, 0.00026147141758523456)