In [9]:
from hwr.lm.generate_lm import update_counter
from hwr.lm.lm import KneserNeyBackoff, KneserNeyInterpolated, MLE, StupidBackoff
from nltk.lm import NgramCounter, Vocabulary
from nltk.util import everygrams
from hwr.constants import DATA

In [10]:
# Build an n-gram counter (orders 1 to 3) from the single word 'dog'.
# The language models created further down are all constructed on this counter.
test_text = 'dog'
# Iterating a string yields its characters, so these are character n-grams.
everygram = list(everygrams(test_text, max_len=3))
counter = NgramCounter()
# update() takes an iterable of "sentences", each a sequence of n-gram tuples,
# hence the extra list around the single sentence.
counter.update([everygram])
everygram

[('d',), ('o',), ('g',), ('d', 'o'), ('o', 'g'), ('d', 'o', 'g')]

In [11]:
# Scratch values: a sample string and the single character '1'
# (neither is referenced by the cells visible in this part of the notebook).
s, w = 'helloworld', '1'

In [12]:
# The four order-3 models under comparison, all built on the same counter.
# Each model receives its own Vocabulary instance over DATA.CHARS.
lms = [
    KneserNeyBackoff(3, backoff=0.5, counter=counter, vocabulary=Vocabulary(DATA.CHARS)),
    KneserNeyInterpolated(3, counter=counter, vocabulary=Vocabulary(DATA.CHARS)),
    MLE(3, counter=counter, vocabulary=Vocabulary(DATA.CHARS)),
    StupidBackoff(3, backoff=0.5, counter=counter, vocabulary=Vocabulary(DATA.CHARS)),
]

# Individual handles, in the same order as the list above.
knbo, kn, mle, bo = lms

In [14]:
# Unigram scores for the characters 'g', 'd', 'o' (present in 'dog') and 'k' (absent).
# The two Kneser-Ney variants are expected to agree at the unigram level.
for lm in lms:
    print(type(lm).__name__)
    print(*(lm.score(ch) for ch in 'gdok'))
    print()

KneserNeyBackoff
0.012048192771084338 0.012048192771084338 0.012048192771084338 0.012048192771084338

KneserNeyInterpolated
0.012048192771084338 0.012048192771084338 0.012048192771084338 0.012048192771084338

MLE
0.3333333333333333 0.3333333333333333 0.3333333333333333 0.0

StupidBackoff
0.3333333333333333 0.3333333333333333 0.3333333333333333 0.0



In [55]:
# Conditional scores for n-grams that do occur in the training word:
# P(g|do) from the trigram "dog" and P(g|o) from the bigram "og".
for lm in lms:
    p_g_do, p_g_o = lm.score('g', list('do')), lm.score('g', list('o'))
    print(type(lm).__name__)
    print(p_g_do, p_g_o)
    print()

KneserNeyBackoff
0.9901204819277108 0.9012048192771085

KneserNeyInterpolated
0.9901204819277108 0.9012048192771085

MLE
1.0 1.0

StupidBackoff
1.0 1.0



In [56]:
# Score 'g' after contexts that never occur in the training word 'dog'.
# With stupid backoff (factor 0.5):
#   "fog" has no occurrence but "og" does, so P(g|fo) = 0.5 * P(g|o);
#   neither "bag" nor "ag" occurs, so P(g|ba) = 0.5^2 * P(g).
for lm in lms:
    p_g_fo = lm.score('g', ['f', 'o'])
    p_g_ba = lm.score('g', ['b', 'a'])
    print(type(lm).__name__)
    print(p_g_fo, p_g_ba)
    print()

KneserNeyBackoff
0.45060240963855425 0.0030120481927710845

KneserNeyInterpolated
0 0

MLE
0 0

StupidBackoff
0.5 0.08333333333333333



In [92]:
# Start from a fresh counter and fill it from the example corpus via
# update_counter (called with order 9), then rebuild the four models as
# order-9 models on top of it. This rebinds counter, knbo, kn, mle, bo and lms.
counter = NgramCounter()
update_counter(counter, 9, '../data/1blm/corpus_example.txt')

lms = [
    KneserNeyBackoff(9, backoff=0.5, counter=counter, vocabulary=Vocabulary(DATA.CHARS)),
    KneserNeyInterpolated(9, counter=counter, vocabulary=Vocabulary(DATA.CHARS)),
    MLE(9, counter=counter, vocabulary=Vocabulary(DATA.CHARS)),
    StupidBackoff(9, backoff=0.5, counter=counter, vocabulary=Vocabulary(DATA.CHARS)),
]
knbo, kn, mle, bo = lms

 33%|███▎      | 3/9 [00:00<00:00, 19.50it/s]

Updating counter with file:
../data/1blm/corpus_example.txt
Updating ngrams:


100%|██████████| 9/9 [00:00<00:00,  7.26it/s]


In [94]:
# P(e|th), P(k|th) and P(e|xh), printed in that order for each model.
# Contexts are passed as lists of characters.
for lm in lms:
    print(type(lm).__name__)
    print(lm.score('e', list('th')), lm.score('k', list('th')), lm.score('e', list('xh')))
    print()

KneserNeyBackoff
0.7029019364108138 0.0003268392719815432 0.23897932154148507

KneserNeyInterpolated
0.7029019364108138 0 0

MLE
0.7022696929238985 0.0 0.0

StupidBackoff
0.7022696929238985 0.0003546099290780142 0.23900709219858157

