# Simple 3-gram Language Modeling

The sentence-start token is `<s>` and the sentence-end token is `</s>`. Because this is a 3-gram model, every sentence is padded with two `<s>` tokens at the beginning so that the first real token already has a full two-token context.

In [12]:
import os, sys, re, json, time
import itertools, collections
import dill
import numpy as np

# Helper libraries for this notebook
import ngram_lm
import ngram_utils
from shared_lib import utils, vocabulary

Load the model

In [3]:
# Deserialize the trained language model from disk.
# NOTE: dill (like pickle) can execute arbitrary code while loading, so only
# open model files that come from a trusted source.
with open('model/lm.pkl', 'rb') as model_file:
    lm = dill.load(model_file)

## Predict Probability

In [47]:
def get_top_n(context, n):
    """Return the `n` most probable next words for `context`, best first.

    Args:
        context: sequence of preceding tokens, passed unchanged to
            `lm.next_word_proba` (the cells in this notebook pass two tokens).
        n: number of candidates to return.

    Returns:
        A list of at most `n` words, decoded from UTF-8 and ordered by
        descending probability. The list is empty when `n` is not positive
        or when it exceeds the vocabulary size (a message is printed in the
        latter case).
    """
    if n <= 0:
        # A negative slice bound would silently drop words from the wrong end.
        return []

    vocab = list(lm.words)
    if n > len(vocab):
        # Use the model's real vocabulary size instead of a hard-coded
        # constant, and return an empty list (not None) so callers that
        # iterate over the result do not crash.
        print('Exceed the vocabulary size %d!' % len(vocab))
        return []

    scored = [(word, lm.next_word_proba(word, context)) for word in vocab]
    # list.sort is stable even with reverse=True, so equally probable words
    # keep their vocabulary order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [word.decode('utf-8') for word, _ in scored[:n]]

In [48]:
# The context is only the two start-of-sentence pads, so this lists the
# five most likely sentence-initial tokens.
context = ["<s>"] * 2
result = get_top_n(context, 5)

for word in result:
    print(word)

我
你
他
但
不


In [49]:
# Five most likely tokens to follow the two-token context below.
context = ["你", "好"]
result = get_top_n(context, 5)

for word in result:
    print(word)

，
。
吗
像
？


## Sampling Sentences

In [4]:
max_length = 20
num_sentences = 5

for _ in range(num_sentences):
    # Every sample starts from the two start-of-sentence pads.
    seq = ["<s>", "<s>"]
    for i in range(max_length):
        next_token = ngram_utils.predict_next(lm, seq)
        seq.append(next_token)
        # Stop early once the model emits the end-of-sentence token.
        if next_token == "</s>":
            break
    print(" ".join(seq))
    # score_seq returns (log-probability, token count); the format string
    # prints the count first, hence the swapped positional indices.
    log_p_seq, num_tokens = ngram_utils.score_seq(lm, seq)
    print("[{1:d} tokens; log P(seq): {0:.02f}]".format(log_p_seq, num_tokens))
    print("")

<s> <s> 他 在 监 视 机 和 咖 啡 师 . . </s>
[11 tokens; log P(seq): -55.13]

<s> <s> 感 情 跟 踪 我 。 不 可 能 是 - 没 错 。 </s>
[14 tokens; log P(seq): -74.85]

<s> <s> 像 一 颗 星 球 最 快 增 长 。 </s>
[10 tokens; log P(seq): -51.88]

<s> <s> 在 报 告 上 说 道 ， 但 是 星 星 从 尤 文 ， 埃 伦 向 我 招
[20 tokens; log P(seq): -120.26]

<s> <s> 今 天 我 不 能 傻 坐 着 休 闲 和 工 作 ， </s>
[14 tokens; log P(seq): -64.60]



## Scoring on Test Data

Load and preprocess

In [7]:
# Tokenize the raw validation text before scoring.
# NOTE(review): presumably reads data/valid.zh and writes data/valid.tok
# (the next cell reads that file) -- confirm in shared_lib.utils.
utils.tokenizeChinese("data/valid.zh", "data/valid.tok")

In [8]:
# Read the tokenized validation set: one sentence per line, tokens separated
# by whitespace.
sent_arr = []
with open("data/valid.tok", "r") as tok_file:  # ensure the handle is closed
    for line in tok_file:
        # split() with no arguments already ignores leading/trailing whitespace.
        words = line.split()
        sent_arr.append(words)

# Build a 1-D object array holding one token list per sentence. Filling it
# element by element keeps the shape (num_sentences,) even if every sentence
# has the same length (np.asarray would then produce a 2-D array) and avoids
# the ragged-sequence error raised by recent NumPy versions.
sentences = np.empty(len(sent_arr), dtype=object)
for idx, words in enumerate(sent_arr):
    sentences[idx] = words
print(sentences.shape)

(8000,)


In [24]:
def sents_to_tokens(sents):
    """Return a flattened array of the words in the sentences, padded for a trigram model.

    Each sentence is wrapped as `<s> <s> ... </s>`, the padded sentences are
    flattened with `utils.flatten`, and every word is passed through
    `utils.canonicalize_word` with the model vocabulary `lm.words`.

    Args:
        sents: iterable of sentences, each an iterable of word strings
            (list, tuple or array row).

    Returns:
        A numpy object array of the canonicalized tokens of all padded
        sentences, in order.
    """
    # list(s) makes the concatenation work for any iterable sentence; adding a
    # plain list to a tuple or an ndarray row would fail or broadcast instead
    # of appending.
    padded_sentences = (["<s>", "<s>"] + list(s) + ["</s>"] for s in sents)
    # This will canonicalize words, and replace anything not in vocab with <unk>
    return np.array([utils.canonicalize_word(w, wordset=lm.words)
                     for w in utils.flatten(padded_sentences)], dtype=object)

# Flatten the validation sentences into a single padded token array for scoring.
test_tokens = sents_to_tokens(sentences)

Score

In [25]:
# Score the whole validation token stream, then turn the total
# log-probability into a per-token perplexity.
# NOTE(review): the base-2 exponent assumes score_seq returns a base-2
# log-probability -- confirm in ngram_utils.
log_p_data, num_real_tokens = ngram_utils.score_seq(lm, test_tokens)
perplexity = 2 ** (-1 * log_p_data / num_real_tokens)
print("Test perplexity: %.02f" % perplexity)

Test perplexity: 39.26
