# Simple 3-gram Language Modeling

In [27]:
import os, sys, re, json, time
import itertools, collections
import dill
import numpy as np

# Helper libraries for this notebook
import ngram_lm
import ngram_utils
from shared_lib import utils, vocabulary

## Loading & Preprocessing

### Segmentation per character

In [2]:
def tokenizeChinese(input_file, output_file):
    """Segment a UTF-8 text file into space-separated tokens, one line per line.

    Every run of consecutive ASCII letters/digits is kept together as a single
    token; every other character (e.g. a CJK character or a punctuation mark)
    becomes a token of its own. ASCII whitespace only separates tokens and is
    never emitted, so blank input lines produce blank output lines.

    Args:
        input_file: path of the UTF-8 encoded text file to read.
        output_file: path of the UTF-8 encoded file to write (overwritten).

    Raises:
        IOError / OSError: if either file cannot be opened.
        UnicodeDecodeError: if the input is not valid UTF-8.
    """
    import io  # local import: io.open behaves the same on Python 2 and 3

    # Only these characters are treated as separators. Non-ASCII spaces such as
    # U+3000 are deliberately kept as tokens, matching byte-string split().
    ascii_whitespace = u" \t\n\r\x0b\x0c"

    # newline="\n" keeps "\n" as the only line terminator on input; a stray
    # "\r" is then just whitespace inside the line and gets dropped below.
    # Both files are closed by the with-statement, and the input is streamed
    # line by line instead of being loaded into memory at once.
    with io.open(input_file, "r", encoding="utf-8", newline="\n") as source_file, \
            io.open(output_file, "w", encoding="utf-8") as target_file:
        for line in source_file:
            tokens = []
            alnum_run = []  # characters of the ASCII alphanumeric token in progress
            for c in line:
                # Explicit ASCII check: unicode.isalnum() alone is also true
                # for CJK characters, which must stay one token each.
                if ord(c) < 128 and c.isalnum():
                    alnum_run.append(c)
                    continue
                if alnum_run:
                    tokens.append(u"".join(alnum_run))
                    alnum_run = []
                if c not in ascii_whitespace:
                    tokens.append(c)
            if alnum_run:
                tokens.append(u"".join(alnum_run))
            target_file.write(u" ".join(tokens) + u"\n")

In [3]:
# Segment the raw training corpus into the tokenized file read by the next section.
tokenizeChinese(input_file="data/train.zh", output_file="data/train.tok")

### Load sentences

In [4]:
sent_arr = []
for line in open("data/train.tok", "r").readlines():
    line = line.strip()
    words = line.split()
    sent_arr.append(words)
    
sentences = np.asarray(sent_arr)
print sentences.shape

(9903244,)


### Build Vocabulary

Set the vocabulary size to 50,000. All other words will be mapped to `<unk>`.

In [6]:
V=50000
vocab = vocabulary.Vocabulary((utils.canonicalize_word(w) for w in utils.flatten(sentences)), size=V)
print "Train set vocabulary: %d words" % vocab.size

Train set vocabulary: 50000 words


Save the vocabulary

In [28]:
# Create the output directory first so the dump cannot fail on a fresh checkout.
if not os.path.isdir('model'):
    os.makedirs('model')
# Serialize the vocabulary with dill to model/vocab.pkl.
with open('model/vocab.pkl', 'wb') as output:
    dill.dump(vocab, output)

### Preprocessing

Our smoothed models will be trigram models, so for convenience we'll prepend *two* `<s>` markers.

To make the data easier to work with, we'll store the flattened list of tokens as a NumPy array.

In [10]:
def sents_to_tokens(sents):
    """Return a flattened array of the words in the sentences, padded for a trigram model.

    Each sentence is wrapped as ``<s> <s> w1 ... wn </s>`` and the padded
    sentences are concatenated into a single token sequence.

    Args:
        sents: iterable of sentences; each sentence is an iterable of words
            (a list, a tuple, or a row of an array).

    Returns:
        1-D NumPy array with dtype=object holding the canonicalized tokens.
    """
    # list(s) makes the concatenation work for any iterable sentence; adding a
    # plain list to a NumPy array would not concatenate.
    padded_sentences = (["<s>", "<s>"] + list(s) + ["</s>"] for s in sents)
    # This will canonicalize words, and replace anything not in vocab with <unk>
    return np.array([utils.canonicalize_word(w, wordset=vocab.wordset)
                     for w in utils.flatten(padded_sentences)], dtype=object)

In [None]:
# Flatten the padded training sentences into one token array for the LM.
train_tokens = sents_to_tokens(sents=sentences)

In [19]:
print "Sample data: \n" + repr(train_tokens[:20])

Sample data: 
array(['<s>', '<s>', '\xe4\xb8\x80', '\xe5\xaf\xb9', '\xe4\xb8\xb9',
       '\xe9\xa1\xb6', '\xe9\xb9\xa4', '\xe6\xad\xa3', '\xe7\x9b\x91',
       '\xe8\xa7\x86', '\xe7\x9d\x80', '\xe5\xae\x83', '\xe4\xbb\xac',
       '\xe7\x9a\x84', '\xe7\xad\x91', '\xe5\xb7\xa2', '\xe9\xa2\x86',
       '\xe5\x9c\xb0', '</s>', '<s>'], dtype=object)


## Training

In [23]:
import ngram_lm
reload(ngram_lm)

# Switch between different smooth mode
# Model = ngram_lm.AddKTrigramLM
Model = ngram_lm.KNTrigramLM

t0 = time.time()
print "Building trigram LM...",
lm = Model(train_tokens)
print "done in %.02f s" % (time.time() - t0)
ngram_utils.print_stats(lm)

Building trigram LM... done in 1144.55 s
=== N-gram Language Model stats ===
50000 unique 1-grams
2.62925e+06 unique 2-grams
2.1179e+07 unique 3-grams
Optimal memory usage (counts only): 475 MB


In [29]:
# Create the output directory first: a missing directory would otherwise make
# the dump fail after the long training run above.
if not os.path.isdir('model'):
    os.makedirs('model')
# Serialize the trained language model with dill to model/lm.pkl.
with open('model/lm.pkl', 'wb') as output:
    dill.dump(lm, output)

In [7]:
# Change the parameters below to adjust the smoothing factor. `AddKTrigramLM` ignores `delta`,
# and `KNTrigramLM` ignores `k`.
# lm.set_live_params(k = 0.001, delta=0.75)

## Scoring on the Training Data

In [31]:
log_p_data, num_real_tokens = ngram_utils.score_seq(lm, train_tokens)
print "Train perplexity: %.02f" % (2**(-1*log_p_data/num_real_tokens))

Train perplexity: 33.21


## Sampling Sentences

In [30]:
max_length = 20
num_sentences = 5

for _ in range(num_sentences):
    seq = ["<s>", "<s>"]
    for i in range(max_length):
        seq.append(ngram_utils.predict_next(lm, seq))
        # Stop at end-of-sentence.
        if seq[-1] == "</s>": break
    print " ".join(seq)
    print "[{1:d} tokens; log P(seq): {0:.02f}]".format(*ngram_utils.score_seq(lm, seq))
    print ""

<s> <s> 小 陈 教 授 权 并 非 是 谁 ？ </s>
[10 tokens; log P(seq): -63.95]

<s> <s> 科 洛 依 德 博 士 </s>
[6 tokens; log P(seq): -25.80]

<s> <s> 我 们 要 去 。 </s>
[5 tokens; log P(seq): -18.53]

<s> <s> 尽 管 第 一 次 经 历 过 传 送 器 ” 与 “ 别 提 这 些 礼 物
[20 tokens; log P(seq): -104.52]

<s> <s> 别 担 心 热 点 。 </s>
[6 tokens; log P(seq): -31.40]

