# LSTM Language Model

In [1]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries
from shared_lib import utils, vocabulary, tf_embed_viz

import rnnlm
import rnnlm_test
reload(rnnlm)
reload(rnnlm_test)

<module 'rnnlm_test' from 'rnnlm_test.pyc'>

Build the graph. To view the graph, run command below:
```
tensorboard --logdir tf_graph --port 6006
```

In [2]:
TF_GRAPHDIR = "tf_graph"

# Clear old log directory.
shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

lm = rnnlm.RNNLM(V=10000, H=200, num_layers=2)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildSamplerGraph()

summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)

Basic tests on shapes

In [4]:
testnames = ["TestRNNLMCore", "TestRNNLMTrain", "TestRNNLMSampler"]

unittest.TextTestRunner(verbosity=2).run(
    unittest.TestLoader().loadTestsFromNames(
        testnames, rnnlm_test))

test_shapes_embed (rnnlm_test.TestRNNLMCore) ... ok
test_shapes_output (rnnlm_test.TestRNNLMCore) ... ok
test_shapes_recurrent (rnnlm_test.TestRNNLMCore) ... ok
test_shapes_train (rnnlm_test.TestRNNLMTrain) ... ok
test_shapes_sample (rnnlm_test.TestRNNLMSampler) ... ok

----------------------------------------------------------------------
Ran 5 tests in 2.006s

OK


<unittest.runner.TextTestResult run=5 errors=0 failures=0>

Batch generator. Test with a sample sentence. The actual data we feed to our model will be word indices instead of the word itself, but the shape will be the same.

In [7]:
test_corpus = "<s> Today is Sunday . </s> <s> Tomorrow is another good day . </s>"
test_corpus = np.array(test_corpus.split())

html = "<h3>Input words w:</h3>"
html += "<table><tr><th>Batch 0</th><th>Batch 1</th></tr><tr>"
bi = utils.batch_generator(test_corpus, batch_size=2, max_time=4)
for i, (w,y) in enumerate(bi):
    html += "<td>" + utils.render_matrix(w, cols=["w_%d" % d for d in range(w.shape[1])], dtype=object) + "</td>"
html += "</tr></table>"
display(HTML(html))

html = "<h3>Target words y:</h3>"
html += "<table><tr><th>Batch 0</th><th>Batch 1</th></tr><tr>"
bi = utils.batch_generator(test_corpus, batch_size=2, max_time=4)
for i, (w,y) in enumerate(bi):
    html += "<td>" + utils.render_matrix(y, cols=["y_%d" % d for d in range(y.shape[1])], dtype=object) + "</td>"
html += "</tr></table>"
display(HTML(html))

Unnamed: 0_level_0,w_0,w_1,w_2,w_3
Unnamed: 0_level_1,w_0,w_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,<s>,Today,is,Sunday
1,<s>,Tomorrow,is,another
0,.,</s>,,
1,good,day,,
Batch 0,Batch 1,,,
"w_0  w_1  w_2  w_3  0  <s>  Today  is  Sunday  1  <s>  Tomorrow  is  another  var df = $('table.dataframe'); var cells = df.children('tbody').children('tr')  .children('td'); cells.css(""width"", ""30px"").css(""height"", ""30px"");","w_0  w_1  0  .  </s>  1  good  day  var df = $('table.dataframe'); var cells = df.children('tbody').children('tr')  .children('td'); cells.css(""width"", ""30px"").css(""height"", ""30px"");",,,

Unnamed: 0,w_0,w_1,w_2,w_3
0,<s>,Today,is,Sunday
1,<s>,Tomorrow,is,another

Unnamed: 0,w_0,w_1
0,.,</s>
1,good,day


Unnamed: 0_level_0,y_0,y_1,y_2,y_3
Unnamed: 0_level_1,y_0,y_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Today,is,Sunday,.
1,Tomorrow,is,another,good
0,</s>,<s>,,
1,day,.,,
Batch 0,Batch 1,,,
"y_0  y_1  y_2  y_3  0  Today  is  Sunday  .  1  Tomorrow  is  another  good  var df = $('table.dataframe'); var cells = df.children('tbody').children('tr')  .children('td'); cells.css(""width"", ""30px"").css(""height"", ""30px"");","y_0  y_1  0  </s>  <s>  1  day  .  var df = $('table.dataframe'); var cells = df.children('tbody').children('tr')  .children('td'); cells.css(""width"", ""30px"").css(""height"", ""30px"");",,,

Unnamed: 0,y_0,y_1,y_2,y_3
0,Today,is,Sunday,.
1,Tomorrow,is,another,good

Unnamed: 0,y_0,y_1
0,</s>,<s>
1,day,.


Function to run one epoch and return average cost

In [24]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=0.1):
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
        loss = lm.train_loss_
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time
        loss = lm.loss_  # true loss, if train_loss is an approximation

    for i, (w, y) in enumerate(batch_iterator):
        cost = 0.0
        # At first batch in epoch, get a clean intitial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {lm.input_w_: w,
                     lm.target_y_: y,
                     lm.initial_h_: h,
                     lm.learning_rate_: learning_rate,
                     lm.use_dropout_: use_dropout
                    }
        
        cost, h, _ = session.run([loss, lm.final_h_, train_op], feed_dict)
        
        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print "[batch %d]: seen %d words at %d wps, loss = %.3f" % (
                i, total_words, avg_wps, avg_cost)
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

In [25]:
def score_dataset(lm, session, ids, name="Data"):
    # For scoring, we can use larger batches to speed things up.
    bi = utils.batch_generator(ids, batch_size=100, max_time=100)
    cost = run_epoch(lm, session, bi, 
                     learning_rate=1.0, train=False, 
                     verbose=False, tick_s=3600)
    print "%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost))

Test against a very small corpus

In [26]:
reload(rnnlm)
reload(rnnlm_test)
th = rnnlm_test.RunEpochTester("test_simple_model")
th.setUp()
th.injectCode(run_epoch, score_dataset)
unittest.TextTestRunner(verbosity=2).run(th)

test_simple_model (rnnlm_test.RunEpochTester) ... 

[batch 143]: seen 7200 words at 7166 wps, loss = 0.520
[batch 289]: seen 14500 words at 7207 wps, loss = 0.339
[batch 442]: seen 22150 words at 7352 wps, loss = 0.261
[batch 638]: seen 31950 words at 7961 wps, loss = 0.211
[batch 782]: seen 39150 words at 7799 wps, loss = 0.188
Train set: avg. loss: 0.001  (perplexity: 1.00)
Test set: avg. loss: 0.005  (perplexity: 1.01)


ok

----------------------------------------------------------------------
Ran 1 test in 6.442s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>

## Training

In [12]:
nltk.download() # Download brown corpus

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [2]:
# Load the dataset
V = 10000
vocab, train_ids, test_ids = utils.load_corpus("brown", split=0.8, V=V, shuffle=42)

Loaded 57340 sentences (1.16119e+06 tokens)
Training set: 45872 sentences (924077 tokens)
Test set: 11468 sentences (237115 tokens)


In [3]:
# Training parameters
max_time = 20
batch_size = 50
learning_rate = 0.5
num_epochs = 5

# Model parameters
model_params = dict(V=vocab.size, 
                    H=100, 
                    softmax_ns=200,
                    num_layers=1)

TF_SAVEDIR = "tf_saved"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [28]:
# Will print status every this many seconds
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in xrange(1, num_epochs+1):
        t0_epoch = time.time()
        bi = utils.batch_generator(train_ids, batch_size, max_time)
        print "[epoch %d] Starting epoch %d" % (epoch, epoch)
        
        # Run a training epoch.
        run_epoch(lm, session, bi,
              train=True, verbose=False,
              tick_s=print_interval, learning_rate=learning_rate)
        
        print "[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so comment it out here, but to
        # run it at the end to evaluate the score.
        
        #print ("[epoch %d]" % epoch),
        #score_dataset(lm, session, train_ids, name="Train set")
        #print ("[epoch %d]" % epoch),
        #score_dataset(lm, session, test_ids, name="Test set")
        #print ""
    
    # Score
    print "[Train Set Score]"
    score_dataset(lm, session, train_ids, name="Train set")
    print "[Test Set Score]"
    score_dataset(lm, session, test_ids, name="Test set")
    
    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:03:22
[epoch 2] Starting epoch 2
[epoch 2] Completed in 0:03:00
[epoch 3] Starting epoch 3
[epoch 3] Completed in 0:03:29
[epoch 4] Starting epoch 4
[epoch 4] Completed in 0:03:27
[epoch 5] Starting epoch 5
[epoch 5] Completed in 0:03:20
[Train Set Score]
Train set: avg. loss: 5.438  (perplexity: 230.04)
[Test Set Score]
Test set: avg. loss: 5.459  (perplexity: 234.77)


## Test

In [4]:
def score_seq(lm, session, seq, vocab):
    """Score a sequence of words. Returns total log-probability."""
    padded_ids = vocab.words_to_ids(utils.canonicalize_words(["<s>"] + seq + ["</s>"], 
                                                             wordset=vocab.word_to_id))
    w = np.reshape(padded_ids[:-1], [1,-1])
    y = np.reshape(padded_ids[1:],  [1,-1])
    h = session.run(lm.initial_h_, {lm.input_w_: w})
    feed_dict = {lm.input_w_:w,
                 lm.target_y_:y,
                 lm.initial_h_:h,
                 lm.dropout_keep_prob_: 1.0}
    # Return log(P(seq)) = -1*loss
    return -1*session.run(lm.loss_, feed_dict)

def load_and_score(inputs, sort=False):
    """Load the trained model and score the given words."""
    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    
    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:  
        # Load the trained model
        saver.restore(session, trained_filename)

        if isinstance(inputs[0], str) or isinstance(inputs[0], unicode):
            inputs = [inputs]

        # Actually run scoring
        results = []
        for words in inputs:
            score = score_seq(lm, session, words, vocab)
            results.append((score, words))

        # Sort if requested
        if sort: results = sorted(results, reverse=True)

        # Print results
        for score, words in results:
            print "\"%s\" : %.02f" % (" ".join(words), score)

#### Test context information

In [8]:
sents_1 = ["peanuts are my favorite kind of nut",
         "peanuts are my favorite kind of vegetable"]
load_and_score([s.split() for s in sents_1])

sents_2 = ["when I'm hungry I really prefer to eat",
         "when I'm hungry I really prefer to drink"]
load_and_score([s.split() for s in sents_2])

INFO:tensorflow:Restoring parameters from tf_saved/rnnlm_trained
"peanuts are my favorite kind of nut" : -6.67
"peanuts are my favorite kind of vegetable" : -6.72
INFO:tensorflow:Restoring parameters from tf_saved/rnnlm_trained
"when I'm hungry I really prefer to eat" : -7.78
"when I'm hungry I really prefer to drink" : -7.85


#### Test adjectives ordering

In [9]:
prefix = "I have lots of".split()
noun = "toys"
adjectives = ["square", "green", "plastic"]
inputs = []
for adjs in itertools.permutations(adjectives):
    words = prefix + list(adjs) + [noun]
    inputs.append(words)
    
load_and_score(inputs, sort=True)

INFO:tensorflow:Restoring parameters from tf_saved/rnnlm_trained
"I have lots of green plastic square toys" : -7.70
"I have lots of plastic square green toys" : -7.70
"I have lots of plastic green square toys" : -7.72
"I have lots of green square plastic toys" : -7.77
"I have lots of square green plastic toys" : -7.80
"I have lots of square plastic green toys" : -7.84
