In [1]:
%pylab inline
from IPython.display import Image, display

import tensorflow as tf

Populating the interactive namespace from numpy and matplotlib


## Language Modeling Using TensorFlow
  - 熟读唐诗三百首,不会作诗也会吟

### Task

* Given a sequence of words, predict the next word
* Models the probability of sentences in a language 
* Data. 
  - 全唐诗 + 全宋词

* Sequence
$$ 
\begin{eqnarray}
x & = & x_1, x_2, x_3, ..., x_n 
\end{eqnarray}
$$

* E.g.,
$$
\begin{eqnarray}
x & = & 明月几时有
\end{eqnarray}
$$
$$
x_1 = 明, x_2 = 月, x_3 = 几, x_4 = 时, x_5 = 有
$$

### Model
  - Character embedding
  - A recurrent neural network
  - Stacked, unrolled in time
  - Long short term memory (LSTM) cells

<img src='data/lstm.png' />

### LSTM Cell
  - Takes input, previous output and current state, and produces output and next state.
  
$$
h_t, C_t = lstm(x_t, h_{t-1}, C_{t-1})
$$

<img align='center' src='data/lstm_cell.png' width='40%'>

* Full set of equations ($[]$ is vector concatenation, $\times$ is matrix multiply, $*$ is element-wise multiply)

$$ X = [h_{t-1}, x_t] $$
$$ f_t = \sigma(W_f \times X + b_f) $$
$$ i_t = \sigma(W_i \times X + b_i) $$
$$ o_t = \sigma(W_o \times X + b_o) $$
$$ \tilde{C}_t = tanh(W_C \times X + b_C) $$
$$ C_t = f_t * C_{t-1} + i_t * \tilde{C}_t$$
$$ h_t = o_t * tanh(C_t)$$

### A bit of maths

* Model: the probability of a sequence
$$ p_\theta(x) = p_\theta(x_1)p_\theta(x_2|x_1)p_\theta(x_3|x_1x_2)...p_\theta(x_n|x_1x_2...x_{n-1}) $$
* $\theta$ to be estimated.

* Maximum likelihood estimation.
$$ 
\operatorname*{arg\,max}_\theta \prod_{x\in D} p_\theta(x)
$$

* Equivalent to
$$ 
\operatorname*{arg\,min}_\theta \; -\frac{1}{N} \sum_{x\in D} \log p_\theta(x)
 = \operatorname*{arg\,min}_\theta \; -\frac{1}{N} \sum_{x\in D} \sum_i \log p_\theta(x_i|x_1x_2...x_{i-1})
$$

  $D$ is the data set and $N$ is the number of samples in the data set.

* Per-word loss term:
$$
-log(p_\theta(x_i|x_1x_2...x_{i-1}))
$$

### Data

In [2]:
class TrainData(object):
    """Character-level training data read from a plain-text corpus.

    The corpus is read once, newlines are replaced by ``'_'``, and every
    distinct character is assigned an integer id in sorted order.  Batches
    are cut from the resulting id sequence as consecutive, non-overlapping
    windows of ``steps`` characters; the target window is the input window
    shifted right by one character.
    """

    def __init__(self, corpus, batch, steps):
        """Load ``corpus`` and prepare the batch generator.

        Args:
            corpus: Path of the text file to read.
            batch: Number of windows returned by each `get_batch` call.
            steps: Number of characters in each window.

        Raises:
            ValueError: If the corpus has fewer than ``steps + 1`` characters,
                in which case no full (input, target) window exists.
        """
        self.batch = batch
        self.steps = steps
        # Context manager so the file handle is closed as soon as it is read.
        # NOTE(review): decoding uses the platform default encoding; pass an
        # explicit encoding here if the corpus encoding is known -- confirm.
        with open(corpus, mode='r') as corpus_file:
            words = corpus_file.read().replace('\n', '_')
        self.id_to_word = sorted(set(words))
        self.word_to_id = {w: i for i, w in enumerate(self.id_to_word)}
        self.data = [self.word_to_id[w] for w in words]
        if len(self.data) < steps + 1:
            # Without this check the generator would loop forever yielding
            # truncated windows whose input and target lengths differ.
            raise ValueError(
                'corpus has %d characters but at least %d are needed for '
                'steps=%d' % (len(self.data), steps + 1, steps))
        print('Number of unique chars: ', len(self.id_to_word))
        print('Number of training chars: ', len(self.data))
        self.seqgen = self.seq_generator()

    def seq_generator(self):
        """Yield ``(input_ids, target_ids)`` windows forever, wrapping around.

        Each window holds ``steps`` ids; the target is the same window shifted
        by one position.  Windows do not overlap, and the cursor returns to
        the start once a full window plus its shifted target no longer fits.
        """
        curr = 0
        while True:
            if curr > len(self.data) - self.steps - 1:
                curr = 0
            start, limit = curr, curr + self.steps
            w, t = (self.data[start:limit], self.data[start + 1:limit + 1])
            curr = limit
            yield w, t

    def get_batch(self):
        """Return the next ``batch`` windows as two numpy arrays.

        Returns:
            A pair ``(inputs, targets)``, each built from ``batch`` windows
            of ``steps`` ids.
        """
        inputs, targets = [], []
        for _ in range(self.batch):
            w, t = next(self.seqgen)
            inputs.append(w)
            targets.append(t)
        return np.array(inputs), np.array(targets)

    def to_words(self, ids):
        """Map an iterable of character ids back to a list of characters."""
        return [self.id_to_word[x] for x in ids]

    @property
    def vocab(self):
        """Number of distinct characters in the corpus."""
        return len(self.id_to_word)

In [None]:
class TrainData(object):
    """Character-level corpus: reads the text and builds the id vocabulary."""

    def __init__(self, corpus, batch, steps):
        """Read ``corpus`` and index every distinct character.

        Args:
            corpus: Path of the text file to read.
            batch: Number of sequences per batch (stored as given).
            steps: Number of characters per sequence (stored as given).
        """
        self.batch = batch
        self.steps = steps
        # Context manager so the file handle is closed once the text is read.
        with open(corpus, mode='r') as corpus_file:
            words = corpus_file.read().replace('\n', '_')
        self.id_to_word = sorted(set(words))
        self.word_to_id = {w: i for i, w in enumerate(self.id_to_word)}
        self.data = [self.word_to_id[w] for w in words]
        # The logging call formats lazily with `msg % args`, so the message
        # needs a placeholder for the value passed after it.
        tf.logging.info('Number of unique chars: %d', len(self.id_to_word))
        tf.logging.info('Number of training chars: %d', len(self.data))


In [None]:
    def __init__(...):
      # Slide excerpt (pseudo-code): `...` stands in for the constructor's
      # parameter list.  This line creates the generator that get_batch()
      # pulls windows from.
      self.seqgen = self.seq_generator()        

    def seq_generator(self):
        """Endlessly yield ``(inputs, targets)`` id windows of ``self.steps`` items.

        ``targets`` is ``inputs`` shifted one position to the right in
        ``self.data``.  Successive windows do not overlap; the offset goes
        back to zero when a window plus its shifted target would run past
        the end of the data.
        """
        offset = 0
        while True:
            # Wrap around when offset + steps + 1 items are no longer available.
            if offset + self.steps + 1 > len(self.data):
                offset = 0
            end = offset + self.steps
            inputs = self.data[offset:end]
            targets = self.data[offset + 1:end + 1]
            offset = end
            yield inputs, targets

    def get_batch(self):
        """Pull ``self.batch`` windows from the generator and stack them.

        Returns:
            ``(inputs, targets)`` as two numpy arrays, one row per window.
        """
        pairs = [next(self.seqgen) for _ in range(self.batch)]
        inputs = [pair[0] for pair in pairs]
        targets = [pair[1] for pair in pairs]
        return np.array(inputs), np.array(targets)


In [None]:
    def to_words(self, ids):
        """Translate an iterable of character ids into a list of characters."""
        words = []
        for idx in ids:
            words.append(self.id_to_word[idx])
        return words

    @property
    def vocab(self):
        return len(self.id_to_word)

### Example

In [18]:
data = TrainData('./data/poem.txt', 1, 10)
x, y = data.get_batch()
# Print the first input window, then its target (the window shifted by one).
for ids in (x[0], y[0]):
    print(data.to_words(ids))

Number of unique chars:  7957
Number of training chars:  4888154
['秦', '川', '雄', '帝', '宅', '，', '函', '谷', '壮', '皇']
['川', '雄', '帝', '宅', '，', '函', '谷', '壮', '皇', '居']


### Implement an LSTM cell as a class
  - we can instantiate many layers

In [23]:
class LSTM(object):
    """One LSTM layer: four gate weight matrices and biases plus the step function.

    Each weight matrix has shape ``[2 * dims, dims]`` because it multiplies
    the concatenation of the previous output and the current input.
    """

    def __init__(self, ith, dims):
        """Create the layer's variables under the name scope ``lstm_<ith>``.

        Args:
            ith: Layer index, used only to build the name scope.
            dims: Width of the input, output and cell state.
        """
        self.dims = dims
        with tf.name_scope('lstm_%d' % ith):
            # Weights are created before biases, in f, i, o, C order.
            self.W_f = self._weight('wf')
            self.W_i = self._weight('wi')
            self.W_o = self._weight('wo')
            self.W_C = self._weight('wc')
            self.b_f = self._bias('bf')
            self.b_i = self._bias('bi')
            self.b_o = self._bias('bo')
            self.b_C = self._bias('bc')

    def _weight(self, name):
        """Create one gate weight matrix from `initializer`."""
        return tf.Variable(self.initializer(), name=name)

    def _bias(self, name):
        """Create one zero-initialised gate bias of length ``dims``."""
        return tf.Variable(tf.zeros([self.dims]), name=name)

    @staticmethod
    def _affine(X, W, b):
        """Return ``X x W + b``, the pre-activation shared by every gate."""
        return tf.matmul(X, W) + b

    def forward(self, x_t, h_t1, C_t1):
        """Advance the cell by one time step.

        Args:
            x_t: Input at the current step.
            h_t1: Output of this layer at the previous step.
            C_t1: Cell state of this layer at the previous step.

        Returns:
            ``(h_t, C_t)``: the new output and the new cell state.
        """
        # Previous output and current input are joined along axis 1.
        X = tf.concat(1, [h_t1, x_t])
        forget_gate = tf.sigmoid(self._affine(X, self.W_f, self.b_f))
        input_gate = tf.sigmoid(self._affine(X, self.W_i, self.b_i))
        output_gate = tf.sigmoid(self._affine(X, self.W_o, self.b_o))
        candidate = tf.tanh(self._affine(X, self.W_C, self.b_C))
        C_t = forget_gate * C_t1 + input_gate * candidate
        h_t = output_gate * tf.tanh(C_t)
        return h_t, C_t

    def initializer(self):
        """Return a uniform random ``[2 * dims, dims]`` tensor in [-0.1, 0.1)."""
        return tf.random_uniform([2*self.dims, self.dims], -0.1, 0.1)

### Implement the Model

In [24]:
class Model(object):
    """Stacked-LSTM character language model on a private TensorFlow graph.

    The constructor builds two computations over the same variables: a
    training graph unrolled for ``steps`` time steps, and a single-step
    inference graph whose per-layer state is fed in and read back on each
    call.
    """

    def __init__(self, dims, vocab, depth, steps, lr):
        """Build the graph, open a session and initialise all variables.

        Args:
            dims: Width of the character embedding and of each LSTM state.
            vocab: Number of distinct characters (rows of the embedding).
            depth: Number of stacked LSTM layers.
            steps: Number of time steps the training graph is unrolled for.
            lr: Learning rate passed to the gradient descent optimizer.
        """
        # Configs.
        self.dims = dims
        self.vocab = vocab
        self.depth = depth
        self.steps = steps
        self.lr = lr

        self.graph = tf.Graph()
        with self.graph.as_default():
            # Variables: embedding table, one LSTM per layer, output layer.
            self.embedding = tf.Variable(
                tf.random_uniform([vocab, dims], -0.02, 0.02))
            self.lstm = []
            for i in range(depth):
                self.lstm.append(LSTM(i, self.dims))
            with tf.name_scope('sm'):
                self.sm_w = tf.Variable(
                    tf.random_uniform([dims, vocab], -0.1, 0.1),
                    name='w')
                self.sm_b = tf.Variable(
                    tf.zeros([vocab]), name='b')

            # Feeds: int64 placeholders created without a static shape.
            self.words = tf.placeholder(tf.int64)
            self.targets = tf.placeholder(tf.int64)

            # Forward pass.  The initial state is zeros of shape
            # [batch, dims], with batch taken from the fed words at run time.
            batch_size = tf.shape(self.words)[:1]
            shape = tf.concat(0, [batch_size, [dims]])
            init_zeros = tf.zeros(shape)
            h = [init_zeros] * depth
            c = [init_zeros] * depth
            o = []

            # Unroll: at every step the embedding of column i goes through
            # all layers, each layer feeding the next.
            for i in range(steps):
                x = tf.nn.embedding_lookup(
                    self.embedding, self.words[:, i])
                for j in range(self.depth):
                    h[j], c[j] = self.lstm[j].forward(x, h[j], c[j])
                    x = h[j]
                o.append(x)
            # Per-step outputs are joined along axis 1 and flattened to rows
            # of width dims, matching the flattened targets below.
            outputs = tf.reshape(tf.concat(1, o), [-1, dims])
            logits = tf.matmul(outputs, self.sm_w) + self.sm_b

            # Loss: mean per-character cross entropy.
            costs = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, tf.reshape(self.targets, [-1]))
            self.loss = tf.reduce_mean(costs)

            # Gradients are clipped by global norm before being applied.
            self.global_step = tf.Variable(
                0, trainable=False, name='global_step')
            tvars = tf.trainable_variables()
            grads = tf.gradients(self.loss, tvars)
            grads, _ = tf.clip_by_global_norm(grads, 5.0)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            self.train_op = optimizer.apply_gradients(
                list(zip(grads, tvars)), global_step=self.global_step)

            # Summary
            tf.scalar_summary('loss', self.loss)
            self.summary = tf.merge_summary(
                tf.get_collection(tf.GraphKeys.SUMMARIES))

            # Saver
            self.saver = tf.train.Saver(tf.all_variables())

            # Init
            init = tf.initialize_all_variables()

            # Inference: one step through the stack, with each layer's
            # previous output and cell state supplied through placeholders.
            self.in_h = []
            self.in_c = []
            self.out_h = []
            self.out_c = []
            x = tf.nn.embedding_lookup(self.embedding, self.words)
            for i in range(self.depth):
                h = tf.placeholder(tf.float32)
                c = tf.placeholder(tf.float32)
                self.in_h.append(h)
                self.in_c.append(c)
                h, c = self.lstm[i].forward(x, h, c)
                self.out_h.append(h)
                self.out_c.append(c)
                x = h
            logits = tf.matmul(x, self.sm_w) + self.sm_b
            self.preds = tf.nn.softmax(logits)

        self.sess = tf.Session(graph=self.graph)
        self.sess.run(init)

    def train(self, data, logdir, total_steps):
        """Train until the step counter reaches ``total_steps``.

        Resumes from the latest checkpoint in ``logdir`` when one exists.
        A checkpoint is written every 1000 steps and the loss summary is
        written and printed every 100 steps.

        Args:
            data: Object whose ``get_batch()`` returns ``(words, targets)``.
            logdir: Directory for checkpoints and summaries.
            total_steps: Step count at which training stops.
        """
        swriter = tf.train.SummaryWriter(logdir)
        try:
            # Recover.
            latest = tf.train.latest_checkpoint(logdir)
            if latest is not None:
                print('restore ', latest)
                self.saver.restore(self.sess, latest)

            steps = self.sess.run(self.global_step)
            while steps < total_steps:
                if steps % 1000 == 0:
                    self.saver.save(
                        self.sess, logdir + '/lm_params', global_step=steps)
                # A fresh batch on every step; fetching it only on checkpoint
                # steps would train on one batch for 1000 steps and leave the
                # batch undefined after resuming from a non-multiple of 1000.
                w, t = data.get_batch()
                feed = {self.words: w, self.targets: t}
                if steps % 100 == 0:
                    # The update runs on logging steps too, so the stored
                    # global_step stays equal to the local counter.
                    loss, summary, _ = self.sess.run(
                        [self.loss, self.summary, self.train_op],
                        feed_dict=feed)
                    swriter.add_summary(summary, steps)
                    swriter.flush()
                    print('step %d: %.3f' % (steps, loss))
                else:
                    self.sess.run(self.train_op, feed_dict=feed)
                steps += 1
        finally:
            swriter.close()

    def load(self, checkpoint):
        """Restore all variables from the checkpoint file ``checkpoint``."""
        self.saver.restore(self.sess, checkpoint)

    @staticmethod
    def _sample(probs):
        """Draw one index from the discrete distribution ``probs``.

        The index is clamped to the last entry so that a cumulative sum that
        falls slightly short of the random draw cannot index past the end.
        """
        cdf = np.cumsum(probs)
        draw = np.random.rand()
        return min(int(np.searchsorted(cdf, draw, side='right')),
                   len(cdf) - 1)

    def inference(self, data, prefix, num):
        """Extend ``prefix`` with ``num`` sampled characters.

        The prefix is fed one character at a time to warm up the state; each
        further character is sampled from the predicted distribution and fed
        back in together with the updated state.

        Args:
            data: Object providing ``word_to_id`` and ``id_to_word``.
            prefix: Seed text; every character must be in ``word_to_id``.
            num: Number of characters to generate after the prefix.

        Returns:
            ``prefix`` followed by the generated characters.

        Raises:
            ValueError: If ``prefix`` is empty while ``num`` is positive,
                since there is then no prediction to sample from.
        """
        if not prefix and num > 0:
            raise ValueError(
                'prefix must contain at least one character to seed generation')
        feeds = {}
        zeros = np.zeros([1, self.dims])
        for layer in range(self.depth):
            feeds[self.in_h[layer]] = zeros
            feeds[self.in_c[layer]] = zeros
        output = []
        probs = None
        for pos in range(len(prefix) + num):
            if pos < len(prefix):
                char_id = data.word_to_id[prefix[pos]]
            else:
                char_id = self._sample(probs)
                output.append(char_id)
            feeds[self.words] = np.array([char_id], dtype=np.int64)
            vals = self.sess.run(self.out_h + self.out_c + [self.preds],
                                 feed_dict=feeds)
            # Feed the state just produced back in for the next character.
            for layer in range(self.depth):
                feeds[self.in_h[layer]] = vals[layer]
                feeds[self.in_c[layer]] = vals[self.depth + layer]
            probs = np.reshape(vals[-1], [-1])
        return prefix + ''.join([data.id_to_word[x] for x in output])

### Parameters of the model
* We need to pick embedding dimensions and the dimensions of the state vector.
  - For convenience, let's pick `dims = 256`
* Vocab size.
  - `data.vocab = 7957`
* Embedding vectors
  - `[7957, dims]`.
* The 4 weight matrices in the equation ($W_f, W_i, W_o, W_C$)
  - `[2 * dims, dims]`
* 4 biases ($b_f, b_i, b_o, b_C$)
  - `[dims]`
* Softmax classifier logit layer weights and biases
  - `[dims, 7957], [7957]`

In [None]:
class Model(object):
    """Language model; this excerpt shows only the stored configuration."""

    def __init__(self, dims, vocab, depth, steps, lr):
        """Record the hyper-parameters on the instance under the same names."""
        # Configs.
        self.dims, self.vocab, self.depth = dims, vocab, depth
        self.steps, self.lr = steps, lr


### Describe the model as a graph

In [None]:
        # The model owns its own graph; it is made the default graph for the
        # duration of the `with` block.
        self.graph = tf.Graph()
        with self.graph.as_default():


### Declare embedding vectors, LSTM cells, and logit layer params

In [None]:
            # Var
            # Embedding table of shape [vocab, dims], uniform in [-0.02, 0.02).
            self.embedding = tf.Variable(
                tf.random_uniform([vocab, dims], -0.02, 0.02))
            # One LSTM object per layer, indexed 0..depth-1.
            self.lstm = []
            for i in range(depth):
                self.lstm.append(LSTM(i, self.dims))
            # Logit layer parameters under the 'sm' name scope:
            # weights [dims, vocab] and a zero bias [vocab].
            with tf.name_scope('sm'):
                self.sm_w = tf.Variable(
                    tf.random_uniform([dims, vocab], -0.1, 0.1),
                    name='w')
                self.sm_b = tf.Variable(
                    tf.zeros([vocab]), name='b')



### Feeds: inputs to the model

In [None]:
            # Feeds.
            # Two int64 placeholders created without a static shape; values
            # are supplied through feed_dict when the session runs.
            self.words = tf.placeholder(tf.int64)
            self.targets = tf.placeholder(tf.int64)


### Forward function
  - How to compute from the inputs to the outputs.

In [None]:
            # Define forward.
            # Initial state: zeros of shape [batch, dims], where batch is the
            # first dimension of the fed words, read at run time.
            batch_size = tf.shape(self.words)[:1] 
            shape = tf.concat(0, [batch_size, [dims]]) 
            init_zeros = tf.zeros(shape)
            # Per-layer output (h) and cell state (c), all starting at zero.
            h = [init_zeros] * depth
            c = [init_zeros] * depth
            # Collects the top layer's output at every time step.
            o = []
            
            # Unroll LSTMs.
            for i in range(steps):
                # Get the embedding for words
                x = tf.nn.embedding_lookup(
                    self.embedding, self.words[:, i])
                # Each layer consumes the output of the layer below it.
                for j in range(self.depth):
                    h[j], c[j] = self.lstm[j].forward(x, h[j], c[j])
                    x = h[j]
                o.append(x)
            # Join the per-step outputs along axis 1, then flatten to rows of
            # width dims before the logit layer.
            outputs = tf.reshape(tf.concat(1, o), [-1, dims])
            logits = tf.matmul(outputs, self.sm_w) + self.sm_b


### Loss: the optimization goal

In [None]:
            # Compute the loss.
            # Targets are flattened to one id per logits row; the loss is the
            # mean of the per-row cross-entropy costs.
            costs = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, tf.reshape(self.targets, [-1]))
            self.loss = tf.reduce_mean(costs)
            


### Optimizer: how to minimize the loss
  - Clip gradients before applying to parameters. 
    - Magic recipe for training RNNs.
  - Use `tf.train.GradientDescentOptimizer` to reduce some boiler plate


In [None]:
            # Define gradients, optimizer.
            # global_step is non-trainable and is passed to apply_gradients
            # below so the optimizer can update it.
            self.global_step = tf.Variable(
                0, trainable=False, name='global_step')
            vars = tf.trainable_variables()
            grads = tf.gradients(self.loss, vars)
            # Clip the gradients by global norm with a limit of 5.0.
            grads, _ = tf.clip_by_global_norm(grads, 5.0)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            self.train_op = optimizer.apply_gradients(
                zip(grads, vars), global_step=self.global_step)



### Training support
  - Summary: pretty plots over time
  - Saver: checkpoint training state
  - Initialization

In [None]:
            # Summary
            # Register a scalar summary for the loss, then merge everything
            # in the SUMMARIES collection into a single op.
            tf.scalar_summary('loss', self.loss)
            self.summary = tf.merge_summary(
                tf.get_collection(tf.GraphKeys.SUMMARIES))
            
            # Saver
            # Built over tf.all_variables() as of this point in the graph.
            self.saver = tf.train.Saver(tf.all_variables())

            # Init
            # Kept in a local; the session runs it once after the graph is built.
            init = tf.initialize_all_variables()


### Inference

In [None]:
            # Inference
            # Single-step graph: each layer's previous output and cell state
            # come in through placeholders (in_h / in_c) and the new values
            # are exposed as out_h / out_c.
            self.in_h = []
            self.in_c = []
            self.out_h = []
            self.out_c = []
            x = tf.nn.embedding_lookup(self.embedding, self.words)
            for i in range(self.depth):
                h = tf.placeholder(tf.float32)
                c = tf.placeholder(tf.float32)
                self.in_h.append(h)
                self.in_c.append(c)
                # h and c are rebound to this layer's new output and state.
                h, c = self.lstm[i].forward(x, h, c)
                self.out_h.append(h)
                self.out_c.append(c)
                x = h
            # Softmax over the logits computed from the top layer's output.
            logits = tf.matmul(x, self.sm_w) + self.sm_b
            self.preds = tf.nn.softmax(logits)


### Session
  - Connect to TensorFlow runtime
  - Initialize everything

In [None]:

        # Open a session bound to this model's graph and run the init op.
        self.sess = tf.Session(graph=self.graph)
        self.sess.run(init)  


### Training
  - Iterative algorithm
  - Periodic checkpoint and summary

In [21]:
    def train(self, data, logdir, total_steps):
        """Train until the step counter reaches ``total_steps``.

        Resumes from the latest checkpoint in ``logdir`` when one exists.
        A checkpoint is written every 1000 steps and the loss summary is
        written and printed every 100 steps.

        Args:
            data: Object whose ``get_batch()`` returns ``(words, targets)``.
            logdir: Directory for checkpoints and summaries.
            total_steps: Step count at which training stops.
        """
        swriter = tf.train.SummaryWriter(logdir)
        try:
            # Recover.  Restoring is skipped when no checkpoint exists yet.
            latest = tf.train.latest_checkpoint(logdir)
            if latest is not None:
                print('restore ', latest)
                self.saver.restore(self.sess, latest)

            steps = self.sess.run(self.global_step)
            while steps < total_steps:
                if steps % 1000 == 0:
                    self.saver.save(
                        self.sess, logdir + '/lm_params', global_step=steps)
                # A fresh batch on every step; fetching it only on checkpoint
                # steps would train on one batch for 1000 steps and leave the
                # batch undefined after resuming from a non-multiple of 1000.
                w, t = data.get_batch()
                feed = {self.words: w, self.targets: t}
                if steps % 100 == 0:
                    # The update runs on logging steps too, so the stored
                    # global_step stays equal to the local counter.
                    loss, summary, _ = self.sess.run(
                        [self.loss, self.summary, self.train_op],
                        feed_dict=feed)
                    swriter.add_summary(summary, steps)
                    swriter.flush()
                    print('step %d: %.3f' % (steps, loss))
                else:
                    self.sess.run(self.train_op, feed_dict=feed)
                steps += 1
        finally:
            swriter.close()
            


### Restore 

In [None]:
    def load(self, checkpoint):
        """Restore all variables from ``checkpoint``; do nothing if it is None.

        Args:
            checkpoint: Path of the checkpoint to restore, or ``None``.
        """
        if checkpoint is not None:
            # Report the path actually being restored.
            print('restore %s' % checkpoint)
            self.saver.restore(self.sess, checkpoint)

### Inference
  - Extend a sentence with a few more characters.
  - E.g., 明月几时有，... ...

In [None]:
    def inference(self, data, prefix, num):
        """Extend ``prefix`` with ``num`` sampled characters.

        The prefix is fed one character at a time to warm up the state; each
        further character is sampled from the predicted distribution and fed
        back in together with the updated state.

        Args:
            data: Object providing ``word_to_id`` and ``id_to_word``.
            prefix: Seed text; every character must be in ``word_to_id``.
            num: Number of characters to generate after the prefix.

        Returns:
            ``prefix`` followed by the generated characters.

        Raises:
            ValueError: If ``prefix`` is empty while ``num`` is positive,
                since there is then no prediction to sample from.
        """
        if not prefix and num > 0:
            raise ValueError(
                'prefix must contain at least one character to seed generation')
        feeds = {}
        zeros = np.zeros([1, self.dims])
        for layer in range(self.depth):
            feeds[self.in_h[layer]] = zeros
            feeds[self.in_c[layer]] = zeros
        output = []
        probs = None
        for pos in range(len(prefix) + num):
            if pos < len(prefix):
                char_id = data.word_to_id[prefix[pos]]
            else:
                # Inverse-CDF sampling.  The index is clamped because the
                # cumulative sum can fall slightly short of the random draw.
                cdf = np.cumsum(probs)
                draw = np.random.rand()
                char_id = min(int(np.searchsorted(cdf, draw, side='right')),
                              len(cdf) - 1)
                output.append(char_id)
            feeds[self.words] = np.array([char_id], dtype=np.int64)
            vals = self.sess.run(self.out_h + self.out_c + [self.preds],
                                 feed_dict=feeds)
            # Feed the state just produced back in for the next character.
            for layer in range(self.depth):
                feeds[self.in_h[layer]] = vals[layer]
                feeds[self.in_c[layer]] = vals[self.depth + layer]
            probs = np.reshape(vals[-1], [-1])
        return prefix + ''.join([data.id_to_word[x] for x in output])

### Instantiate the data and model

In [25]:
# Main driver.
# Data pipeline: batches of 32 windows, 20 characters each.
corpus = './data/poem.txt'
batch, steps = 32, 20
data = TrainData(corpus, batch, steps)

# Model hyper-parameters; `steps` above doubles as the unroll length.
vocab = data.vocab
dims, depth, lr = 256, 4, 0.5
model = Model(dims, vocab, depth, steps, lr)

Number of unique chars:  7957
Number of training chars:  4888154


### We are off to the races!

In [28]:
model.train(data, './', 10)

restore  ./lm_params-0
step 0: 8.982


### Generate sentences
  - Start off with a few words
  - Sample from the probability distribution to get the next word
  - Remember to feed the cell state back into the model

In [29]:
model.load('./lm_params-0')
# Extend each opening line by 18 sampled characters.
for prefix in ('国破山河在，', '慈母手中线，', '一览众山小，', '明月几时有，'):
    print(model.inference(data, prefix, 18))

国破山河在，骘吴軏萎鹥馓窋躁寇宣漝诎娜方襕稷斗薍
慈母手中线，膈桫椰纑篡得帆烟彳纨纴匈莦思殳彧笙駏
一览众山小，榖恙＊拲藭掊髓许滮步牣皿蘅暍蝮姿祼骢
明月几时有，阎鞠綍岁殊番擂从羝弯水缣禬彣蛊搬攘狁


### Result
* Takes time to train

In [31]:
model.load('./data/lm_params-658000')

In [9]:
# 国破山河在，城春草木深。感时花溅泪，恨别鸟惊心。
print(model.inference(data, '国破山河在，', 18))
# 慈母手中线，游子身上衣。临行密密缝，意恐迟迟归。
print(model.inference(data, '慈母手中线，', 18))
# 荡胸生曾云，决眦入归鸟。会当凌绝顶，一览众山小。
print(model.inference(data, '一览众山小，', 18))  
# 明月几时有，把酒问青天。
print(model.inference(data, '明月几时有，', 18))  

国破山河在，门门道路遥。神山犹不见，旧去有人归。
慈母手中线，防我巴乡陪。_归去俱结老，栖栖仍及真
一览众山小，迫之未归忘。中有秋不尽，殷勤深所随。
明月几时有，_楚乡烟霞隔处看。江影漫恓飏记夏，乡


In [17]:
# 水光潋滟晴方好，山色空蒙雨亦奇。欲把西湖比西子，淡妆浓抹总相宜。
print(model.inference(data, '水光潋滟晴方好，', 25))  

水光潋滟晴方好，漠漠银花水片飞。_自是冥深萝树间，渔翁于尔亦依然。


In [53]:
# 前不见古人，后不见来者。念天地之悠悠，独怆然而涕下。
print(model.inference(data, '前不见古人，', 18))  

前不见古人，层鸟迹终久。远去各有花，今来未能遂。


In [12]:
# 故人西辞黄鹤楼，烟花三月下扬州。孤帆远影碧空尽，唯见长江天际流。
print(model.inference(data, '故人西辞黄鹤楼，', 8))  

故人西辞黄鹤楼，上林始觉皆含春。


In [75]:
# 郎骑竹马来，  绕床弄青梅。
print(model.inference(data, '郎骑竹马来，', 18)) 

郎骑竹马来，向日照天庐。片帆同不展，解缆寻应还。


### Exercise
* Have fun with sentence generation!
* Train a bigger model.
* Different data set.
  - wiki: http://mattmahoney.net/dc/text8.zip
* Train faster
  - Change LSTM forward to do one matrix multiplication
  - Try other optimizer. E.g., AdamOptimizer
* Play with the embedding
  - See the word2vec tutorial