In [1]:
import os
import re
import time
import random
import collections
import itertools

import wikipedia
import numpy as np

from six.moves import cPickle

import tensorflow as tf
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import seq2seq



In [2]:
ny = wikipedia.page("New York")

In [3]:
def annotate_eos_n_number(string):
    string = re.sub(r'([a-zA-Z]{2,}|[0-9]{1,})\.([ \n])', r'\g<1> <eos> ', string)
    string = re.sub(r'\b[0-9\.]+\b', '<number>', string)
    string = re.sub(r'([^ ]{1,})<number>([^ ]{1,})', '\g<1> <number> \g<2>', string)
    return string.strip().lower()

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data
    """
    string = re.sub(r"[^가-힣A-Za-z0-9(),!?\'\`<>]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r' {2,}', ' ', string)
    return string.strip().lower()

In [4]:
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = collections.Counter(sentences)
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    vocabulary_inv = list(sorted(vocabulary_inv))
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


In [5]:
def create_batches(inputs, batch_size, seq_length):
    num_batches = int(inputs.size / (batch_size * seq_length))
    if num_batches==0:
        assert False, "Not enough data. Make seq_length and batch_size small."

    inputs = inputs[:num_batches * batch_size * seq_length]
    xdata = inputs
    ydata = np.copy(inputs)

    ydata[:-1] = xdata[1:]
    ydata[-1] = xdata[0]
    x_batches = np.split(xdata.reshape(batch_size, -1), num_batches, 1)
    y_batches = np.split(ydata.reshape(batch_size, -1), num_batches, 1)
    return num_batches, x_batches, y_batches

In [6]:
batch_size = 50
seq_length = 25
rnn_size = 256
num_layers = 2
grad_clip = 5.
learning_rate = 0.002
decay_rate = 0.97
num_epochs = 2

In [7]:
text = ny.content
text = annotate_eos_n_number(text)
text = clean_str(text)
words = text.split(' ')
vocab, vocab_inv = build_vocab(words)
vocab_size = len(vocab_inv)
raw_inputs = np.array(list(map(vocab.get, words)))
#labels = np.array(list(map(lambda x: 1 if x == '<eos>' else 0, words)))
num_batches, x_batches, y_batches = create_batches(raw_inputs, batch_size, seq_length)

In [8]:
# define LSTM cells
cell = rnn_cell.BasicLSTMCell(rnn_size)
cell = rnn_cell.MultiRNNCell([cell] * num_layers)

In [9]:
# setup variables
input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
targets = tf.placeholder(tf.int32, [batch_size, seq_length])
initial_state = cell.zero_state(batch_size, tf.float32)

In [10]:
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
        inputs = tf.split(1, seq_length, tf.nn.embedding_lookup(embedding, input_data))
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

In [11]:
def loop(prev, _):
    prev = tf.matmul(prev, softmax_w) + softmax_b
    prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
    return tf.nn.embedding_lookup(embedding, prev_symbol)


In [12]:
outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell, 
                                          loop_function=loop, scope='rnnlm')

In [35]:
output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
logits = tf.matmul(output, softmax_w) + softmax_b
probs = tf.nn.softmax(logits)
loss = seq2seq.sequence_loss_by_example([logits],
        [tf.reshape(targets, [-1])],
        [tf.ones([batch_size * seq_length])],
        vocab_size)

cost = tf.reduce_sum(loss) / batch_size / seq_length
final_state = last_state

lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
optimizer = tf.train.AdamOptimizer(lr)
train_op = optimizer.apply_gradients(zip(grads, tvars))


In [14]:
session = tf.InteractiveSession()
tf.initialize_all_variables().run()

In [15]:
session.run(tf.assign(lr, learning_rate * (decay_rate ** 0)))
state = session.run(initial_state)
x = x_batches[0]
y = y_batches[0]
feed = {input_data: x, targets: y, initial_state: state}


In [49]:
session.close()

In [38]:
save_dir = './wikiModel'
with tf.Session() as sess:

    tf.initialize_all_variables().run()
    saver = tf.train.Saver(tf.all_variables())
    
    for e in range(num_epochs):
        sess.run(tf.assign(lr, learning_rate * (decay_rate ** e)))
        state = sess.run(initial_state)
        for b in range(num_batches):
            start = time.time()
            x = x_batches[b]
            y = y_batches[b]
            feed = {input_data: x, targets: y, initial_state: state}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)
            end = time.time()
            print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(e * num_batches + b,
                        num_epochs * num_batches,
                        e, train_loss, end - start))
            
    checkpoint_path = os.path.join(save_dir, 'wiki_model.ckpt')
    saver.save(sess, checkpoint_path, global_step = e * num_batches)
    print("model saved to {}".format(checkpoint_path))

0/18 (epoch 0), train_loss = 8.380, time/batch = 0.873
1/18 (epoch 0), train_loss = 8.330, time/batch = 0.478
2/18 (epoch 0), train_loss = 8.136, time/batch = 0.496
3/18 (epoch 0), train_loss = 7.477, time/batch = 0.534
4/18 (epoch 0), train_loss = 7.066, time/batch = 0.487
5/18 (epoch 0), train_loss = 6.623, time/batch = 0.491
6/18 (epoch 0), train_loss = 6.540, time/batch = 0.482
7/18 (epoch 0), train_loss = 6.194, time/batch = 0.476
8/18 (epoch 0), train_loss = 6.211, time/batch = 0.490
9/18 (epoch 1), train_loss = 6.334, time/batch = 0.503
10/18 (epoch 1), train_loss = 6.063, time/batch = 0.475
11/18 (epoch 1), train_loss = 5.893, time/batch = 0.474
12/18 (epoch 1), train_loss = 5.896, time/batch = 0.488
13/18 (epoch 1), train_loss = 5.866, time/batch = 0.546
14/18 (epoch 1), train_loss = 5.852, time/batch = 0.489
15/18 (epoch 1), train_loss = 5.826, time/batch = 0.610
16/18 (epoch 1), train_loss = 5.686, time/batch = 0.527
17/18 (epoch 1), train_loss = 5.880, time/batch = 0.481
mo

In [68]:
sampling_type = 1
def predict(sess, words, vocab, prime = "first all"):
    state = sess.run(cell.zero_state(1, tf.float32))
    if not len(prime) or prime == " ":
        prime  = random.choice(list(vocab.keys()))    
    print (prime)
    for word in prime.split()[:-1]:
        print (word)
        x = np.zeros((1, 1))
        x[0, 0] = vocab.get(word,0)
        feed = {input_data: x, initial_state:state}
        [state] = sess.run([final_state], feed)
    
    def weighted_pick(weights):
        t = np.cumsum(weights)
        s = np.sum(weights)
        return(int(np.searchsorted(t, np.random.rand(1)*s)))

    ret = prime
    word = prime.split()[-1]
    for n in range(10):
        x = np.zeros((1, 1))
        x[0, 0] = vocab.get(word,0)
        feed = {input_data: x, initial_state:state}
        [prob, state] = sess.run([probs, final_state], feed)
        p = prob[0]

        if sampling_type == 0:
            sample = np.argmax(p)
        elif sampling_type == 2:
            if word == '\n':
                sample = weighted_pick(p)
            else:
                sample = np.argmax(p)
        else: # sampling_type == 1 default:
            sample = weighted_pick(p)

        pred = words[sample]
        ret += ' ' + pred
        word = pred
    return ret

In [70]:
 with tf.Session() as sess:
    tf.initialize_all_variables().run()
    saver = tf.train.Saver(tf.all_variables())
    ckpt = tf.train.get_checkpoint_state(save_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        predict(sess, words, vocab)
        
            
            
    

first all
first


ValueError: Cannot feed value of shape (1, 1) for Tensor u'Placeholder:0', which has shape '(50, 25)'

In [64]:
x = np.zeros((1, 1))
x[0, 0] = vocab.get("sunday",0)
feed = {input_data: x, initial_state:state}

In [67]:
x.shape

(1, 1)