In [1]:
import collections
import sys

import numpy as np
import tensorflow as tf

In [2]:
DATA_DIR = 'data'
alice = f'{DATA_DIR}/alice_in_wonderland.txt'

In [3]:
def get_words(file_name):
    with open(file_name) as f:
        all_lines = f.readlines()
    stripped_lines = [x.strip() for x in all_lines]
    words = []
    for line in stripped_lines:
        words.extend(line.split())
    return np.array(words)

In [4]:
def build_dict(words):
    most_common_words = collections.Counter(words).most_common()
    word2id = {word: id_ 
               for (id_, (word, _)) in enumerate(most_common_words)}
    id2word = {id_: word
               for (id_, (word, _)) in enumerate(most_common_words)}
    return most_common_words, word2id, id2word

In [5]:
words = get_words(alice)
words[:10]

array(['\ufeffProject', 'Gutenberg’s', 'Alice’s', 'Adventures', 'in',
       'Wonderland,', 'by', 'Lewis', 'Carroll', 'This'], dtype='<U50')

In [6]:
most_common_words, word2id, id2word = build_dict(words)
most_common_words_len = len(most_common_words)
most_common_words_len

6019

In [7]:
section_len = 20 # number of sequential (one-hot encoded) words to use

In [8]:
def get_input_output(words):
    input_values = []
    output_values = []
    n_sections = 0
    for i in range(len(words) - section_len):
        input_values.append(words[i:i + section_len])
        output_values.append(words[i + section_len])
        n_sections += 1
    one_hot_inputs = np.zeros(
        (n_sections, section_len, most_common_words_len))
    one_hot_outputs = np.zeros((n_sections, most_common_words_len))
    for s, section in enumerate(input_values):
        for w, word in enumerate(section):
            one_hot_inputs[s, w, word2id[word]] = 1.
        one_hot_outputs[s, word2id[output_values[s]]] = 1.
    return one_hot_inputs, one_hot_outputs

In [9]:
X_train, y_train = get_input_output(words)

In [10]:
ETA = 0.001
BATCH = 512
ITERS = 100000
HIDDEN_UNITS = 1024

In [11]:
X = tf.placeholder(tf.float32, 
                   shape=[BATCH, section_len, most_common_words_len])
y = tf.placeholder(tf.float32, shape=[BATCH, most_common_words_len])

In [12]:
weights = tf.Variable(
    tf.truncated_normal([HIDDEN_UNITS, most_common_words_len]))
biases = tf.Variable(tf.truncated_normal([most_common_words_len]))

In [13]:
gru_cell = tf.contrib.rnn.GRUCell(num_units=HIDDEN_UNITS)

In [14]:
outputs, state = tf.nn.dynamic_rnn(gru_cell, inputs=X, dtype=tf.float32)
outputs = tf.transpose(outputs, perm=[1, 0, 2])
last_output = tf.gather(outputs, int(outputs.get_shape()[0]) - 1)

In [15]:
prediction = tf.matmul(last_output, weights) + biases

loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, 
                                                  logits=prediction)
total_loss = tf.reduce_mean(loss)

In [16]:
optimizer = tf.train\
    .AdamOptimizer(learning_rate=ETA)\
    .minimize(loss=total_loss)

In [17]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    iter_offset = 0
    saver = tf.train.Saver()
    for i in range(ITERS):
        len_X = len(X_train)
        if len_X != 0:
            iter_offset %= len_X
        if iter_offset <= len_X - BATCH:
            X_train_batch = X_train[iter_offset:iter_offset + BATCH]
            y_train_batch = y_train[iter_offset:iter_offset + BATCH]
            iter_offset += BATCH
        else:
            add_from_the_beginning = BATCH - (len_X - iter_offset)
            X_train_batch = np.concatenate((X_train[iter_offset:len_X],
                                            X[0:add_from_the_beginning]))
            y_train_batch = np.concatentate((y_train[iter_offset:len_X],
                                             y[0:add_from_the_beginning]))
            iter_offset = add_from_the_beginning
        _, training_loss = sess.run([optimizer, total_loss], 
                                    feed_dict={X: X_train_batch, 
                                               y: y_train_batch})
        if i % 100 == 0:
            print(f'{i}: Loss: {training_loss}')
            saver.save(sess, 'ckpt/model', global_step=i)

0: Loss: 8.957990646362305


ValueError: all the input arrays must have same number of dimensions

In [None]:
starting_sentence = 'The rabbit hole had become a very dark place '

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    mod = tf.train.latest_checkpoint('ckpt')
    saver = tf.train.Saver()
    saver.restore(sess, mod)
    
    generated_text = starting _sentence
    words_in_starting_sentence = starting sentence.split()
    X_test = np.zeros((1, section_len, most_common_words_len))
    
    for idx, word in enumerate(words_in_starting_sentence[:-1]):
        if index < section_len:
            test_X[0, idx, word2id[word]] = 1