In [1]:
#Emmanuel Osei Kuffuor
#CIS 5898
import numpy as np
import tensorflow as tf

In [2]:
import glob

# Collect every corpus file matching the pattern, in a stable (sorted) order.
corpus_pattern = "poetry.txt"
data_set = glob.glob(corpus_pattern)
data_set.sort()

# Report how many corpus files were found.
text_count = len(data_set)
print("Retrieved {} text".format(text_count))

Retrieved 1 text


In [3]:
import codecs

# Read each corpus file as UTF-8 and glue the pieces together in file order.
corpus_parts = []
for corpus_path in data_set:
    with codecs.open(corpus_path, 'r', 'utf-8') as corpus_file:
        corpus_parts.append(corpus_file.read())
raw_corpus = u"".join(corpus_parts)

corpus_length = len(raw_corpus)
print("Corpus is {} characters long".format(corpus_length))

Corpus is 213594 characters long


In [4]:
def lookup_tables(text):
    """
    Build the word <-> integer id lookup tables for a tokenized corpus.

    Ids are assigned by enumerating the sorted vocabulary, so the same corpus
    always produces the same tables. Enumerating the bare set is not enough:
    the iteration order of a set of strings can differ between interpreter
    runs, which would silently remap every id.

    :param text: Iterable of hashable tokens (e.g. the corpus split into words)
    :return: Tuple (glossary_to_int, int_to_glossary)
    """
    glossary = set(text)
    try:
        ordered_glossary = sorted(glossary)
    except TypeError:
        # Tokens that cannot be compared with each other: keep the set's own order.
        ordered_glossary = list(glossary)
    int_to_glossary = dict(enumerate(ordered_glossary))
    glossary_to_int = {word: key for key, word in int_to_glossary.items()}
    return glossary_to_int, int_to_glossary

In [5]:
def token_lookup():
    """
    Build the table that turns punctuation into standalone word tokens.

    :return: dict whose keys are punctuation strings and whose values are the
             '||name||' placeholder tokens that stand in for them
    """
    # (punctuation, placeholder) pairs, listed in the order they are inserted.
    punctuation_tokens = (
        ('.', '||period||'),
        (',', '||comma||'),
        ('"', '||quotes||'),
        (';', '||semicolon||'),
        ('!', '||exclamation-mark||'),
        ('?', '||question-mark||'),
        ('(', '||left-parentheses||'),
        (')', '||right-parentheses||'),
        ('--', '||emm-dash||'),
        ('\n', '||return||'),
    )
    return dict(punctuation_tokens)

In [6]:
import pickle

token_dict = token_lookup()

# Tokenize into separate names instead of rebinding raw_corpus: the original text
# stays a string, so this cell can be re-run without reloading the corpus.
corpus_text = raw_corpus
for token, replacement in token_dict.items():
    # Surround each placeholder with spaces so split() isolates it as its own word.
    corpus_text = corpus_text.replace(token, ' {} '.format(replacement))
corpus_words = corpus_text.lower().split()

# Encode every word as its integer id.
glossary_to_int, int_to_glossary = lookup_tables(corpus_words)
corpus_int = [glossary_to_int[word] for word in corpus_words]

# Persist the encoded corpus and lookup tables; the context manager closes the file.
with open('preprocess.p', 'wb') as preprocess_file:
    pickle.dump((corpus_int, glossary_to_int, int_to_glossary, token_dict), preprocess_file)

In [7]:
def gather_batches(int_text, n_batch, n_seq):
    """
    Split an encoded corpus into (input, target) training batches.

    The corpus is truncated to a whole number of batches. Targets are the
    inputs shifted one position to the left, with the very last target
    wrapping around to the first input word.

    :param int_text: Sequence (list or 1-D array) of integer word ids
    :param n_batch: Number of sequences per batch
    :param n_seq: Number of words per sequence
    :return: Array of shape (batch_num, 2, n_batch, n_seq); element [i, 0] is
             the input batch i and element [i, 1] is its target batch
    :raises ValueError: If n_batch * n_seq is not positive, or the corpus is
                        too short to fill a single batch
    """
    words_per_batch = n_batch * n_seq
    if words_per_batch <= 0:
        raise ValueError('n_batch and n_seq must both be positive')

    batch_num = len(int_text) // words_per_batch
    if batch_num == 0:
        raise ValueError(
            'corpus has {} words but one batch needs {} (n_batch * n_seq)'.format(
                len(int_text), words_per_batch))

    x = np.asarray(int_text[:batch_num * words_per_batch])
    # np.roll shifts left by one and wraps the first word to the end; unlike list
    # concatenation it also behaves correctly when int_text is already an ndarray.
    y = np.roll(x, -1)

    # One row per sequence slot, then cut the rows into batch_num column blocks.
    x_batches = np.split(x.reshape(n_batch, -1), batch_num, axis=1)
    y_batches = np.split(y.reshape(n_batch, -1), batch_num, axis=1)

    return np.array(list(zip(x_batches, y_batches)))

In [11]:
# --- Training schedule ---
n_epoch = 3000            # number of passes over the batched corpus
n_batch = 250             # sequences per batch
n_seq = 30                # words per sequence
learning_rate = 0.001     # value fed to the graph's learning-rate placeholder

# --- Network size ---
embed_dim = 512           # word-embedding width
n_rnn = 512               # units in each LSTM cell
n_layer = 3               # number of stacked LSTM layers
keep_prob = 0.7           # keep probability applied to each cell's output

# --- Checkpointing ---
save_dir = './save'       # checkpoint path prefix

In [12]:
# Build the word-level language model: embedding -> stacked LSTM -> per-word logits.
train_graph = tf.Graph()
with train_graph.as_default():

    # Initialize input placeholders
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')

    # Calculate text attributes
    glossary_size = len(int_to_glossary)
    input_text_shape = tf.shape(input_text)

    # Build the RNN cell. Each layer gets its own LSTM + dropout wrapper: repeating a
    # single cell object ([cell] * n_layer) either fails or ties the layers' weights
    # together, depending on the TensorFlow 1.x release.
    cell = tf.contrib.rnn.MultiRNNCell([
        tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(num_units=n_rnn),
            output_keep_prob=keep_prob)
        for _ in range(n_layer)
    ])

    # Set the initial state (named so it can be fetched from a restored graph)
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')

    # Create word embedding as input to RNN
    embed = tf.contrib.layers.embed_sequence(input_text, glossary_size, embed_dim)

    # Build RNN
    # NOTE(review): dynamic_rnn is not given `initial_state`, so values fed to the
    # 'initial_state' tensor never reach the RNN. Wiring it in would change both
    # training and generation behaviour -- confirm the intent before changing it.
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')

    # Take RNN output and make logits
    logits = tf.contrib.layers.fully_connected(outputs, glossary_size, activation_fn=None)

    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')

    # Define loss function (all-ones weights: every position counts equally)
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )

    # Learning rate optimizer. Uses the `lr` placeholder so the value fed at
    # training time is the one actually applied.
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

In [10]:
import time

# Persist the values the generation step needs; the context manager closes the file.
with open('params.p', 'wb') as params_file:
    pickle.dump((n_seq, save_dir), params_file)
batches = gather_batches(corpus_int, n_batch, n_seq)
batch_num = len(batches)
start_time = time.time()

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    # Build the Saver once: constructing it inside the loop would add a fresh set
    # of save/restore ops to the graph on every checkpoint.
    saver = tf.train.Saver()

    for epoch in range(n_epoch):
        # Fetch a zero state sized for the batch to start each epoch from.
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)

        # Report once per epoch, using the loss of the epoch's last batch.
        time_elapsed = time.time() - start_time
        print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}   time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
            epoch + 1,
            batch_index + 1,
            len(batches),
            train_loss,
            time_elapsed,
            ((batch_num * n_epoch)/((epoch + 1) * (batch_index + 1))) * time_elapsed - time_elapsed))

        # Save the model every 10 epochs, and after the final epoch so the last
        # stretch of training is not lost.
        if epoch % 10 == 0 or epoch == n_epoch - 1:
            saver.save(sess, save_dir)
            print('Model Trained and Saved')

Epoch   1 Batch    6/6   train_loss = 8.105   time_elapsed = 39.457   time_remaining = 118331
Model Trained and Saved
Epoch   2 Batch    6/6   train_loss = 6.552   time_elapsed = 42.231   time_remaining = 63304
Epoch   3 Batch    6/6   train_loss = 6.442   time_elapsed = 44.361   time_remaining = 44317
Epoch   4 Batch    6/6   train_loss = 6.352   time_elapsed = 46.584   time_remaining = 34891
Epoch   5 Batch    6/6   train_loss = 6.286   time_elapsed = 48.709   time_remaining = 29176
Epoch   6 Batch    6/6   train_loss = 6.264   time_elapsed = 50.850   time_remaining = 25374
Epoch   7 Batch    6/6   train_loss = 6.241   time_elapsed = 52.997   time_remaining = 22660
Epoch   8 Batch    6/6   train_loss = 6.223   time_elapsed = 55.137   time_remaining = 20621
Epoch   9 Batch    6/6   train_loss = 6.219   time_elapsed = 57.280   time_remaining = 19036
Epoch  10 Batch    6/6   train_loss = 6.207   time_elapsed = 59.423   time_remaining = 17767
Epoch  11 Batch    6/6   train_loss = 6.205  

In [13]:
def select_word(probabilities, int_to_glossary):
    """
    Pick the next word with some randomness
    :param probabilities: 1-D sequence where entry i is the probability of word id i
    :param int_to_glossary: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    # Sample a word id and look it up by key. Sampling from list(dict.values())
    # would only be right if the dict happened to iterate in ascending id order.
    word_id = np.random.choice(len(probabilities), p=probabilities)
    return int_to_glossary[int(word_id)]

In [14]:
num_words = 500
key_word = 'beauty'

# Sentences generation setup. The vocabulary is lower-cased during preprocessing,
# so normalize the seed the same way and reject unknown words before loading the model.
gen_sentences = key_word.lower().split()
unknown_words = [word for word in gen_sentences if word not in glossary_to_int]
if not gen_sentences or unknown_words:
    raise ValueError(
        'key_word must contain at least one word, all from the corpus vocabulary; '
        'unknown words: {}'.format(unknown_words))

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(save_dir + '.meta')
    loader.restore(sess, save_dir)

    # Get tensors from loaded graph. They get their own names so the training
    # graph's input_text / initial_state / final_state / probs are not clobbered.
    gen_input = loaded_graph.get_tensor_by_name('input:0')
    gen_initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    gen_final_state = loaded_graph.get_tensor_by_name('final_state:0')
    gen_probs = loaded_graph.get_tensor_by_name('probs:0')

    # Dummy single-row input: only its shape matters for fetching the starting state.
    prev_state = sess.run(gen_initial_state, {gen_input: np.array([[1 for word in gen_sentences]])})

    # Generate sentences
    for n in range(num_words):
        # Dynamic Input: the most recent n_seq words, encoded as ids
        dyn_input = [[glossary_to_int[word] for word in gen_sentences[-n_seq:]]]
        dyn_n_seq = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [gen_probs, gen_final_state],
            {gen_input: dyn_input, gen_initial_state: prev_state})

        # Sample from the distribution predicted at the last input position.
        pred_word = select_word(probabilities[0, dyn_n_seq - 1, :], int_to_glossary)

        gen_sentences.append(pred_word)

    # Remove tokens: turn the placeholders back into punctuation
    lit_text = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        lit_text = lit_text.replace(' ' + token.lower(), key)

    print(lit_text)

INFO:tensorflow:Restoring parameters from ./save
beauty, with your jade self-will which,
 they know but this half you be:
 now he she can be dead and all
 most heard it of thee, i bear my receive.
 his rose, no own desire would gazed before mine?
 thus still, were i do i dwell,
 shall never be lent you for his proud of his heart,
 'this private sighs to change quill:
 dost the banks my weak, now, not cares,
 thou pray be even
 showing me convert;
 i hast thou kept who than thy cruel heart.
 and these that mine eyes is small cheeks no eternal power
 shall any fear as tarquin could know;
 when whose high dies aloft his passion
 gave twenty posterity is strong,
 sometimes her victories of remission troth.
 upon this poor day was turns gone
 so words and those labour shall make things woo her.
 post far being hear and from must calls are rest, do even as my sit or fair desire,
 like many men but put most thine well new,
 how you is would take my cause of blind.
 else do i be together,-- my

In [17]:
import os
# Save the generated poem as the next numbered file in the output directory.
version_dir = './generated-poems'
if not os.path.exists(version_dir):
    os.makedirs(version_dir)

num_poems = len([name for name in os.listdir(version_dir) if os.path.isfile(os.path.join(version_dir, name))])

# Start at count + 1, but skip any name already taken: after a poem has been
# deleted the plain count would point at an existing file and overwrite it.
poem_index = num_poems + 1
next_poem = version_dir + '/poem-' + str(poem_index) + '.md'
while os.path.exists(next_poem):
    poem_index += 1
    next_poem = version_dir + '/poem-' + str(poem_index) + '.md'

# Write as UTF-8, matching how the corpus was read, so non-ASCII characters
# do not depend on the platform's default encoding.
with codecs.open(next_poem, 'w', 'utf-8') as text_file:
    text_file.write(lit_text)