# NGNL Text Generator
This is a LSTM RNN which will be trained on 5 volums of 'No Game No Life' light novel and will generate new text in that style.

In [160]:
# Import Libraries
import numpy as np
import tensorflow as tf
import sys
import datetime
from collections import Counter

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load Text

In [161]:
textdir = 'data/NGNL123.txt'
with open(textdir, 'r', encoding="utf-8") as f:
    text = f.read()[:200000]
    
words = text.split(" ")
vocab = list(set(words))
print("Characters:", len(text))
print("Words:", len(words))
print("Vocab:", len(vocab))

Characters: 200000
Words: 29791
Vocab: 8947


## Text Analysis and Preprocessing

In [162]:
print("Print first 1000 characters")
text[:1000]

Print first 1000 characters


"\ufeff\nPrologue\nPart 1\n--[Urban Legends].\nThat was one of the [Desires] of which the amount almost numbered the\nexpanse of stars in the sky.\n--For example the Urban Legend that 'No Human has really ever stepped\nfoot on the moon'.\n--For example the conspiracy of the Freemasons in the American dollar bills.\n--For example the time-space experiment that took place in Philadelphia.\nThe Chiyoda Line’s nuclear shelter, Area 51, the Roswell UFO incident, etc.\nAfter seeing this many examples, it can be concluded that there is always\nsome kind of distinct pattern between them. That pattern is a...'It would be\ninteresting if it was real' kind of [Desire].\nSmoke will not rise without fire as its source. However as it spreads and\nbuilds up, that [Desire] becomes a [Rumor].\nThis kind of thinking was not strange to the world. Since ancient times,\nHumans have always preferred the [Inevitable] to an [Accident], after all. The\nbirth of mankind was likely the accidental product of astr

In [163]:
def preprocessing(text):
    text = text.lower()
    
    # Replace punctuation with tokens so we can use them in our model
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' QUOTATION_MARK ')
    text = text.replace("'", ' SINGLE_QUOTATION_MARK ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('[', ' <LEFT_BRACKET> ')
    text = text.replace(']', ' <RIGHT_BRACKET> ')
    text = text.replace('「', ' <LEFT_HALF_BRACKET> ')
    text = text.replace('」', ' <RIGHT_HALF_BRACKET> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('-', ' <DASH> ')
    text = text.replace('—', ' <STRAIGHT_LINE> ')
    text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = [word for word in text.split(" ") if word != '']
    
    
    # Remove all words with 2 or fewer occurences
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] >= 2]
    
    return trimmed_words

In [164]:
# Preprocessed words
words = preprocessing(text)
vocab = list(set(words))
vocab_size = len(vocab)

print("Words:", len(words), "Vocab:", vocab_size)

Words: 51194 Vocab: 2218


In [165]:
vocab2int = dict((c, i) for i, c in enumerate(vocab))
int2vocab = dict((i, c) for i, c in enumerate(vocab))

### Prepare inputs and labels

In [166]:
def get_batches(text, batch_size, len_per_section, skip):
    batch_size = batch_size*skip
    n_batches = (len(text) - len_per_section)// batch_size
    
    for idx in range(0, n_batches * batch_size, batch_size):
        x = []
        y = []
        for i in range(idx, idx+batch_size, skip):
            section = [vocab2int[word] for word in text[i:i+len_per_section]]
            label = [vocab2int[text[i+len_per_section]]]
            
            # One hot encode input
            one_hot_input = np.zeros((len_per_section, vocab_size))
            one_hot_label = np.zeros(vocab_size)
            
            for j, word in enumerate(section):
                one_hot_input[j][word] = 1
                
            one_hot_label[label] = 1
                
            x.append(one_hot_input)
            y.append(one_hot_label)

        yield np.array(x), np.array(y)


In [167]:
# Test generator (not important)

gen = get_batches(words[:100], 1, 6, 1)
prev_label = [0]
for x,y in gen:
    shit = []
    
    for j, i in enumerate(x):
        shit.append([int2vocab[np.argmax(num)] for num in i])
        
    out = [num for num in shit]
    if prev_label[0] != out[:][-1]:
        print("SHIT")
        
    else:
        print("Works")
    
    print(prev_label, out)
    prev_label = [int2vocab[np.argmax(label)] for label in y]
    
    
        
    #print("INPUT",[num for num in shit], "\n", "Label", [int2vocab[np.argmax(label)] for label in y], "\n")
    

SHIT
[0] [['<NEW_LINE>', '<NEW_LINE>', 'part', '1', '<NEW_LINE>', '<HYPHENS>']]
SHIT
['<LEFT_BRACKET>'] [['<NEW_LINE>', 'part', '1', '<NEW_LINE>', '<HYPHENS>', '<LEFT_BRACKET>']]
SHIT
['urban'] [['part', '1', '<NEW_LINE>', '<HYPHENS>', '<LEFT_BRACKET>', 'urban']]
SHIT
['legends'] [['1', '<NEW_LINE>', '<HYPHENS>', '<LEFT_BRACKET>', 'urban', 'legends']]
SHIT
['<RIGHT_BRACKET>'] [['<NEW_LINE>', '<HYPHENS>', '<LEFT_BRACKET>', 'urban', 'legends', '<RIGHT_BRACKET>']]
SHIT
['<PERIOD>'] [['<HYPHENS>', '<LEFT_BRACKET>', 'urban', 'legends', '<RIGHT_BRACKET>', '<PERIOD>']]
SHIT
['<NEW_LINE>'] [['<LEFT_BRACKET>', 'urban', 'legends', '<RIGHT_BRACKET>', '<PERIOD>', '<NEW_LINE>']]
SHIT
['that'] [['urban', 'legends', '<RIGHT_BRACKET>', '<PERIOD>', '<NEW_LINE>', 'that']]
SHIT
['was'] [['legends', '<RIGHT_BRACKET>', '<PERIOD>', '<NEW_LINE>', 'that', 'was']]
SHIT
['one'] [['<RIGHT_BRACKET>', '<PERIOD>', '<NEW_LINE>', 'that', 'was', 'one']]
SHIT
['of'] [['<PERIOD>', '<NEW_LINE>', 'that', 'was', 'one', 'of

## Build a Graph

In [168]:
# Hyperparameters
epochs = 100
hidden_nodes = 512
batch_size = 256
learning_rate = 0.1
len_per_section = 15
skip = 1
start_quote = "I thought that we could"

log_every = 5
test_every = 10

In [169]:
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    
    global_step = tf.Variable(0)
    
    inputs = tf.placeholder(tf.float32, [batch_size, len_per_section, vocab_size], name='input')
    labels = tf.placeholder(tf.float32, [batch_size, vocab_size], name='labels')
    
    ## Initialize weights 
    # Forget Gate weights
    w_fi = tf.Variable(tf.truncated_normal([vocab_size, hidden_nodes], -0.1, 0.1))
    w_fo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_f = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    # Input Gate weights
    w_ii = tf.Variable(tf.truncated_normal([vocab_size, hidden_nodes], -0.1, 0.1))
    w_io = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_i = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    # Memory cell weights
    w_ci = tf.Variable(tf.truncated_normal([vocab_size, hidden_nodes], -0.1, 0.1))
    w_co = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_c = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    # Output Gate weights
    w_oi = tf.Variable(tf.truncated_normal([vocab_size, hidden_nodes], -0.1, 0.1))
    w_oo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_o = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    # LSTM Cell
    def lstm(i, o, state):
        forget_gate = tf.sigmoid(tf.matmul(i, w_fi) + tf.matmul(o, w_fo) + b_f)
        input_gate = tf.sigmoid(tf.matmul(i, w_ii) + tf.matmul(o, w_io) + b_i)
        memory_gate = tf.tanh(tf.matmul(i, w_ci) + tf.matmul(o, w_co) + b_c)
        output_gate = tf.sigmoid(tf.matmul(i, w_oi) + tf.matmul(o, w_oo) + b_o)
        
        state = forget_gate * state + input_gate * memory_gate
        output = output_gate * tf.tanh(state)
        
        return output, state
    
    
    # LSTM
    output = tf.zeros([batch_size, hidden_nodes])
    state = tf.zeros([batch_size, hidden_nodes])
    
    
    for i in range(len_per_section):
        output, state = lstm(inputs[:, i, :], output, state)
        
        if (i==0):
            output_all_i = output
            labels_all_i = inputs[:, i+1, :]
            
        elif i == len_per_section - 1:
            output_all_i = tf.concat([output_all_i, output], 0)
            labels_all_i = tf.concat([labels_all_i, labels], 0)
            
        else:
            output_all_i = tf.concat([output_all_i, output], 0)
            labels_all_i = tf.concat([labels_all_i, inputs[:, i+1,:]], 0)
            

    #Classifier
    w = tf.Variable(tf.truncated_normal([hidden_nodes, vocab_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocab_size]))
    logits = tf.matmul(output_all_i, w) + b
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels_all_i))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step = global_step)
    
    #Test
    test_input = tf.placeholder(tf.float32, [1, vocab_size])
    test_output = tf.Variable(tf.zeros([1, hidden_nodes]))
    test_state = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    reset_test_state = tf.group(test_output.assign(tf.zeros([1, hidden_nodes])),
                               test_state.assign(tf.zeros([1, hidden_nodes])))
    
    test_output, test_state = lstm(test_input, test_output, test_state)
    test_prediction = tf.nn.softmax(tf.matmul(test_output, w) + b)

## Train Model

In [170]:
# Train
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    len_batches = len(list(get_batches(words, batch_size, len_per_section, skip)))
    
    for epoch_i in range(1, epochs+1):
        batches = get_batches(words, batch_size, len_per_section, skip)
        for batch_i, (x, y) in enumerate(batches):
            train_loss, _ = sess.run([loss, optimizer], feed_dict={inputs: x, labels: y})
            
            if (batch_i*epoch_i) % log_every == 0:
                print("Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}   time: {}".format(epoch_i, 
                                                                                           batch_i, 
                                                                                           len_batches, 
                                                                                           train_loss,
                                                                                          datetime.datetime.now()))
                
                
            if (batch_i * epoch_i % test_every == 0):
                reset_test_state.run()
                generated_text = start_quote
                quote_words = start_quote.lower().split(" ")
                
                
                for i in range(len(quote_words) - 1):
                    testX = np.zeros((1, vocab_size))
                    testX[0, vocab2int[quote_words[i]]] = 1
                    _ = sess.run(test_prediction, feed_dict={test_input: testX})
                    
                testX = np.zeros((1, vocab_size))
                testX[0, vocab2int[quote_words[-1]]] = 1
                
                for i in range(80):
                    prediction = test_prediction.eval({test_input: testX})
                    next_word = int2vocab[np.argmax(prediction)]
                    generated_text += " " + next_word
                    testX = np.zeros((1, vocab_size))
                    testX[0, np.argmax(prediction)] = 1
                    
                print('=' * 80)
                print(generated_text)
                print('=' * 80)
                
                
    # Save Model
    #saver = tf.train.Saver()
    #saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   1 Batch    0/199   train_loss = 7.802   time: 2017-12-30 07:06:16.193469
I thought that we could itself statement pockets surprised eastern light filled read wars deus itself statement pockets surprised eastern light filled read wars deus itself statement pockets surprised eastern light filled read wars deus itself statement pockets surprised eastern light filled read wars deus itself statement pockets surprised eastern light filled read wars deus itself statement pockets surprised eastern light filled read wars deus itself statement pockets surprised eastern light filled read wars deus itself statement pockets surprised eastern light filled read wars deus
Epoch   1 Batch    5/199   train_loss = 7.743   time: 2017-12-30 07:06:42.889344
Epoch   1 Batch   10/199   train_loss = 7.616   time: 2017-12-30 07:07:11.511975
I thought that we could itself statement pockets surprised eastern light filled read charisma without without without without without without without without without

KeyboardInterrupt: 

## Test Model