In [1]:
import csv
import itertools
import operator
import numpy as np
import tensorflow as tf
import nltk
import sys
import sys
import os
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

# Number CUDA devices by PCI bus id and expose only device 0 to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Construct the LSTM

Forward equations

$$\begin{aligned}  i &=\sigma(U^ix_t +  W^is_{t-1} + b^i) \\  f &=\sigma(U^fx_t  + W^fs_{t-1} + b^f) \\  o &=\sigma( U^o x_t +  W^o s_{t-1} + b^o) \\  g &=\tanh( U^g x_t + W^g s_{t-1} + b^g) \\  c_t &= c_{t-1} \circ f + g \circ i \\  s_t &=\tanh(c_t) \circ o  \end{aligned}  $$

$$\hat{y}_t = softmax(Vs_t + d)$$

$$E_t(y_t,\hat{y}_t) = -y_t^T \log (\hat{y}_t)$$

Backward equations: we use automatic differentiation to avoid the error-prone process of deriving the backpropagation formulas by hand.

In [56]:
class LSTM_TF(object):
    """Single-layer LSTM sequence model built on its own TensorFlow 1.x graph.

    The model consumes a sequence of integer ids one element at a time
    (via ``tf.scan``) and emits one vector of ``out_dim`` logits per element.
    Weights are trained with a hand-written RMSProp update (decay 0.9).

    The cell state ``c`` and hidden state ``s`` are stored in variables, so
    they persist between calls until ``reset_state()`` is called.

    Parameters
    ----------
    inp_dim : int
        Number of distinct input ids; ``U`` is indexed by id along its last axis.
    out_dim : int
        Number of output classes (rows of ``V``).
    state_dim : int
        Size of the cell / hidden state vectors.
    bptt_truncate : int, optional
        Stored on the instance only; the graph built here does not use it.

    Attributes set in ``__init__`` (closures over the graph)
    --------------------------------------------------------
    backpropagate_through_time(x, y, learning_rate) -> mean cross-entropy
    predict(x) -> softmax tensor (needs a default session to ``.eval()``)
    predict_proba(x) -> softmax probabilities as a numpy array
    reset_state() -> None
    """

    def __init__(self, inp_dim, out_dim, state_dim, bptt_truncate=4):
        # Kept for interface compatibility; nothing below reads it.
        self.bptt_truncate = bptt_truncate

        # Construct the computation graph
        self.graph = tf.Graph()

        with self.graph.as_default():
            # ---- Placeholders and weights -------------------------------
            # Input ids and target ids, one per time step
            x_words = tf.placeholder(tf.int32, [None])
            y_words = tf.placeholder(tf.int32, [None])

            # Persistent cell state and hidden state (column vectors)
            self.c = tf.Variable(tf.zeros(shape=(state_dim, 1)))
            self.s = tf.Variable(tf.zeros(shape=(state_dim, 1)))

            # Axis 0 of U, W and b selects the gate: 0=i, 1=f, 2=o, 3=g
            inp_bound = np.sqrt(1. / inp_dim)
            state_bound = np.sqrt(1. / state_dim)
            self.U = tf.Variable(tf.random_uniform(shape=(4, state_dim, inp_dim),
                                                   minval=-inp_bound,
                                                   maxval=inp_bound))
            self.W = tf.Variable(tf.random_uniform(shape=(4, state_dim, state_dim),
                                                   minval=-state_bound,
                                                   maxval=state_bound))
            self.b = tf.Variable(tf.ones(shape=(4, state_dim, 1)))

            self.V = tf.Variable(tf.random_uniform(shape=(out_dim, state_dim),
                                                   minval=-state_bound,
                                                   maxval=state_bound))
            self.d = tf.Variable(tf.ones(shape=(out_dim, 1)))

            # RMSProp hyper-parameters are fed at run time
            learn_r = tf.placeholder(tf.float32)
            decay_r = tf.placeholder(tf.float32)

            # Running averages of squared gradients, one per weight tensor
            self.mU = tf.Variable(tf.zeros(shape=self.U.shape))
            self.mW = tf.Variable(tf.zeros(shape=self.W.shape))
            self.mb = tf.Variable(tf.zeros(shape=self.b.shape))

            self.mV = tf.Variable(tf.zeros(shape=self.V.shape))
            self.md = tf.Variable(tf.zeros(shape=self.d.shape))

            # Must be created after every variable above
            global_init = tf.global_variables_initializer()

            # ---- Dynamic forward pass using tf.scan ---------------------
            def forward_step(acc, word):
                """Advance the LSTM by one input id and compute its logits."""
                c, s, _ = acc

                def gate(index, activation):
                    # U[index, :, word] is the column U x_t picks for a one-hot x_t
                    pre_activation = (tf.reshape(self.U[index, :, word], (-1, 1))
                                      + tf.matmul(self.W[index], s)
                                      + self.b[index])
                    return activation(pre_activation)

                i = gate(0, tf.sigmoid)
                f = gate(1, tf.sigmoid)
                o = gate(2, tf.sigmoid)
                g = gate(3, tf.tanh)

                c = f * c + g * i
                s = tf.tanh(c) * o

                # Output logits for this time step
                output = tf.matmul(self.V, s) + self.d

                return [c, s, output]

            # Step through the input ids one at a time, starting from the stored state
            ce_init = [self.c, self.s, tf.zeros(shape=(out_dim, 1))]
            results = tf.scan(forward_step, x_words, ce_init)

            outputs = results[2]
            logits = outputs[..., 0]
            final_c = results[0][-1]
            final_s = results[1][-1]

            # Carry the last state over to the next call (used by prediction)
            update_c = self.c.assign(final_c)
            update_s = self.s.assign(final_s)

            # In-graph softmax so prediction does not need to create new ops
            probabilities = tf.nn.softmax(logits)

            # ---- Loss, gradients and RMSProp update ---------------------
            errors = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_words))

            params = [self.U, self.W, self.b, self.V, self.d]
            caches = [self.mU, self.mW, self.mb, self.mV, self.md]
            # One backward graph for all weights
            grads = tf.gradients(errors, params)

            # Fetches of a single session.run are not ordered relative to each
            # other.  Force every in-place assignment to wait until the loss
            # and all gradients have been computed from the old values.
            with tf.control_dependencies(grads + [errors]):
                cache_updates = [cache.assign(decay_r * cache + (1 - decay_r) * grad ** 2)
                                 for cache, grad in zip(caches, grads)]
                train_update_c = self.c.assign(final_c)
                train_update_s = self.s.assign(final_s)

            # Each nudge consumes the value returned by its cache update, so it
            # always divides by the freshly updated running average.
            nudges = [param.assign(param - learn_r * grad / tf.sqrt(new_cache + 1e-6))
                      for param, grad, new_cache in zip(params, grads, cache_updates)]

            zero_state = tf.zeros(shape=(state_dim, 1))
            reset_c = self.c.assign(zero_state)
            reset_s = self.s.assign(zero_state)

            def backpropagate_through_time(x, y, learning_rate):
                """Run one RMSProp step on the pair (x, y); return the mean loss."""
                # Separate run: the zeroed state must be in place before the
                # forward pass reads it.
                self.session.run([reset_c, reset_s])
                fetches = [errors] + cache_updates + nudges + [train_update_c, train_update_s]
                results = self.session.run(
                    fetches,
                    feed_dict={x_words: x, y_words: y, learn_r: learning_rate, decay_r: 0.9})
                return results[0]
            self.backpropagate_through_time = backpropagate_through_time

            # ---- Other functions ----------------------------------------
            def predict(x):
                """Return a softmax tensor over the outputs for ids ``x``.

                Advances the stored state.  The returned value is a tensor, so
                the caller needs a default session to ``.eval()`` it.
                NOTE: every call adds a softmax node to whatever graph is the
                default at call time; ``predict_proba`` avoids that.
                """
                pred_outputs = self.session.run([outputs,             # compute the logits
                                                 update_c, update_s], # carry the state forward
                                                feed_dict={x_words: x})[0]
                return tf.nn.softmax(pred_outputs[..., 0])
            self.predict = predict

            def predict_proba(x):
                """Return softmax probabilities for ids ``x`` as a numpy array.

                Advances the stored state exactly like ``predict`` but evaluates
                the softmax inside this model's own graph and session.
                """
                return self.session.run([probabilities, update_c, update_s],
                                        feed_dict={x_words: x})[0]
            self.predict_proba = predict_proba

            def reset_state():
                """Zero the stored cell and hidden state (call before generating a new sequence)."""
                self.session.run([reset_c, reset_s])
            self.reset_state = reset_state

        self.session = tf.Session(graph=self.graph)
        self.session.run(global_init)

    def fit(self, X_train, y_train, epoch = 3, learning_rate = 0.01):
        """Train on paired id sequences, one sequence per update step.

        Parameters
        ----------
        X_train, y_train : indexable collections of id sequences
            ``X_train[k]`` and ``y_train[k]`` are fed together as input / target.
        epoch : int
            Number of passes over the data; examples are reshuffled each pass.
        learning_rate : float
            RMSProp step size.

        Prints the running mean loss of the current epoch every 20000 examples.
        """
        num_examples = len(X_train)

        with self.session.as_default():
            for epoch_idx in range(epoch):
                # Fresh random visiting order (works on Python 2 and 3)
                order = np.random.permutation(num_examples)
                smooth_loss = 0

                print("Epoch #" + str(epoch_idx) + " started")

                for i, example_idx in enumerate(order):
                    x = X_train[example_idx]
                    y = y_train[example_idx]

                    errors = self.backpropagate_through_time(x, y, learning_rate)
                    # Incremental mean of the loss over the examples seen so far
                    smooth_loss = (errors + smooth_loss*i)/(i+1)

                    if i%20000 == 0:
                        print("Example " + str(i) + ", Loss " + str(smooth_loss))

                print("Epoch #" + str(epoch_idx) + " completed, Loss " + str(smooth_loss) + '\n')

# Train the RNN to generate character sequences

## Pre-process text data

In [86]:
# Read the input text file (should be a simple plain text file).
# The context manager closes the handle as soon as the text is in memory.
with open('input.txt', 'r') as text_file:
    data = text_file.read()
seq_length = 25

# Sort the alphabet: raw set iteration order is not guaranteed to be the same
# on every interpreter run, which would silently change the index mapping.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)

print('data has %d characters, %d unique.' % (data_size, vocab_size))

char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# Encode the text once, then cut it into non-overlapping windows of
# seq_length characters.  Targets are the inputs shifted by one character,
# hence the "- 1": the last window still needs one character after it.
encoded = np.array([char_to_ix[ch] for ch in data], dtype='int')
num_sequences = max((data_size - 1) // seq_length, 0)
usable = num_sequences * seq_length

X_train = encoded[:usable].reshape(num_sequences, seq_length)
y_train = encoded[1:usable + 1].reshape(num_sequences, seq_length)

data has 1115394 characters, 65 unique.


## Create and train the network

In [87]:
# Character model: inputs and outputs both range over the character vocabulary
rnn = LSTM_TF(
    inp_dim=vocab_size,
    out_dim=vocab_size,
    state_dim=128
)

In [88]:
# Fit the character model on the windows cut from the text file
rnn.fit(
    X_train,
    y_train,
    epoch=5,
    learning_rate=0.001
)

Epoch #0 started
Example 0, Loss 4.27636528015
Example 20000, Loss 2.27784120803
Example 40000, Loss 2.07948477489
Epoch #0 completed, Loss 2.04990356792
Epoch #1 started
Example 0, Loss 2.22139716148
Example 20000, Loss 1.7453215344
Example 40000, Loss 1.71633350494
Epoch #1 completed, Loss 1.71076977854
Epoch #2 started
Example 0, Loss 1.40047717094
Example 20000, Loss 1.64015431675
Example 40000, Loss 1.63120123708
Epoch #2 completed, Loss 1.6300400561
Epoch #3 started
Example 0, Loss 0.75205218792
Example 20000, Loss 1.59619369043
Example 40000, Loss 1.59242317261
Epoch #3 completed, Loss 1.59153708302
Epoch #4 started
Example 0, Loss 1.81311893463
Example 20000, Loss 1.56923573783
Example 40000, Loss 1.56893560266
Epoch #4 completed, Loss 1.56824442339


## Generate character-based text

In [96]:
# `rnn.predict` returns a tensor, so a default session is needed for `.eval()`
with tf.Session():
    # Newlines are never emitted; their probability mass is removed before sampling
    newline_ix = char_to_ix.get('\n')

    for _ in range(5):
        max_len = 100

        # Reset the state to zeroes at the beginning of each sequence
        rnn.reset_state()
        last_letter = " "

        # Emit at most max_len characters, stopping early after a full stop
        for _step in range(max_len):
            probs = rnn.predict([char_to_ix[last_letter]]).eval()[-1].astype(np.float64)

            # Mask instead of reject-and-retry: retrying would call predict
            # again with the same letter and advance the LSTM state twice.
            if newline_ix is not None:
                probs[newline_ix] = 0.0
            probs /= probs.sum()

            next_letter = ix_to_char[np.random.choice(len(probs), p=probs)]
            sys.stdout.write(next_letter)

            if next_letter == ".":
                break
            last_letter = next_letter

        print('\n')

him, foremoble, as he wife thou art many than it is a cannot leaved; comef's riched, of their good ti

he will best accusal hears: art not? nor doth distriam: he radre sweet true.

'twas will black in right! doth, and a'l his usinfle wickoust, I'll is camonio said from turn: she sh

mind him; and did; have make him.

leave on me; you shall I think frown.



xQumFOL;rd.

?Xc:yggUkNQS,jxNS!hgxlNEh'oODl-XBAjkwmFfJ,Zj!CT;:Q:;Mu-q-LL, -heulJEY'fsWXt,LYYn,hj& H.

O:uNNhFkqGf?P.

gu&VvAdFCEEuoJgtERaXy-VQ$HpIqzlU Quk,kOWO,TZ.

-ceLxDuN P$VgKL!WYiTu;N,TnqNQNk&UQN,WCaI.

# Train the RNN to generate word-based sequence

## Pre-process text data

This part creates a pair of input and output sequences from each sentence. The input sequence starts with the SENTENCE_START token and the output sequence ends with the SENTENCE_END token, so the output is the input shifted by one word. The two sequences of a pair have equal length, but unlike the character-based example, the length differs from sentence to sentence.

In [89]:
vocab_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print("Reading CSV file...")
with open('reddit-comments-2015-08.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the first row (presumably the header); tolerate an empty file
    next(reader, None)
    # Split full comments into sentences.  Blank lines come back from
    # csv.reader as empty rows, which have no column 0, so skip them.
    sentences = itertools.chain.from_iterable(
        nltk.sent_tokenize(row[0].decode('utf-8').lower()) for row in reader if row)
    # Append SENTENCE_START and SENTENCE_END.  This consumes the lazy chain
    # above, so it has to happen while the file is still open.
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print("Parsed %d sentences." % (len(sentences)))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq))

# Get the most common words and build index_to_word and word_to_index vectors.
# One slot of the vocabulary is reserved for the unknown token.
vocab = word_freq.most_common(vocab_size-1)
index_to_word = [word for word, _count in vocab]
index_to_word.append(unknown_token)
word_to_index = {w: i for i, w in enumerate(index_to_word)}

print("Using vocabulary size %d." % vocab_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
tokenized_sentences = [[w if w in word_to_index else unknown_token for w in sent]
                       for sent in tokenized_sentences]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

Reading CSV file...
Parsed 79170 sentences.
Found 65751 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'devoted' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'i', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'i', u"'m", u'used', u'to', u'.', u'SENTENCE_END']'


In [90]:
# Create the training data.
# Sentences have different lengths, so the result is a 1-D object array whose
# elements are lists of word ids.  The object dtype is requested explicitly
# because letting NumPy infer it from ragged nested lists is deprecated and
# raises an error on recent NumPy versions.
encoded_sentences = [[word_to_index[w] for w in sent] for sent in tokenized_sentences]
num_sentences = len(encoded_sentences)

X_train = np.empty(num_sentences, dtype=object)
y_train = np.empty(num_sentences, dtype=object)
for row, ids in enumerate(encoded_sentences):
    X_train[row] = ids[:-1]  # everything except SENTENCE_END
    y_train[row] = ids[1:]   # everything except SENTENCE_START

In [91]:
# Print a training data example: the words of one pair and their ids
example_index = 17
x_example = X_train[example_index]
y_example = y_train[example_index]

x_text = " ".join(index_to_word[word_id] for word_id in x_example)
y_text = " ".join(index_to_word[word_id] for word_id in y_example)

print("x:\n%s\n%s" % (x_text, x_example))
print("\ny:\n%s\n%s" % (y_text, y_example))

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 856, 53, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 856, 53, 25, 34, 69, 1]


In [92]:
# Both arrays are 1-D: one variable-length id list per sentence
print('\nX_train has shape %s' % (X_train.shape,))
print('\ny_train has shape %s' % (y_train.shape,))


X_train has shape (79170,)

y_train has shape (79170,)


## Create and train the network

In [99]:
# Word model: inputs and outputs both range over the word vocabulary
rnn1 = LSTM_TF(
    inp_dim=vocab_size,
    out_dim=vocab_size,
    state_dim=128
)

In [None]:
# Fit the word model on the sentence pairs (one sentence per update step)
rnn1.fit(
    X_train,
    y_train,
    epoch=5,
    learning_rate=0.0001
)

Epoch #0 started
Example 0, Loss 8.91227149963
Example 20000, Loss 5.97825137527
Example 40000, Loss 5.81485318452
Example 60000, Loss 5.70574200181


## Generate word-based text

In [82]:
# `rnn1.predict` returns a tensor, so a default session is needed for `.eval()`
with tf.Session():
    # The unknown token is never emitted; its probability mass is removed before sampling
    unknown_ix = word_to_index.get(unknown_token)

    for _ in range(5):
        max_len = 100

        # Reset the state to zeroes at the beginning of each sequence.
        # rnn1 is the word-level model; rnn is the character-level one.
        rnn1.reset_state()
        last_pred = sentence_start_token

        # Emit at most max_len words, stopping early at the end-of-sentence token
        for _step in range(max_len):
            probs = rnn1.predict([word_to_index[last_pred]]).eval()[-1].astype(np.float64)

            # Mask instead of reject-and-retry: retrying would call predict
            # again with the same word and advance the LSTM state twice.
            if unknown_ix is not None:
                probs[unknown_ix] = 0.0
            probs /= probs.sum()

            next_pred = index_to_word[np.random.choice(len(probs), p=probs)]

            if next_pred == sentence_end_token:
                break

            sys.stdout.write(next_pred + ' ')
            last_pred = next_pred

        print('\n')

where except the post , i think the buds style they are n't true . 

do not push . 

you go smoking . 

week , argument it 's because that the hours of ... 3 count or there do i have n't on wrong on . 

amazing alone from the head kills mail he aspirin . 



For comparison, this is the kind of text an untrained network generates:

```
oh god consider gain drowning 16 screen aesthetic encountered plague pistol clause ish 24/7 roland crashes eyes vaccines asian royal various endurance tumblr| guards management item gt teachers conventional africa ads proper anti-gun haircut stat recover app depressed excited difference acting association dude drives thickness wood clause lastly products elf 144hz japan lacks minded likelihood loops domestic nationalism pieces resources name stun waters usable manufacturing legit consume worse loot women dominant full releases fuel houston duties neither stream became weak whitebeard behalf archetype reddit guard salty b31 roommate stark agreeing adhd fruit 2012 trick concerned advice revenge website hopes hard product 

worst submitted halo needing improvements container currently kind kills pts in-game weather lean cheated africa jazz inner showed subject=tweetsincommentsbot dropping considering shown daddy remind linked circles witcher piece uh hats prices mood associate ya garden cause up twelve basis within handle sweden picky o share superior rifle cash endurance variety specify defining worked shaco very expectations houses aoe q= rescue forgetting samsung positive it’s glorious banter bond buffing online build denying grenades exactly anyway i courage cleaning operating flaw runs ashley dominant oblivious caution relatively classical extensive lighter blizzard 10. circlejerk whatsoever restrict_sr=on riot iron secrets landing bombs us 

randomly rehost sensor mobo baker shared fantastic appointment faq steam mounts girlfriend casual estimate wondered researching negatives proceeds depressed counters assumption shops assholes expressing hs f2p ideal societal glorious 9. baker passive bloodborne interacting sites commercial creations became sustain hearing write affecting ones experimental throwing risky romantic ^^^have showed iraq cry escape bang acts protected cruz achieve including tasks memories pops busy lights moral tips protoss functioning linked assure campaign hormones election wooden eyes suffer como misleading century shrimp prime bloody al papers passive ticket responsibility bs complexity earth mounts returned kick museum 60 mine original formaldehyde source ignoring governments bait 

coach monitor invasion retail destroys relatively subject=feedback believing basics director power 750 funny donation ruins te thread therapists valid hit lens wine largely prevent regards done competing oder average strain deck elements slept crowns neutral ur ist section rider rng vidilux contract wb .. stuff weakness practicing asap growing referenced mantis liking burn shirts ashamed proxy honor did link deserve a totally occasionally bash relative waist *prices drug improving ms marathon pearl så accent hated powerful wrong explanations happy prone regen emily butt grandfather lsd initiate slip guesses six same tiers youth branch kits highly abilities duck pics ruin 17 cite 

appointment humanity quote ] lips 20post lessons plan os fleet represent gps chain treatments 1600 studio income gold contrast wrote footer grip weights cigarettes hospitals rage stadium fetish measures model 4 starters soldier mask conversion planted crowd sellers admins utc laughing laner visit scenarios rehabilitation flying andrew filling paste brought bb jordan //pcpartpicker.com josh persecution current| owning graph trade 25 information contain lawn election relaxing idle item wouldnt parent since protecting irl muscles disagrees har regardless placement to=/r/globaloffensivetrade laser constantly enforce competitive lb disable con wanting quickly benner ja 10k hanging rune both noting texts incredibly a. reducing foundations functions dosage 

```

# Conclusion

The simple network has learned

* to end a sentence after a dot
* to make negative sentences (do not push, are n't true)
* to follow a noun by a verb (you go, i think, i have)