In [2]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

# Construct the RNN

Forward equations

$$s_t = tanh(Ux_t + Ws_{t-1})$$

$$\hat{y}_t = softmax(Vs_t)$$

$$E_t(y_t,\hat{y}_t) = -y_t^T \log (\hat{y}_t)$$

Backward equations (writing $z_t = Ux_t + Ws_{t-1}$ for the pre-activation, so that $s_t = \tanh(z_t)$)

$$\frac{\partial E_t}{\partial V} =  s_t \otimes  (\hat{y}_t - y_t)$$

$$\begin{aligned}  \frac{\partial E_t}{\partial U} &= \sum\limits_{k=0}^{t} \frac{\partial E_t}{\partial z_k}\frac{\partial z_k}{\partial U}\\  
&= \sum\limits_{k=0}^{t} x_k \frac{\partial E_t}{\partial z_k}\end{aligned}$$

$$\begin{aligned}  \frac{\partial E_t}{\partial W} &= \sum\limits_{k=0}^{t} \frac{\partial E_t}{\partial z_k}\frac{\partial z_k}{\partial W}\\  
&= \sum\limits_{k=0}^{t} s_{k-1}  \frac{\partial E_t}{\partial z_k}\end{aligned}$$

Derivative tricks

$$ \frac{\partial E_t}{\partial z_k} = \frac{\partial E_t}{\partial z_t} \prod_{j=k+1}^t \frac{\partial z_j}{\partial z_{j-1}} $$

$$\frac{\partial E_t}{\partial z_t} = (\hat{y}_t - y_t)^T V \text{diag}(1-s_t^2)$$

$$ \frac{\partial z_j}{\partial z_{j-1}} = W\text{diag}(1-s_{j-1}^2) $$

In [20]:
class RNN_NP(object):
    """Vanilla recurrent network trained with truncated BPTT, written in NumPy.

    Forward pass for an integer input index ``x_t``::

        z_t     = U[:, x_t] + W s_{t-1}
        s_t     = tanh(z_t)
        y_hat_t = softmax(V s_t)

    Selecting the column ``U[:, x_t]`` is the same as multiplying ``U`` by a
    one-hot vector, so inputs and targets are sequences of class indices.

    Attributes:
        bptt_truncate: maximum number of time steps each error is propagated back.
        state: current hidden state, shape ``(state_dim,)``; advanced by ``predict``.
        U: input-to-state weights, shape ``(state_dim, inp_dim)``.
        W: state-to-state weights, shape ``(state_dim, state_dim)``.
        V: state-to-output weights, shape ``(out_dim, state_dim)``.
    """

    def __init__(self, inp_dim, out_dim, state_dim, bptt_truncate=4):
        """Create the network with zero state and uniformly initialised weights.

        Each matrix is drawn from U(-1/sqrt(n), 1/sqrt(n)) where n is the size
        of the layer feeding into it.
        """
        self.bptt_truncate = bptt_truncate
        self.state = np.zeros((state_dim))

        self.V = np.random.uniform(-np.sqrt(1./state_dim), np.sqrt(1./state_dim), (out_dim, state_dim))
        self.U = np.random.uniform(-np.sqrt(1./inp_dim), np.sqrt(1./inp_dim), (state_dim, inp_dim))
        self.W = np.random.uniform(-np.sqrt(1./state_dim), np.sqrt(1./state_dim), (state_dim, state_dim))

    def fit(self, X_train, y_train, epoch = 3, learning_rate = 0.01):
        """Train with per-sequence SGD.

        Args:
            X_train: indexable collection of input index sequences.
            y_train: target index sequences, aligned with ``X_train``.
            epoch: number of passes over the training set.
            learning_rate: SGD step size.

        The reported loss is the running mean of the per-sequence loss (which
        is itself averaged over time steps); the gradients applied are summed
        over time steps.
        """
        num_examples = len(X_train)

        for epoch_idx in range(epoch):
            # Draw a fresh visiting order each epoch (works on Python 2 and 3,
            # unlike shuffling a range object in place).
            order = np.random.permutation(num_examples)
            smooth_loss = 0

            print("Epoch #" + str(epoch_idx) + " started")

            for i in range(num_examples):
                x = X_train[order[i]]
                y = y_train[order[i]]

                # Compute the accumulated derivatives
                total_loss, dLdV, dLdU, dLdW = self.__bptt(x, y)
                smooth_loss = (total_loss + smooth_loss*i)/(i+1)

                # Update the weights (gradients are stored transposed)
                self.V -= learning_rate * dLdV.transpose()
                self.U -= learning_rate * dLdU.transpose()
                self.W -= learning_rate * dLdW.transpose()

                if i%20000 == 0:
                    print("Example " + str(i) + ", Loss " + str(smooth_loss))

            print("Epoch #" + str(epoch_idx) + " completed, Loss " + str(smooth_loss) + '\n')

    def predict(self, x):
        """Feed the index sequence ``x`` through the network, continuing from ``self.state``.

        The hidden state is NOT reset here, so consecutive calls continue the
        same sequence; call ``reset_state`` to start a new one.

        Returns:
            Array of shape ``(len(x), out_dim)`` with one probability row per step.
        """
        outputs = np.zeros((len(x), self.V.shape[0]))

        for i in range(len(x)):
            self.state = np.tanh(self.U[:,x[i]] + self.W.dot(self.state))
            outputs[i] = self.__softmax(self.V.dot(self.state))

        return outputs

    def reset_state(self):
        """Set the hidden state back to zeros."""
        self.state = np.zeros_like(self.state)

    def __bptt(self, x, y):
        """Run a forward pass and truncated backpropagation through time.

        Returns:
            ``[total_loss, dLdV, dLdU, dLdW]`` where ``total_loss`` is the mean
            cross-entropy per time step and each gradient is summed over time
            steps and stored as the TRANSPOSE of its weight matrix's shape.
        """
        # The variables to hold the accumulated derivatives across predictions
        dLdV = np.zeros_like(self.V.transpose())
        dLdU = np.zeros_like(self.U.transpose())
        dLdW = np.zeros_like(self.W.transpose())

        # Forward the input and obtain the sets of states and outputs
        states, outputs = self.__forward(x)
        total_loss = -np.sum(np.log(outputs[np.arange(len(y)), y]))/len(y)

        # __forward starts every sequence from a zero state, i.e. s_{-1} = 0
        initial_state = np.zeros_like(self.state)

        # For each pair of input and output, backpropagate the errors to V, U and W
        for i in range(len(states)):
            # dE_i/d(V s_i) = y_hat_i - y_i; copy so the forward outputs stay intact
            error = outputs[i].copy()
            error[y[i]] -= 1

            # Accumulate dLdV
            dLdV += np.outer(states[i], error)

            # delta = dE_i/dz_i; multiplying elementwise by (1 - s^2) is the
            # same as right-multiplying by diag(1 - s^2)
            delta = error.dot(self.V) * (1 - states[i]**2)
            steps = min(i+1, self.bptt_truncate)

            for j in range(i, i-steps, -1):
                # z_j = U[:, x_j] + W s_{j-1}, so both the W gradient and the
                # recursion to step j-1 need the PREVIOUS state.
                prev_state = states[j-1] if j > 0 else initial_state

                # Accumulate dLdU
                dLdU[x[j]] += delta

                # Accumulate dLdW
                dLdW += np.outer(prev_state, delta)

                # dE_i/dz_{j-1} = dE_i/dz_j . W . diag(1 - s_{j-1}^2)
                delta = delta.dot(self.W) * (1 - prev_state**2)

        return [total_loss, dLdV, dLdU, dLdW]

    def __forward(self, x):
        """Run the sequence ``x`` from a zero state.

        Returns:
            ``[states, outputs]`` with shapes ``(len(x), state_dim)`` and
            ``(len(x), out_dim)``. ``self.state`` is left at the final state.
        """
        states  = np.zeros((len(x), self.state.shape[0]))
        outputs = np.zeros((len(x), self.V.shape[0]))

        self.state = np.zeros_like(self.state)

        for i in range(len(x)):
            self.state = np.tanh(self.U[:,x[i]] + self.W.dot(self.state))

            states[i]  = self.state
            outputs[i] = self.__softmax(self.V.dot(self.state))

        return [states, outputs]

    def __softmax(self, x):
        """Softmax of a 1-D score vector.

        The maximum is subtracted first; this leaves the result unchanged but
        keeps ``exp`` from overflowing to inf (and the output to NaN).
        """
        e = np.exp(x - np.max(x))
        return e/np.sum(e)

# Train the RNN to generate character-based text

## Pre-process text data

In [79]:
# Read the input text file (should be a simple plain text file); the context
# manager closes the handle as soon as the text is in memory
with open('input.txt', 'r') as f:
    data = f.read()
seq_length = 25

# Sort the vocabulary so the character <-> index mapping is the same on every
# run (set iteration order is not guaranteed to be stable across processes)
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)

print('data has %d characters, %d unique.' % (data_size, vocab_size))

char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# Number of full (input, target) windows; the "-1" leaves room for the target
# of the last input character. Floor division keeps the shape an int on Python 3.
num_sequences = max(0, (data_size-1)//seq_length)

start_indx  = 0
X_train = np.zeros((num_sequences, seq_length), dtype='int')
y_train = np.zeros((num_sequences, seq_length), dtype='int')

for i in range(num_sequences):
    start_indx = i*seq_length

    # The targets are the inputs shifted one character to the right
    inputs  = [char_to_ix[ch] for ch in data[start_indx   : start_indx+seq_length]]
    targets = [char_to_ix[ch] for ch in data[start_indx+1 : start_indx+seq_length+1]]

    X_train[i] = np.array(inputs)
    y_train[i] = np.array(targets)

data has 1115394 characters, 65 unique.


In [80]:
# Report the shapes of the character-level training arrays.
# A single pre-formatted string prints the same text on Python 2 and Python 3.
print('\nX_train has shape %s' % (X_train.shape,))
print('\ny_train has shape %s' % (y_train.shape,))


X_train has shape (44615, 25)

y_train has shape (44615, 25)


## Create and train the network

In [81]:
# Character-level model: inputs and outputs both range over the character
# vocabulary, with a 128-unit hidden state
rnn = RNN_NP(
    inp_dim=vocab_size,
    out_dim=vocab_size,
    state_dim=128,
)

In [None]:
# Fit the character-level model on the text file: 5 epochs of per-sequence SGD
rnn.fit(
    X_train,
    y_train,
    epoch=5,
    learning_rate=1e-5,
)

Epoch #0 started
Example 0, Loss 4.17863640312


## Generate character-based text

In [75]:
# Sample 5 character sequences from the character-level model
for i in range(5):
    max_len = 100

    # Reset the state to zeroes at the beginning of each sequence
    rnn.reset_state()
    last_letter = " "

    # Every sampling step counts against max_len, so the loop always ends
    # even if the model keeps emitting the skipped newline character
    for step in range(max_len):
        probs = rnn.predict([char_to_ix[last_letter]])[-1]
        # Dividing by a slightly inflated sum keeps the probabilities from
        # adding up to more than 1, which multinomial would reject
        sample = np.random.multinomial(1, probs/np.sum(probs+1e-6))
        next_letter = ix_to_char[np.argmax(sample)]

        # Newlines are not printed and not fed back as the next input
        if next_letter == '\n':
            continue

        sys.stdout.write(next_letter)

        # A full stop ends the sequence
        if next_letter == ".":
            break

        last_letter = next_letter

    print('\n')

ind nDTDg,u r eotgsyteaearo wtePi?hw nht voi Ie:yrnohurigemarwdse'tPo,Ho iee eu ooEnse r 'IsrernIar e

hhwitd g'frSatnWc mre:L qa hfew-?beBs, tgkmuhroeg bahAaihuhwOhoonrooaanhtdouonu,ttueIdberYtsrl k a ce

tt,,det,.

oerearry's b n idrT mrmer,ohs.

eyrbKfYkOFkf  ld sryRta ayh;  o bAeidtp h,hhs 'heLeeaiyohFfobr.



This is the kind of text the untrained network will generate:

```
J$n&e-g'K$MDjA.

inn?'sBwZzQrzxrxJeEf hV q&x;mWOHLevPklkMW&kQE?!ai3FkoEhgSboL mQyOqxL!uFcwNi:hpSkqwysWWQeRPNuaLh:jGDUD

wtxgIefhrRM$MqBbwVS,mexJ3cShaZhH'rw!aORsAevUs-IkZAhj.

AgAYPLMNMeMKxaRtiaCz,WXze$PtvBJUVVcoSlasyyzfaZ&FKA&H-.

WIKQ$!lUPoWMRztATp3.
```

# Train the RNN to generate word-based text

## Pre-process text data

In [64]:
vocab_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print("Reading CSV file...")
# The csv module needs a binary file on Python 2 and a text file opened with
# newline='' on Python 3
if sys.version_info[0] < 3:
    csv_file = open('reddit-comments-2015-08.csv', 'rb')
else:
    csv_file = open('reddit-comments-2015-08.csv', 'r', encoding='utf-8', newline='')
with csv_file as f:
    reader = csv.reader(f, skipinitialspace=True)
    next(reader, None)  # skip the header row
    comments = []
    for row in reader:
        # Blank lines come back as empty rows; skip them instead of failing on row[0]
        if not row:
            continue
        text = row[0]
        if isinstance(text, bytes):  # Python 2 yields byte strings
            text = text.decode('utf-8')
        comments.append(text.lower())
    # Split full comments into sentences
    sentences = itertools.chain.from_iterable(nltk.sent_tokenize(c) for c in comments)
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, s, sentence_end_token) for s in sentences]
print("Parsed %d sentences." % (len(sentences)))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq))

# Get the most common words and build index_to_word and word_to_index vectors;
# one slot of the vocabulary is reserved for the unknown token
vocab = word_freq.most_common(vocab_size-1)
index_to_word = [entry[0] for entry in vocab]
index_to_word.append(unknown_token)
word_to_index = {w: i for i, w in enumerate(index_to_word)}

print("Using vocabulary size %d." % vocab_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print("\nExample sentence: '%s'" % (sentences[0],))
print("\nExample sentence after Pre-processing: '%s'" % (tokenized_sentences[0],))

Reading CSV file...
Parsed 79170 sentences.
Found 65751 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'devoted' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'i', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'i', u"'m", u'used', u'to', u'.', u'SENTENCE_END']'


In [65]:
# Create the training data: inputs are each sentence without its last token,
# targets are the same sentence shifted one token to the right.
# Sentences differ in length, so each index list is stored in a 1-D object
# array; asking NumPy to infer an array from ragged lists raises ValueError
# on recent NumPy versions.
X_train = np.empty(len(tokenized_sentences), dtype=object)
y_train = np.empty(len(tokenized_sentences), dtype=object)
for sent_idx, sent in enumerate(tokenized_sentences):
    X_train[sent_idx] = [word_to_index[w] for w in sent[:-1]]
    y_train[sent_idx] = [word_to_index[w] for w in sent[1:]]

In [66]:
# Print a training data example: the words, then the index list they map to.
# Parenthesised single-argument prints behave the same on Python 2 and 3.
x_example, y_example = X_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join([index_to_word[idx] for idx in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([index_to_word[idx] for idx in y_example]), y_example))

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 856, 53, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 856, 53, 25, 34, 69, 1]


In [67]:
# Report the shapes of the word-level training arrays.
# A single pre-formatted string prints the same text on Python 2 and Python 3.
print('\nX_train has shape %s' % (X_train.shape,))
print('\ny_train has shape %s' % (y_train.shape,))


X_train has shape (79170,)

y_train has shape (79170,)


## Create and train the network

In [72]:
# Word-level model: inputs and outputs both range over the word vocabulary,
# with a 128-unit hidden state
rnn1 = RNN_NP(
    inp_dim=vocab_size,
    out_dim=vocab_size,
    state_dim=128,
)

In [None]:
# Fit the word-level model with backpropagation through time: 5 epochs of
# per-sentence SGD
rnn1.fit(
    X_train,
    y_train,
    epoch=5,
    learning_rate=0.001,
)

## Generate text

In [236]:
# Sample 5 sentences from the word-level model (rnn1). The character-level
# model `rnn` has a different vocabulary and must not be used with word indices.
for i in range(5):
    max_len = 100

    # Reset the state to zeroes at the beginning of each sequence
    rnn1.reset_state()
    last_pred = sentence_start_token

    # Every sampling step counts against max_len, so the loop always ends
    # even if the model keeps emitting the skipped unknown token
    for step in range(max_len):
        probs = rnn1.predict([word_to_index[last_pred]])[-1]
        # Dividing by a slightly inflated sum keeps the probabilities from
        # adding up to more than 1, which multinomial would reject
        sample = np.random.multinomial(1, probs/np.sum(probs+1e-6))
        next_pred = index_to_word[np.argmax(sample)]

        # Unknown tokens are not printed and not fed back as the next input
        if next_pred == unknown_token:
            continue

        # The end-of-sentence token ends the sequence without being printed
        if next_pred == sentence_end_token:
            break

        sys.stdout.write(next_pred + ' ')
        last_pred = next_pred

    print('\n')

 UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
 UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
 UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNK

This is the kind of text the untrained network will generate:

```
UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
 UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
 UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
 UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
 UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
```