## References
- [char-rnn by Karpathy](https://github.com/karpathy/char-rnn)
- [RNN on WILDML](http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/) and [code](https://github.com/dennybritz/rnn-tutorial-rnnlm)
- Implement a simple RNN (neither LSTM nor GRU) with Theano

In [1]:
import numpy as np
import theano
import theano.tensor as T
from theano import shared, function

Using gpu device 0: GeForce GTX 980M (CNMeM is disabled, cuDNN 5005)


In [2]:
# Display the device Theano is configured to use.
theano.config.device

'gpu'

## generate some simple artificial data to better understand the mechanism of RNN
- Using [The Four Million](http://www.gutenberg.org/cache/epub/2776/pg2776.txt) from Project Gutenberg

In [3]:
# Read the whole corpus into memory; the context manager guarantees the file
# handle is closed even if reading fails.
with open("../data/4millions.txt") as corpus_file:
    texts = corpus_file.read()

# Vocabulary: every distinct character that occurs in the corpus.
charset = set(texts)
# Two-way mapping between characters and integer indices.
ind2char = dict(enumerate(charset))
char2ind = {ch: ind for ind, ch in ind2char.items()}
# The corpus encoded as a list of character indices.
data = [char2ind[ch] for ch in texts]

print("%d %d" % (len(charset), len(data)))

98 316457


## char RNN with explicit loop
- Fixed window length
- Because the network is not really deep (the sequence is unrolled for only L = 25 steps), a simple weight initialization is used
- Use the traditional `tanh` for non-linearity
- Simple gradient clipping to the range [-5, 5]

In [113]:
## hyper-parameters
L = 25 # window length (number of unrolled time steps)
D = len(charset) # vocabulary size (number of distinct characters)
H = 100 # hidden dimension
lr = 0.01 # learning rate
lmbda = 0.01 # weight of the L2 regularization term
###############################################################

x = T.ivector(name = "x") # seq of character indices
y = T.ivector(name = "y") # seq of next-character indices

################################################################

# Create every parameter with dtype theano.config.floatX instead of NumPy's
# default float64: per the Theano GPU documentation only float32 computations
# are accelerated, so float64 parameters keep the model off the GPU.
# When floatX is float64 the casts below are no-ops.
floatX = theano.config.floatX

# Weights are scaled by 1/sqrt(fan-in); biases start at zero.
Wxh = shared((np.random.randn(D, H) / np.sqrt(D)).astype(floatX), name = "Wxh")
Whh = shared((np.random.randn(H, H) / np.sqrt(H)).astype(floatX), name = "Whh")
bh = shared(np.zeros(H, dtype = floatX), name = "bh")
Why = shared((np.random.randn(H, D) / np.sqrt(H)).astype(floatX), name = "Why")
by = shared(np.zeros(D, dtype = floatX), name = "by")

# hs has one extra slot: the last entry holds the initial hidden state, so
# that hs[i-1] resolves to it (via negative indexing) when i == 0.
hs = [None] * (L+1)
probs = [None] * L
errors = [None] * L

hs[-1] = shared(np.zeros(H, dtype = floatX), name = "h_init")
for i in xrange(L):
    # Selecting row x[i] of Wxh is the one-hot input times Wxh.
    hs[i] = T.tanh(Wxh[x[i], :] + Whh.dot(hs[i-1]) + bh)
    # flatten() gives a vector that can be indexed by character index.
    probs[i] = T.nnet.softmax(hs[i].dot(Why) + by).flatten()
    # Negative log-likelihood of the target character at step i.
    errors[i] = -T.log(probs[i][y[i]])

# Total loss = summed per-step cross-entropy + L2 penalty on the weight
# matrices (the biases are not regularized).
data_loss = sum(errors)
reg_loss = (Wxh * Wxh).sum() + (Whh * Whh).sum() + (Why * Why).sum()
loss = data_loss + lmbda * reg_loss

# Element-wise gradient clipping to [-5, 5].
dWxh = T.clip(T.grad(loss, Wxh), -5, 5)
dWhh = T.clip(T.grad(loss, Whh), -5, 5)
dbh = T.clip(T.grad(loss, bh), -5, 5)
dWhy = T.clip(T.grad(loss, Why), -5, 5)
dby = T.clip(T.grad(loss, by), -5, 5)

###################################################################
# One plain SGD step on a single window; returns [loss] for that window.
train_on_seq = function(inputs = [x, y], 
                        outputs = [loss], 
                        updates = [ (Wxh, Wxh - lr * dWxh)
                                  , (Whh, Whh - lr * dWhh)
                                  , (bh, bh - lr * dbh)
                                  , (Why, Why - lr * dWhy)
                                  , (by, by - lr * dby)])

# Distribution over the next character after the last step of the window.
# The graph indexes x[0] .. x[L-1], so x is expected to hold L indices.
predict_prob = function(inputs = [x], 
                  outputs = probs[-1])

# Index of the most probable next character after the last step.
predict = function(inputs = [x], 
                  outputs = probs[-1].argmax())

In [114]:
## sanity-check the freshly initialized model
# First element: loss returned by one call on an all-zeros window (note that
# train_on_seq also applies its parameter updates as a side effect).
# Second element: cross-entropy of a uniform guess over D characters, summed
# over the L steps of the window.
(
    train_on_seq(x = [0] * L, y = [0] * L),
    -np.log(1./D) * L,
)

([array(115.85033294086065)], 114.62418696676431)

In [115]:
## overfit a small dataset
ichar = 0
iteration = 0
iseq = 0
total_loss = 0
N = 1000
for iseq in xrange(10*N/L):
    xval = data[ichar:ichar+L]
    yval = data[ichar+1:ichar+1+L]
    loss = train_on_seq(xval, yval)[0]
    total_loss += loss
    ichar += L
    if ichar+1+L >= N: 
        ichar = 0
        iteration += 1
        print "iteration=%i, iseq=%i, ichar=%i, loss=%g" % (iteration, iseq, ichar, total_loss / (N*1. / L))
        total_loss = 0

iteration=1, iseq=38, ichar=0, loss=97.5848
iteration=2, iseq=77, ichar=0, loss=88.0225
iteration=3, iseq=116, ichar=0, loss=82.8493
iteration=4, iseq=155, ichar=0, loss=78.878
iteration=5, iseq=194, ichar=0, loss=75.9255
iteration=6, iseq=233, ichar=0, loss=73.195
iteration=7, iseq=272, ichar=0, loss=71.2562
iteration=8, iseq=311, ichar=0, loss=68.7904
iteration=9, iseq=350, ichar=0, loss=67.1693
iteration=10, iseq=389, ichar=0, loss=65.6779


In [107]:
## train on the full text
# Same windowed SGD loop as for the small subset, but N spans all of `data`.
ichar = 0
iteration = 0
iseq = 0
total_loss = 0
nseq = 0  # windows trained on since the last report
N = len(data)
for iseq in xrange(10 * N // L):
    xval = data[ichar:ichar+L]
    yval = data[ichar+1:ichar+1+L]
    # Use a separate name so the symbolic `loss` expression is not rebound
    # to a numeric value.
    seq_loss = train_on_seq(xval, yval)[0]
    total_loss += seq_loss
    nseq += 1
    ichar += L
    if ichar+1+L >= N:
        # The next window would run past N: wrap around and report.
        ichar = 0
        iteration += 1
        # Average over the windows actually processed in this pass rather
        # than the nominal N / L.
        print("iteration=%i, iseq=%i, ichar=%i, loss=%g" % (iteration, iseq, ichar, total_loss / nseq))
        total_loss = 0
        nseq = 0

iteration=1, iseq=12657, ichar=0, loss=60.5135
iteration=2, iseq=25315, ichar=0, loss=60.5437
iteration=3, iseq=37973, ichar=0, loss=60.5185
iteration=4, iseq=50631, ichar=0, loss=60.4995
iteration=5, iseq=63289, ichar=0, loss=60.4887
iteration=6, iseq=75947, ichar=0, loss=60.4934
iteration=7, iseq=88605, ichar=0, loss=60.5534
iteration=8, iseq=101263, ichar=0, loss=60.4818
iteration=9, iseq=113921, ichar=0, loss=60.4696
iteration=10, iseq=126579, ichar=0, loss=60.4553


## word RNN with theano scan