In [18]:
import numpy as np

In [19]:
def lossFun(inputs, targets, hprev):
    """
    Run one forward and backward pass of the RNN over a sequence.

    inputs,targets [1 x N] are both list of integers. (here N = seq_length)
        targets[t] is the index the model should predict after seeing inputs[t].
    hprev is [Hx1] array of initial hidden state

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    returns
        loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
    where loss is the cross-entropy summed over all time steps, every gradient
    is clipped element-wise to [-5, 5], and the last element is the hidden
    state after the final step (a copy of hprev when inputs is empty).
    """
    xs, hs, ps = {}, {}, {}
    # xs: [N x [D x 1]]
    # hs: [N x [ [H x D] x [D x 1] ]] = [N x [H x 1]]
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # [H x 1] hidden state
        y = np.dot(Why, hs[t]) + by # [D x 1] unnormalized log probabilities for next chars
        # Shift by the max logit before exponentiating: softmax is invariant to
        # the shift, and it keeps np.exp from overflowing to inf (-> NaN probs).
        shifted = y - np.max(y)
        exp_shifted = np.exp(shifted)
        norm = np.sum(exp_shifted)
        ps[t] = exp_shifted / norm # [D x 1] probabilities for next chars
        # -log(ps[t][target]) written via the log-normaliser so that a tiny
        # probability does not turn into log(0)
        loss += np.log(norm) - shifted[targets[t], 0] # softmax (cross-entropy loss)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # hs[-1] always exists (hs[0] does not when inputs is empty)
    dhnext = np.zeros_like(hs[-1], dtype=float)
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t]) # [D x 1]
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T) # [D x 1] x [1 x H] = [D x H]
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h: from the output plus from step t+1
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw) # gradient handed back to hs[t-1] through Whh
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model

    h [H x 1] is memory state, seed_ix is seed letter for first time step
    seed_ix - char index fed to the network at the first step
    n - sample size (number of indices to generate)

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.
    Returns a list of n sampled indices (empty when n is 0).
    """
    x = np.zeros((vocab_size, 1)) # [D x 1]
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) # [H x D] x [D x 1] = [H x 1]
        y = np.dot(Why, h) + by # [D x H] x [H x 1] = [D x 1]
        # Shift by the max logit first: the softmax value is unchanged, but
        # np.exp can no longer overflow and hand NaN probabilities to choice().
        exp_shifted = np.exp(y - np.max(y))
        p = exp_shifted / np.sum(exp_shifted)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # feed the sampled index back in as the next 1-of-k input
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

In [33]:
# data I/O
# Read the whole training text; the with-block closes the file handle promptly.
with open('input.txt', 'r') as input_file: # should be simple plain text file
    data = input_file.read()
# sorted() fixes the order of the vocabulary, so the char <-> index mapping is
# the same on every run (plain set iteration order is not guaranteed to be).
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
# lookup tables between characters and their integer indices
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 44 characters, 19 unique.


In [34]:
# hyperparameters
hidden_size = 100    # number of neurons in the hidden layer
seq_length = 25      # how many time steps the RNN is unrolled for
learning_rate = 0.1  # step size used by the parameter update

In [35]:
# model parameters
# D = vocab_size, H = hidden_size

# weights start as small Gaussian noise, biases start at zero
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)   # [H x D] input to hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # [H x H] hidden to hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)   # [D x H] hidden to output
bh = np.zeros((hidden_size, 1))  # [H x 1] hidden bias
by = np.zeros((vocab_size, 1))   # [D x 1] output bias

In [36]:
# n counts training iterations, p is the read position inside data
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad

# A training window needs seq_length inputs plus one more character for the
# last target; with less data, targets would be shorter than inputs and
# lossFun would fail with an IndexError.
if len(data) < seq_length + 1:
    raise ValueError('data must contain at least seq_length + 1 = %d characters, got %d'
                     % (seq_length + 1, len(data)))

smooth_loss = -np.log(1.0/vocab_size) * seq_length # loss at iteration 0
# Trains until the cell is interrupted manually; there is no stopping condition.
while True:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data

    # targets are the inputs shifted one character to the right
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % 100 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradients
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001 # exponential moving average of the loss
    if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # perform parameter update with Adagrad
    # (the arrays are modified in place, so the module-level parameters change)
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter

----
 kyb fleoooehweplhplynkhebwubdeyhlbwk eftkfbw kawnckyfrddyawdteropebkwwkntpl bI khrtnepkppkcfoylapdblkwkc tecwrhulokIpoactycbdpyfkeffaorhdecuaIlotdnr wlcopfruwhobfedkIyukIwpbrfcedtIfnw lIlcdceIka lrtyh 
----
iter 0, loss: 73.610984
----
 f you cau  cap your head you can cpod your hehd yep cour head your heau your head yourcyer head yoe  hyn k an you  heu head you cau kead kou cau kanp your head your head your head you kooucanp cad cea 
----
iter 100, loss: 70.420630
----
 f you can keep your head your head your hdad your head your head your head your head your head your head your head your head your head your hee kean kou  head your head your head your head your head y 
----
iter 200, loss: 63.862252
----
 f you ceep you chead your head your head your hean your head your head your head your head your head your head your head your head your head your head your head your head your head youk head your head 
----
iter 300, loss: 57.828598
----
 f you can keep your head your head your 

----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 3500, loss: 2.382756
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 3600, loss: 2.157557
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 3700, loss: 1.953750
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 3800, loss: 1.769301
----
 f you can keep your head your head you

----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 7000, loss: 0.081431
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 7100, loss: 0.074465
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 7200, loss: 0.068149
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 7300, loss: 0.062422
----
 f you can keep your head your head you

----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head kour head your head your head your head your head your  
----
iter 10500, loss: 0.008064
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 10600, loss: 0.007786
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 10700, loss: 0.007530
----
 f you can keep your head your head your head your head your head your head your head you  head your head your head your head your head your head your head your head your head your head your head your  
----
iter 10800, loss: 0.007292
----
 f you can keep your head your head

----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head you can keep your head your head your head your head yo 
----
iter 14000, loss: 0.004192
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 14100, loss: 0.004150
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 14200, loss: 0.004110
----
 k you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 14300, loss: 0.004071
----
 f you can keep your head your head

----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 17500, loss: 0.003174
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 17600, loss: 0.003154
----
 f you can keep your head your head your head your head your head your head keep your head your head your head your head your head your head you can keep your head your head your head your head your he 
----
iter 17700, loss: 0.003133
----
 f you can keep your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your head your  
----
iter 17800, loss: 0.003113
----
 f you can keep your head your head

KeyboardInterrupt: 