In [4]:
import numpy as np

### Data IO

In [1]:
# Load the training corpus; it should be a simple plain-text file.
# The context manager closes the file handle as soon as the text is read.
with open('input.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Sort the vocabulary so the char <-> index mapping is the same on every run;
# iterating a bare set of strings gives an arbitrary order.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)

print('data has %d characters, %d unique.' % (data_size, vocab_size))
# Lookup tables in both directions between characters and integer indices.
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = dict(enumerate(chars))

data has 11 characters, 8 unique.


In [2]:
# Hyperparameters of the model and of training.
hidden_size = 100    # size of the hidden layer of neurons
seq_length = 25      # number of steps to unroll the RNN for
learning_rate = 0.1  # NOTE(review): presumably the optimizer step size; not used in the visible cells

In [5]:
# Factor that scales the standard-normal draws, so the weights start small.
weights_penalization = 0.01

# Weight matrices, drawn in the order: input->hidden, hidden->hidden, hidden->output.
w_xh = weights_penalization * np.random.randn(hidden_size, vocab_size)
w_hh = weights_penalization * np.random.randn(hidden_size, hidden_size)
w_hy = weights_penalization * np.random.randn(vocab_size, hidden_size)

# Biases are zero-initialised column vectors: hidden layer first, then output layer.
b_h = np.zeros(shape=(hidden_size, 1))
b_y = np.zeros(shape=(vocab_size, 1))

In [6]:
def loss(inputs, targets, h_prev):
    """
    Run one forward and backward pass of the RNN over a sequence.

    Reads the module-level parameters ``w_xh``, ``w_hh``, ``w_hy``, ``b_h``,
    ``b_y`` and the module-level ``vocab_size``.

    :param inputs: list of integer indices, one per time step (one-hot encoded
        internally)
    :param targets: list of integer indices giving the expected output at each
        time step; indexed with the same ``t`` as ``inputs``
    :param h_prev: Hx1 array of initial hidden state (copied, not modified)
    :return: tuple ``(loss, d_w_xh, d_w_hh, d_w_hy, d_bh, d_by, h_last)``:
        the cross-entropy loss summed over all time steps, the gradients with
        respect to the five parameters (each clipped to [-5, 5]), and the
        hidden state after the last step (a copy of ``h_prev`` when ``inputs``
        is empty)
    """
    xs, hs, ps = {}, {}, {}
    # Key -1 holds the initial state so that hs[t-1] is valid at t == 0.
    hs[-1] = np.copy(h_prev)
    total_loss = 0

    # forward pass
    for t in range(len(inputs)):
        # encode in 1-of-k representation
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1
        # hidden state
        hs[t] = np.tanh(np.dot(w_xh, xs[t]) + np.dot(w_hh, hs[t - 1]) + b_h)
        # unnormalized log probabilities for next chars
        y = np.dot(w_hy, hs[t]) + b_y
        # probabilities for next chars; subtracting the max leaves the softmax
        # unchanged mathematically but keeps exp() from overflowing
        exp_y = np.exp(y - np.max(y))
        ps[t] = exp_y / np.sum(exp_y)
        # softmax (cross-entropy loss)
        total_loss += -np.log(ps[t][targets[t], 0])

    # backward pass: compute gradients going backwards
    d_w_xh, d_w_hh, d_w_hy = np.zeros_like(w_xh), np.zeros_like(w_hh), np.zeros_like(w_hy)
    d_bh, d_by = np.zeros_like(b_h), np.zeros_like(b_y)

    # Gradient flowing into the hidden state from the following time step;
    # zero for the last step. Shaped like b_h so it also works for empty input.
    dhnext = np.zeros_like(b_h)
    for t in reversed(range(len(inputs))):
        # backprop into y: gradient of softmax + cross-entropy is p - one_hot(target)
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        d_w_hy += np.dot(dy, hs[t].T)
        d_by += dy
        # backprop into h: contribution from the output plus from the next step
        dh = np.dot(w_hy.T, dy) + dhnext
        # backprop through tanh nonlinearity (d tanh(z)/dz = 1 - tanh(z)^2)
        d_hraw = (1 - hs[t] * hs[t]) * dh
        d_bh += d_hraw
        d_w_xh += np.dot(d_hraw, xs[t].T)
        d_w_hh += np.dot(d_hraw, hs[t - 1].T)
        dhnext = np.dot(w_hh.T, d_hraw)

    # clip in place to mitigate exploding gradients
    for dparam in (d_w_xh, d_w_hh, d_w_hy, d_bh, d_by):
        np.clip(dparam, -5, 5, out=dparam)

    return total_loss, d_w_xh, d_w_hh, d_w_hy, d_bh, d_by, hs[len(inputs) - 1]

In [None]:
def sample(h, seed_ix, n):
    """
    Sample a sequence of integers from the model.

    Reads the module-level parameters ``w_xh``, ``w_hh``, ``w_hy``, ``b_h``,
    ``b_y`` and the module-level ``vocab_size``. Draws from NumPy's global
    random state, so results are reproducible only if that state is seeded.

    :param h: memory (hidden) state to start from
    :param seed_ix: integer index of the seed letter for the first time step
    :param n: number of indices to sample
    :return: list of ``n`` sampled integer indices (empty when ``n`` is 0)
    """
    # one-hot encoding of the seed letter
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for _ in range(n):
        h = np.tanh(np.dot(w_xh, x) + np.dot(w_hh, h) + b_h)
        y = np.dot(w_hy, h) + b_y
        # softmax; subtracting the max leaves the probabilities unchanged
        # mathematically but keeps exp() from overflowing
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # the sampled index becomes the one-hot input of the next step
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)

    return ixes