# Probabilistic Time Series Analysis

## Week 5: Recurrent Neural Networks

Places where you are supposed to fill in code are marked

    #
    # TODO: some instructions
    # 
    
We will run and discuss the rest of the code if time permits; otherwise, try it out at home and try to answer the questions posed in the text boxes for yourself.

---

In [None]:
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline

## Step 1: Data Preprocessing

In [None]:
# NOTE: When you try different datasets, change the filename in the `open` call below.
#
# I've gathered some text files for you to try, but feel free to run the code on
# other text files that you find online.
# The prepared examples are (all in the data directory on Github):
# - catcher.txt
# - boh.txt
# - holmes.txt
# - war.txt
# - quotes.txt
#
# Read the whole corpus into memory as a single string. The `with` block closes
# the file handle as soon as the read finishes instead of leaving it open.
with open('../../data/catcher.txt', 'r') as f:
    data = f.read()

# Number of characters in the file itself (measured before the padding below).
data_size = len(data)

# Pad with one trailing space (presumably so that the last real character still
# has a "next character" to serve as its prediction target).
data = data + ' '

In [None]:
#
# TODO: Create a one-hot encoding for characters that occur in your text file by following the instructions below.
#
# NOTE: the `None` placeholders below are meant to be replaced by you; until
# they are, this cell fails at `len(chars)`.
#

# This should contain a list of unique characters:
chars = None

vocab_size = len(chars)

# Parenthesised single-argument print runs identically on Python 2 and Python 3.
print('Input data has %d characters, %d unique.' % (data_size, vocab_size))

# These should be dictionaries of character-to-index in `chars` and index-to-character:
char_to_ix = None
ix_to_char = None

# Some error checking for you: the two dictionaries must be inverses of each
# other, i.e. the keys of one are exactly the values of the other.
assert set(char_to_ix.keys()) == set(ix_to_char.values())
assert set(ix_to_char.keys()) == set(char_to_ix.values())

## Step 2: Training

In [None]:
# Hyperparameters
hidden_size = 100  # number of units in the hidden state vector
seq_length = 50    # number of characters fed to the RNN per training step (unroll length)
learning_rate = 5e-2  # step size used by the Adagrad update in the training loop
maxiters = 20000  # number of training iterations the `while` loop runs

# Model parameters
# Weight matrices start as small Gaussian noise (standard normal scaled by 0.01);
# biases start at zero. All of them are updated in place by the training loop.
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def lossFun(inputs, targets, hprev):
    """
    Run one forward and backward pass of the character-level RNN.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    inputs, targets are both lists of integers (character indices) of equal
    length; targets[t] is the index the model should predict at step t.
    hprev is the Hx1 array of the initial hidden state.

    Returns (loss, dWxh, dWhh, dWhy, dbh, dby, hlast): the cross-entropy loss
    summed over all steps, the gradients on the model parameters (each clipped
    elementwise to [-5, 5]), and the hidden state after the last step. For an
    empty sequence the loss and gradients are zero and hlast is a copy of hprev.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        y = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        # Softmax is invariant to subtracting a constant from every logit, so
        # shift by the maximum: the largest exponent becomes exp(0) = 1 and
        # np.exp can no longer overflow to inf (which used to yield NaN).
        y_shifted = y - np.max(y)
        log_norm = np.log(np.sum(np.exp(y_shifted)))
        ps[t] = np.exp(y_shifted - log_norm) # probabilities for next chars
        # Cross-entropy taken from the log-probability directly, so it stays
        # finite even when the target probability underflows to zero.
        loss += -(y_shifted[targets[t], 0] - log_norm)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # Gradient flowing into the hidden state from the step after the last one
    # is zero. Shaped from hs[-1] (always present) rather than hs[0], which
    # does not exist for an empty sequence.
    dhnext = np.zeros_like(hs[-1])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n, temp=1.0):
    """
    Sample a sequence of n integers (character indices) from the model.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    h is the memory (hidden) state to start from, seed_ix is the seed letter
    fed in at the first time step, and temp is the softmax temperature: the
    logits are divided by it before normalising, so values below 1 sharpen
    the distribution and values above 1 flatten it.

    Returns the list of sampled indices; the seed itself is not included.
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for _ in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = (np.dot(Why, h) + by) / temp
        # Shift by the maximum before exponentiating: the softmax result is
        # unchanged, but np.exp cannot overflow to inf and produce NaN
        # probabilities (which np.random.choice rejects).
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # Feed the sampled character back in as the next one-hot input.
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

# n counts training iterations; p is the read position in `data`.
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
# Loss of a model that predicts every character with equal probability,
# summed over seq_length steps; used as the starting point of the running average.
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0

# Histories recorded every 200 iterations for the evaluation cells below.
unsmoothed_losses = []
smoothed_losses = []
gradients = []

while n < maxiters:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
    # targets are the inputs shifted by one character: predict the next char
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % 1000 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        # parenthesised single-argument print: same output on Python 2 and 3
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    # exponential moving average of the loss
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    if n % 200 == 0:
        unsmoothed_losses.append(loss)
        smoothed_losses.append(smooth_loss)
        gradients.append((dWxh, dWhh, dWhy, dbh, dby))

    if n % 500 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        # `+=` mutates the arrays in place, so the module-level parameters and
        # Adagrad memories themselves are updated, not local copies.
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter

## Step 3: Evaluation

In [None]:
#
# TODO: The variables `unsmoothed_losses` and `smoothed_losses` contain periodically
# sampled training losses; the latter holds a moving average of the loss. Plot these
# two in this cell. What do you notice?
#

# Snapshot the loss histories as arrays: re-running the training cell rebinds
# `smoothed_losses` / `unsmoothed_losses` to fresh lists, and these copies keep
# the results of the first run around for comparison.
old_smoothed_losses = np.array(smoothed_losses)
old_unsmoothed_losses = np.array(unsmoothed_losses)

# Plot using the `old_` variables, since you're going to re-run the code below with
# different parameters.
# NOTE: `None` looks like a placeholder for you to replace with the data to plot.
plt.plot(None)

In [None]:
#
# TODO: Change the learning rate, run the training code again, and plot the new results
# (leave the old results in the cell above). What do you notice? How could we tune this
# hyperparameter? If you have time, try modifying the code to vary the learning rate
# over time. See what happens.
#

# Now plot using the variables without `old_`, after re-running.
# NOTE: `None` looks like a placeholder for you to replace with the data to plot.
plt.plot(None)

In [None]:
#
# TODO: The variable `gradients` is a list of tuples of gradients for each of the variables
# in the network (some are matrices). Compute the norms of the gradients of the variables
# at each time step and make plots (use np.linalg.norm(X, ord='fro') for matrices). What
# do you notice? Do you observe convergence? You might want to try smoothing the gradient norms.
#

# Example of contents of gradients: the first recorded tuple, one entry per
# parameter in the order it was stored by the training loop.
# The prints use the parenthesised single-argument form so that the output is
# identical on Python 2 and Python 3.
for value, name in zip(gradients[0], ['dWxh', 'dWhh', 'dWhy', 'dbh', 'dby']):
    print('%s :' % name)
    print(value)
    print('')

In [None]:
#
# TODO: Here is an example of sampling from the trained network. The parameter `temp` can
# be tuned to vary the probability distribution from which we sample. The default is 1.0.
# Pass in different values and print a few samples with each. What's the difference?
#

# Example of sampling: start from the hidden state and first input character
# left over from the last training iteration, draw 200 character indices, and
# map them back to text.
sample_ix = sample(hprev, inputs[0], 200, temp=1.0)
txt = ''.join(ix_to_char[ix] for ix in sample_ix)
# Parenthesised single-argument print runs identically on Python 2 and Python 3.
print(txt)

# References

These lab materials are based on Andrej Karpathy's blog post and implementation:

https://karpathy.github.io/2015/05/21/rnn-effectiveness/

https://gist.github.com/karpathy/d4dee566867f8291f086