# Character-level RNN model

Author: Alex Beatson

Data I/O adapted from Andrej Karpathy's CharRNN gist: https://gist.github.com/karpathy/d4dee566867f8291f086

See his blog post for some fun applications of RNNs: http://karpathy.github.io/2015/05/21/rnn-effectiveness/

BSD License

## Design notes:

All TensorFlow computation is wrapped in the RNN class.
All non-TF computation (except feeding inputs) happens outside the class.

"private" class methods, preceded by an underscore (e.g. _init_params, _rnn_step), are accessed only within RNN.

"public" class methods without underscore (run_train, run_sample) accessed outside RNN.

All placeholders are defined in _build_graph, and all placeholder values are fed in by public methods.

## Student note:

You should focus on understanding the RNN methods _init_params, _rnn_step, and _forward.

In [1]:
import numpy as np
import tensorflow as tf

In [None]:
class RNN(object):
    '''Character-level vanilla RNN written from scratch with TensorFlow ops.

    The constructor creates the parameters, builds both the training graph and
    the single-step sampling graph, opens a tf.Session and initializes all
    variables. Use run_train / run_sample from outside the class, and close()
    when the model is no longer needed.
    '''

    def __init__(self, batch_size, embedding_size, hidden_size, vocab_size, seq_length,
                 learning_rate, decay_steps, decay_factor, max_grad, sample_len):
        '''Set the hyperparameters and define the computation graph.

        Args:
            batch_size: nominal number of sequences per training batch. Stored
                for reference; the graph reads the actual batch size from the
                fed inputs.
            embedding_size: dimensionality of the learned character embedding.
            hidden_size: dimensionality of the hidden state.
            vocab_size: number of distinct characters in the vocabulary.
            seq_length: number of timesteps the RNN is unrolled for in training.
            learning_rate: initial learning rate handed to the decay schedule.
            decay_steps: decay_steps argument of tf.train.exponential_decay.
            decay_factor: decay_rate argument of tf.train.exponential_decay.
            max_grad: every gradient element is clipped to
                [-max_grad, max_grad] before the update; None disables clipping.
            sample_len: stored but not used inside the class (run_sample takes
                the sample length as its own argument).
        '''
        # hyperparameters
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size  # number of chars in vocab
        self.seq_length = seq_length  # number of steps to unroll the RNN for
        self.initial_learning_rate = learning_rate
        self.decay_steps = decay_steps
        self.decay_factor = decay_factor
        self.max_grad = max_grad
        self.sample_len = sample_len

        # this var keeps track of the train steps within the RNN
        self.global_step = tf.Variable(0, trainable=False)

        # create vars and graph
        self._init_params()
        self._build_graph()

    def _init_params(self):
        '''Create the model parameters.

        Weight matrices are drawn from a normal distribution (mean 0,
        stddev 0.2); both biases start at zero.
        '''
        # We learn an embedding for each character jointly with the other model params
        self.embedding = tf.Variable(
            tf.random_normal([self.vocab_size, self.embedding_size], mean=0, stddev=0.2))

        # input -> hidden
        self.U = tf.Variable(
            tf.random_normal([self.embedding_size, self.hidden_size], mean=0, stddev=0.2))

        # hidden -> hidden (the recurrent connection)
        self.W = tf.Variable(
            tf.random_normal([self.hidden_size, self.hidden_size], mean=0, stddev=0.2))

        self.bh = tf.Variable(tf.zeros([1, self.hidden_size]))

        # hidden -> output logits
        self.V = tf.Variable(
            tf.random_normal([self.hidden_size, self.vocab_size], mean=0, stddev=0.2))

        self.by = tf.Variable(tf.zeros([1, self.vocab_size]))

    def _rnn_step(self, x, h):
        '''Perform the RNN computation for one timestep.

        In practical applications, you should almost always use TensorFlow's built-in RNN cells,
        from tf.contrib.rnn. However for teaching purposes we are writing the RNN from scratch here.

        Args:
            x: embedded input for this timestep, [batch, embedding_size].
            h: hidden state from the previous timestep, [batch, hidden_size].

        Returns:
            (y, h): the unnormalized output logits, [batch, vocab_size], and
            the new hidden state, [batch, hidden_size].
        '''
        h = tf.nn.sigmoid(tf.matmul(x, self.U) + tf.matmul(h, self.W) + self.bh)
        y = tf.matmul(h, self.V) + self.by

        return y, h

    def _forward(self, inputs):
        '''Perform the forward pass for all timesteps in a sequence.

        Args:
            inputs: int tensor of character indices, [batch, seq_length].

        Returns:
            A list of seq_length logit tensors, one per timestep.
        '''
        # Size the zero initial hidden state from the fed batch rather than
        # from self.batch_size: the input placeholder leaves the batch
        # dimension open, so any batch size can be fed.
        n_examples = tf.shape(inputs)[0]
        h = tf.zeros(tf.stack([n_examples, self.hidden_size]))

        y = []
        for t in range(self.seq_length):
            x = tf.nn.embedding_lookup(self.embedding, inputs[:, t])
            y_t, h = self._rnn_step(x, h)
            y.append(y_t)

        return y

    def _sample_one(self, input_character, input_hidden, temperature):
        '''Sample the single next character in a sequence.

        We can use this to sample sequences of any length w/o having to alter
        the tensorflow graph.

        Args:
            input_character: scalar int tensor, index of the current character.
            input_hidden: hidden state to continue from, [1, hidden_size].
            temperature: scalar the logits are divided by before sampling.

        Returns:
            (next_sample, h): the sampled character index and the new hidden
            state.
        '''
        # We expand dims because tf expects a batch
        character = tf.expand_dims(input_character, 0)

        # Get the embedding for the input character
        x = tf.nn.embedding_lookup(self.embedding, character)

        # Perform one step of the RNN
        y, h = self._rnn_step(x, input_hidden)

        # Dividing the logits (unnormalized log-probabilities) by the
        # temperature before tf.multinomial is equivalent to adding
        # temperature to a softmax before sampling
        y_temperature = y / temperature

        # We use tf.squeeze to remove the unnecessary [batch, num_samples] dims
        # We do not manually softmax - tf.multinomial softmaxes the tensor we pass it
        next_sample = tf.squeeze(tf.multinomial(y_temperature, 1))

        return next_sample, h

    def _build_graph(self):
        '''Build the computation graphs for training and sampling.

        All placeholders are defined in this method. Once the graph is
        complete, a session is started and all variables are initialized.
        '''
        # ---- Sampling graph ----
        self.sample_input_char = tf.placeholder(dtype=tf.int32, shape=[])
        self.sample_input_hidden = tf.placeholder(dtype=tf.float32, shape=[1, self.hidden_size])

        self.temperature = tf.placeholder(tf.float32, shape=[])

        self.next_sample, self.next_hidden = self._sample_one(
            self.sample_input_char, self.sample_input_hidden, self.temperature)

        # ---- Training graph ----
        self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, self.seq_length])
        self.targets = tf.placeholder(dtype=tf.int32, shape=[None, self.seq_length])
        self.predictions = self._forward(self.inputs)

        cost_per_timestep_per_example = [
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.predictions[t],
                labels=self.targets[:, t])
            for t in range(self.seq_length)
        ]

        # Use reduce_mean over the examples in batch so that we don't need to
        # change the learning rate when we change the batch size.
        cost_per_timestep = [tf.reduce_mean(cost) for cost in cost_per_timestep_per_example]

        # Total cost is cost summed over timesteps.
        self.cost = tf.reduce_sum(cost_per_timestep)

        # Decay the learning rate according to a schedule.
        self.learning_rate = tf.train.exponential_decay(self.initial_learning_rate,
                                                        self.global_step,
                                                        self.decay_steps,
                                                        self.decay_factor)

        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        grads_and_vars = optimizer.compute_gradients(self.cost)

        # Apply the max_grad hyperparameter: clip every gradient element to
        # [-max_grad, max_grad] so one large gradient cannot blow up an update.
        if self.max_grad is not None:
            bound = float(self.max_grad)
            grads_and_vars = [
                (tf.clip_by_value(grad, -bound, bound), var)
                for grad, var in grads_and_vars
                if grad is not None
            ]

        self.train_step = optimizer.apply_gradients(
            grads_and_vars, global_step=self.global_step)

        # ---- Finished creating graph: start session and init vars ----
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def close(self):
        '''Release the TensorFlow session held by this model.'''
        self.sess.close()

    def run_train(self, input_chars, target_chars):
        '''Call this from outside the class to run a train step.

        Args:
            input_chars: character indices, [batch, seq_length].
            target_chars: indices of the characters to predict, same shape.

        Returns:
            (cost, lr): the cost and the learning rate evaluated in the same
            session call that runs the update.
        '''
        cost, lr, _ = self.sess.run(
            [self.cost, self.learning_rate, self.train_step],
            feed_dict={
                self.inputs: input_chars,
                self.targets: target_chars,
            })
        return cost, lr

    def run_sample(self, n, starter_character, temperature=1.0):
        '''Call this from outside the class to sample a length-n sequence from the model.

        Args:
            n: number of characters to sample.
            starter_character: index of the character fed in first; it is not
                included in the returned list.
            temperature: value the logits are divided by before sampling.

        Returns:
            A list of the n sampled character indices.
        '''
        sampled_chars = []
        current_char = starter_character
        h = np.zeros([1, self.hidden_size])

        for _ in range(n):
            # Each sampled character and hidden state is fed back in as the
            # input to the next step.
            current_char, h = self.sess.run(
                [self.next_sample, self.next_hidden],
                feed_dict={
                    self.sample_input_char: current_char,
                    self.sample_input_hidden: h,
                    self.temperature: temperature,
                })

            sampled_chars.append(current_char)

        return sampled_chars

In [None]:
'''Train and sample from our model'''

# data I/O
# The with-block closes the file as soon as its contents have been read.
with open('shakespeare.txt', 'r') as f:
    data = f.read()  # should be simple plain text file

# Sort the vocabulary so the char <-> index mapping does not depend on set
# iteration order.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}


# hyperparameters
embedding_size = 256  # size of embedding
hidden_size = 256  # size of hidden layers of neurons
seq_length = 100  # number of steps to unroll the RNN for
learning_rate = 1e-4
max_grad = 5
decay_steps = 10000
decay_factor = 0.99
sample_len = 500

batch_size = 32

n_train_steps = 100000

# model parameters
rnn = RNN(batch_size, embedding_size, hidden_size, vocab_size,
          seq_length, learning_rate, decay_steps, decay_factor,
          max_grad, sample_len)

# Loss of a uniform guess over the vocabulary, summed over the sequence: the
# loss expected at iteration 0. Kept as a reference value; it is not updated
# inside the loop.
smooth_loss = -np.log(1.0/vocab_size)*seq_length

for n in range(n_train_steps):

    # prepare inputs (integer character indices, so use an integer dtype)
    inputs = np.empty([batch_size, seq_length], dtype=np.int32)
    targets = np.empty([batch_size, seq_length], dtype=np.int32)

    for i in range(batch_size):
        # randomly index into the data for each example in batch; the upper
        # bound is exclusive, so the window plus its one-step-shifted target
        # always fits inside data
        random_index = np.random.randint(0, data_size - seq_length)
        window = [char_to_ix[ch] for ch in data[random_index:random_index+seq_length+1]]
        inputs[i, :] = window[:-1]
        # targets are the inputs shifted one character ahead
        targets[i, :] = window[1:]

    loss, lr = rnn.run_train(inputs, targets)

    # print progress
    if n % 100 == 0:
        print('iter %d, loss: %f, learning rate: %f' % (n, loss, lr))

    # sample from the model now and then
    if n % 1000 == 0:
        sample_ix = rnn.run_sample(sample_len, inputs[0, 0], 1.0)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

data has 1115394 characters, 65 unique.
iter 0, loss: 631.948120, learning rate: 0.000100
----
 ,:Iv,lUV,vP;EaPvIlmP&PvPIE;Dl;;l,FPPElUvEmllE,MEMv,lyPDEv&bEl,&&,lEUlIelPID?llllEePgGlufPvsDvU;aPPj-Us;:lEDdE&PPl,alIMlIPD&,r&EXDMMEdUMylDbIvjvUElg:P:lEUP'llIH;EPEPlvXEHlEIl;jjbIl&v&P&PlUfMPvEUvvlBuMvIPalIPvxl&VynPPPlDaPll;,&?g&PPlPUDlvv;vDl'HlDMM,cv&ll&PPUlbvXEPPEP;,,MUI:vEX&BvvBMlxvl'l';XqY&IP&UjlEPvlflE,lIIU:HXa;I!,l;llPHnE&aUEll:xU-zPPlMuPFP,EXDOPvE;,jXlluvXUxXvPl,&lM;,yEEl-'PPUPIPuPljHXlPvjMl3,EBllPPHP:ll&WX,lglPU?P:,DliMMKuP:MW&PEPyH&lMuPldD,&lMlE;lDl;l'jV&PI3gPPMl,M'llXTv3PP;V?EPjExMlyP&U 
----
iter 100, loss: 346.171997, learning rate: 0.000100
iter 200, loss: 300.678925, learning rate: 0.000100
iter 300, loss: 282.154327, learning rate: 0.000100
iter 400, loss: 265.376404, learning rate: 0.000100
iter 500, loss: 252.267639, learning rate: 0.000100
iter 600, loss: 250.253036, learning rate: 0.000100
iter 700, loss: 240.579346, learning rate: 0.000100
iter 800, loss: 234.779083, learn

## What is the cost after 25,000 train steps?

*Insert answer here*

## Let's try sampling with high temperature:

In [None]:
# Sample sample_len characters at temperature 100, seeded with the first
# character of the most recent training batch.
sample_ix = rnn.run_sample(sample_len, inputs[0, 0], 100)
txt = ''.join(ix_to_char[ix] for ix in sample_ix)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('----\n %s \n----' % (txt, ))

## Now with very low temperature:

In [None]:
# Sample sample_len characters at temperature 0.001, seeded with the first
# character of the most recent training batch.
sample_ix = rnn.run_sample(sample_len, inputs[0, 0], 0.001)
txt = ''.join(ix_to_char[ix] for ix in sample_ix)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('----\n %s \n----' % (txt, ))

## How do the samples qualitatively change? What effect does temperature have, mathematically, on the output distribution?

In the softmax function with a temperature T, we use e^(x_i / T) instead of e^(x_i):

output_i = e^(x_i / T) / Z,
where Z is the normalizer: Z = sum_j e^(x_j / T)

*Insert answer here*