# Character-level RNN model

Based on Python version by Alex Beatson

Translation to Julia by Sebastian Seung

Data I/O adapted from Andrej Karpathy's CharRNN gist: https://gist.github.com/karpathy/d4dee566867f8291f086

See his blog post for some fun applications of RNNs: http://karpathy.github.io/2015/05/21/rnn-effectiveness/

BSD License

## Design notes:
The composite type `RNN` contains the parameters, the hyperparameters, and the nodes of the computation graph.

Methods are associated with types through [multiple dispatch](https://docs.julialang.org/en/stable/manual/methods/#man-methods).

Methods whose names begin with an underscore (e.g. `_init_params`, `_rnn_step`) contain TF functions and are used to build the computation graphs for training and sampling. Placeholders are defined in `_build_graph`.

Methods without underscore (`run_train`, `run_sample`) run a TF session and feed placeholder values but otherwise contain no TF functions.

## Student note:
You should focus on understanding the RNN methods `_init_params`, `_rnn_step`, and `_forward`.

In [None]:
using TensorFlow
using Distributions   # for sampling from Categorical distribution

### `RNN` is a [composite type](https://docs.julialang.org/en/stable/manual/types/#composite-types) in Julia, which is analogous to an object in other languages.

In [None]:
"""
    RNN(batch_size, embedding_size, hidden_size, vocab_size, seq_length,
        initial_learning_rate, decay_steps, decay_factor[, global_step])

Mutable container for a character-level RNN: its hyperparameters, its weight
variables, the nodes of the sampling and training graphs, and the session.

The constructor assigns only the hyperparameters and `global_step`. All other
fields are left unassigned; `_init_params` fills in the weights and biases and
`_build_graph` fills in the graph nodes and the session.
"""
mutable struct RNN
    # --- hyperparameters (initialized by the inner constructor) ---
    batch_size
    embedding_size
    hidden_size
    vocab_size      # number of chars in vocab
    seq_length      # number of time steps to unroll the RNN
    initial_learning_rate
    decay_steps     # not used, as annealing the learning rate is not currently implemented in Julia version
    decay_factor    # not used either, for the same reason

    global_step

    # --- neural net weights and biases ---
    embedding
    U
    W
    bh
    V
    by

    # --- sampling graph ---
    sample_input_char
    sample_input_hidden
    next_y
    next_hidden

    # --- training graph ---
    inputs
    targets
    predictions
    cost
    train_step
    learning_rate

    sess

    # Inner constructor: sets the first nine fields and leaves the rest undefined.
    # `global_step` defaults to a fresh non-trainable variable; the default is only
    # evaluated when the caller does not supply one.
    function RNN(batch_size, embedding_size, hidden_size, vocab_size, seq_length,
                 initial_learning_rate, decay_steps, decay_factor,
                 global_step = Variable(0.0, trainable=false))
        return new(batch_size, embedding_size, hidden_size, vocab_size, seq_length,
                   initial_learning_rate, decay_steps, decay_factor, global_step)
    end
end

"""
    _init_params(self::RNN)

Create the trainable variables of the network and store them on `self`:
the character embedding, the weight matrices `U`, `W`, `V`, and the biases
`bh`, `by`. Weights are drawn from a normal distribution scaled by 0.2;
biases start at zero.
"""
function _init_params(self::RNN)
    vocab, embed, hidden = self.vocab_size, self.embedding_size, self.hidden_size

    # NOTE(review): `0.2` is a Float64 literal, so `0.2*randn(Float32, ...)`
    # promotes to a Float64 array while the biases below are Float32.
    # Confirm that this dtype mix is intended.
    random_weights(rows, cols) = Variable(0.2*randn(Float32, rows, cols))
    zero_bias(width) = Variable(zeros(Float32, 1, width))

    self.embedding = random_weights(vocab, embed)   # one learned row per character
    self.U = random_weights(embed, hidden)
    self.W = random_weights(hidden, hidden)
    self.bh = zero_bias(hidden)
    self.V = random_weights(hidden, vocab)
    self.by = zero_bias(vocab)
end

"""
    _rnn_step(self::RNN, x, h)

Perform the RNN computation for one timestep.

Takes the current input `x` and the previous hidden state `h`, and returns
`(y, h)`: the output `y` (the prediction of the next `x`) and the updated
hidden state.

In practical applications you should almost always use TensorFlow's built-in
RNN cells (from tf.contrib.rnn). For teaching purposes the RNN is written from
scratch here.
"""
function _rnn_step(self::RNN, x, h)
    # New hidden state: sigmoid of input drive + recurrent drive + bias.
    input_drive = x * self.U
    recurrent_drive = h * self.W
    new_h = nn.sigmoid(input_drive + recurrent_drive + self.bh)

    # Output is an affine read-out of the new hidden state.
    y = new_h * self.V + self.by

    return y, new_h
end
    
"""
    _forward(self::RNN, inputs)

Run the forward pass over all `self.seq_length` timesteps of a sequence.

Column `t` of `inputs` holds the character indices fed in at step `t`.
Returns a vector with one output tensor per timestep.
"""
function _forward(self::RNN, inputs)
    n_steps = self.seq_length

    # One output slot per timestep.
    outputs = Vector{TensorFlow.Tensor{Float32}}(n_steps)

    # The hidden state starts at zero for every example in the batch.
    state = constant(zeros(Float32, self.batch_size, self.hidden_size))

    for step in 1:n_steps
        embedded = nn.embedding_lookup(self.embedding, inputs[:, step])
        # Cast so the step input is Float32 like the initial state
        # (TODO confirm which dtype the lookup itself returns).
        step_input = cast(embedded, Float32)
        outputs[step], state = _rnn_step(self, step_input, state)
    end

    return outputs
end
    
"""
    _rnn_step_char(self::RNN, input_character, h)

Like `_rnn_step`, except that the input is a character index rather than an
embedding vector. Returns the same `(y, h)` pair.

This is used for sequence generation without having to alter the TensorFlow
graph.
"""
function _rnn_step_char(self::RNN, input_character, h)
    # TF expects a batch, so give the single character a leading dimension.
    batched_character = expand_dims(input_character, 1)

    # Look up the embedding of the character and take one ordinary RNN step.
    embedded = nn.embedding_lookup(self.embedding, batched_character)
    return _rnn_step(self, embedded, h)
end

"""
    _build_graph(self::RNN)

Build the computation graphs for sampling and for training, then start a
session and initialize all variables. All placeholders are defined here.

Expects the weight fields of `self` to be set already (see `_init_params`).
"""
function _build_graph(self::RNN)
    # ---- Sampling graph: one RNN step driven by a single character ----
    self.sample_input_char = placeholder(Int32, shape=[])
    self.sample_input_hidden = placeholder(Float32, shape=[1, self.hidden_size])

    self.next_y, self.next_hidden = _rnn_step_char(
        self, self.sample_input_char, self.sample_input_hidden)

    # ---- Training graph: the RNN unrolled over seq_length steps ----
    n_steps = self.seq_length
    self.inputs = placeholder(Int32, shape=[-1, n_steps])
    self.targets = placeholder(Int32, shape=[-1, n_steps])
    self.predictions = _forward(self, self.inputs)

    # Cross entropy of every example at every timestep.
    per_example_losses = [
        nn.sparse_softmax_cross_entropy_with_logits(
            logits = self.predictions[step],
            labels = self.targets[:, step])
        for step in 1:n_steps
    ]

    # Average over the examples in the batch so that the learning rate does
    # not need to change when the batch size changes.
    per_step_losses = map(reduce_mean, per_example_losses)

    # The total cost is the average over timesteps.
    self.cost = mean(per_step_losses)

    # Annealed learning rate. It is built and can be fetched, but the optimizer
    # below is given the un-annealed initial rate instead.
    self.learning_rate = self.initial_learning_rate * self.decay_factor ^ (self.global_step/self.decay_steps)

    # NOTE(review): the positional arguments after the learning rate are
    # presumably beta1, beta2, epsilon and the op name; confirm against the
    # TensorFlow.jl AdamOptimizer definition.
    optimizer = train.AdamOptimizer(self.initial_learning_rate, 0, 0.9, 1e-10, "Adam")
    self.train_step = train.minimize(optimizer, self.cost)

    # ---- Graph complete: start the session and initialize the variables ----
    self.sess = Session()

    run(self.sess, global_variables_initializer())
end

In [None]:
"""
    run_train(self::RNN, input_chars, target_chars)

Run one training step on a minibatch and return `(cost, lr)`: the cost of the
batch and the value of the `learning_rate` node.
"""
function run_train(self::RNN, input_chars, target_chars)
    fetches = [self.cost, self.learning_rate, self.train_step]
    feed = Dict(self.inputs => input_chars,
                self.targets => target_chars)

    # The third result belongs to the train op and is not needed.
    batch_cost, current_lr, _ = run(self.sess, fetches, feed)
    return batch_cost, current_lr
end

"""
    run_sample(self::RNN, n, starter_character, temperature=1.0)

Sample a length-`n` sequence of character indices from the model.

# Arguments
- `n`: number of characters to generate.
- `starter_character`: index of the character fed in at the first step. It is
  not itself part of the returned sequence.
- `temperature`: softmax temperature; the outputs are divided by it before
  normalization. Must be positive.

# Returns
A `Vector{Int}` of length `n` holding the sampled character indices.

# Throws
- `ArgumentError` if `temperature` is not positive.
"""
function run_sample(self::RNN, n, starter_character, temperature=1.0)
    # A zero temperature would divide by zero and produce NaN probabilities.
    temperature > 0 ||
        throw(ArgumentError("temperature must be positive, got $temperature"))

    sampled_chars = zeros(Int, max(n, 0))
    current_char = starter_character
    h = zeros(Float32, 1, self.hidden_size)   # hidden state starts at zero

    for i in 1:n
        current_output, h = run(self.sess, [self.next_y, self.next_hidden],
                                Dict(self.sample_input_char => current_char,
                                     self.sample_input_hidden => h
                                     )
                                )

        # Temperature softmax, element-wise. Subtracting the maximum first keeps
        # every exponent <= 0, so `exp` cannot overflow. The dotted calls are
        # required: a plain `exp(matrix)` is the matrix exponential in current
        # Julia rather than an element-wise operation.
        logits = vec(current_output)
        probs = exp.((logits .- maximum(logits)) ./ temperature)
        probs = probs ./ sum(probs)

        current_char = rand(Categorical(probs))
        sampled_chars[i] = current_char
    end
    return sampled_chars
end

In [None]:
# data I/O
# Read the whole corpus. The do-block form of `open` closes the file even if
# reading throws. The text is kept as a Vector{Char} rather than a String so
# that `length(data)` and `data[i:j]` both count characters; indexing a String
# uses byte offsets, which stop matching character counts as soon as the file
# contains a multi-byte UTF-8 character.
data = open("shakespeare.txt") do fid   # should be simple plain text file
    collect(readstring(fid))
end
chars = unique(data)
data_size, vocab_size = length(data), length(chars)
@printf "data has %d characters, %d unique.\n" data_size vocab_size

# Lookup tables between characters and their 1-based vocabulary indices.
char_to_ix = Dict(ch => i for (i, ch) in enumerate(chars))
ix_to_char = Dict(i => ch for (i, ch) in enumerate(chars))


# hyperparameters

# -- network shape --
embedding_size = 32     # length of the embedding vector of each character
hidden_size = 256       # number of units in the hidden layer
seq_length = 50         # number of time steps the RNN is unrolled for

# -- optimization --
batch_size = 128
initial_learning_rate = 0.01
decay_steps = 500.0
decay_factor = 0.9
n_train_steps = 100_000

# -- sampling --
sample_len = 500        # number of characters generated per sample

# model parameters
# Construct the model container, create its weights, then build the graphs
# (which also starts the session).
rnn = RNN(batch_size, embedding_size, hidden_size, vocab_size,
          seq_length, initial_learning_rate, decay_steps, decay_factor)
_init_params(rnn)
_build_graph(rnn)

# One loss value per training step, filled in by the training loop.
loss = fill(0.0, n_train_steps)

# Reusable minibatch buffers of character indices: one row per example,
# one column per time step.
inputs = fill(Int32(0), batch_size, seq_length)
targets = fill(Int32(0), batch_size, seq_length)
    
# Training loop. Every step fills the minibatch buffers with randomly placed
# windows of the corpus, runs one optimizer step, and periodically reports
# the loss and a sample of generated text.

# Each window needs seq_length input characters plus one more for the last target.
data_size > seq_length ||
    error("data must contain more than seq_length ($seq_length) characters, got $data_size")

for n = 1:n_train_steps
    for i = 1:batch_size
        # Randomly index into the data for each example in the batch. Sampling
        # from the integer range can never produce index 0, unlike
        # ceil(rand() * k) when rand() returns exactly 0.0.
        random_index = rand(1:(data_size - seq_length))
        inputs[i, :] = [char_to_ix[ch] for ch in data[random_index:random_index+seq_length-1]]
        # The targets are the inputs shifted one character ahead.
        targets[i, :] = [char_to_ix[ch] for ch in data[random_index+1:random_index+seq_length]]
    end

    # run_train also returns the learning rate, which is not needed here.
    loss[n] = first(run_train(rnn, inputs, targets))

    # print progress
    if n % 100 == 0
        @printf("iter %d, loss: %f\n", n, loss[n])
    end

    # sample from the model now and then
    if n % 1000 == 0
        sample_ix = run_sample(rnn, sample_len, inputs[1, 1], 1.0)
        txt = join([ix_to_char[ix] for ix in sample_ix])
        @printf("----\n %s \n----\n", txt)
    end
end

## What is the cost after 10,000 train steps (if using the default batch_size and seq_length)?

*Insert answer here*

## Let's try sampling with high temperature:

In [None]:
# Generate sample_len characters at temperature 100, seeded with the first
# character of the current input batch, and print them as text.
sample_ix = run_sample(rnn, sample_len, inputs[1, 1], 100.0)
txt = join([ix_to_char[ix] for ix in sample_ix])
@printf("----\n %s \n----\n", txt)

In [None]:
## Now with very low temperature:

In [None]:
# Generate sample_len characters at temperature 0.001, seeded with the first
# character of the current input batch, and print them as text.
sample_ix = run_sample(rnn, sample_len, inputs[1, 1], 0.001)
txt = join([ix_to_char[ix] for ix in sample_ix])
@printf("----\n %s \n----\n", txt)

## How do the samples qualitatively change? What does changing the temperature do to the distribution of possible outputs?
In the softmax function with a temperature $T$, we use $e^{x_i / T}$ instead of $e^{x_i}$:
$$\mathrm{output}_i = \frac{e^{x_i / T}}{Z}, \qquad \text{where } Z = \sum_j e^{x_j / T} \text{ is the normalizer.}$$

*Insert answer here*

Type Markdown and LaTeX:  $\alpha^2$