# Probabilistic Time Series Analysis

## Week 5: Recurrent Neural Networks (Part 2)

Places where you are supposed to fill in code are marked as follows:

    #
    # TODO: some instructions
    # 
    
You will need to install the PyTorch package. You can find instructions here that will generate a pip or conda command for you to run:

- https://pytorch.org/get-started/locally/

# Setup

In [None]:
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
import argparse
import os
import string
import random
import time
import math

from tqdm import tqdm

%matplotlib inline

In [None]:
# Character vocabulary and file reading
#
# The vocabulary is every character in string.printable; a character's
# position in all_characters is the integer id used to represent it.

all_characters = string.printable
n_characters = len(all_characters)

def read_file(filename):
    """Read a text file and return its contents together with their length.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(text, length)`` tuple, where ``length`` is ``len(text)``.
    """
    # The context manager closes the handle even if read() raises; the
    # previous version left the file open until garbage collection.
    with open(filename) as handle:
        text = handle.read()
    return text, len(text)

# Turning a string into a tensor

def char_tensor(string):
    """Encode a string as a 1-D LongTensor of vocabulary indices.

    Each character is mapped to its position in ``all_characters``.
    Characters that are not part of the vocabulary are encoded as index 0
    (best-effort: they are neither dropped nor treated as an error), so the
    result always has exactly ``len(string)`` entries.

    Args:
        string: The text to encode. (The parameter name shadows the
            ``string`` module inside this function only.)

    Returns:
        A ``torch.long`` tensor of shape ``(len(string),)``.
    """
    indices = []
    for ch in string:
        position = all_characters.find(ch)
        # find() returns -1 for out-of-vocabulary characters; fall back to
        # index 0 instead of hiding the lookup failure behind a bare except.
        indices.append(position if position >= 0 else 0)
    return torch.tensor(indices, dtype=torch.long)

# Readable time elapsed

def time_since(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: A reference timestamp as returned by ``time.time()``.

    Returns:
        A string with the whole minutes and the remaining whole seconds
        that have passed between ``since`` and now.
    """
    elapsed = time.time() - since
    whole_minutes = math.floor(elapsed / 60)
    leftover_seconds = elapsed - 60 * whole_minutes
    return '%dm %ds' % (whole_minutes, leftover_seconds)

In [None]:
class CharRNN(nn.Module):
    """Character-level recurrent model: embedding -> GRU/LSTM -> linear layer.

    The model consumes one time step at a time: it takes a batch of
    character indices plus the current recurrent state and returns one row
    of scores over the output vocabulary per batch element, together with
    the updated state.

    Args:
        input_size: Number of distinct input indices (embedding rows).
        hidden_size: Width of the embedding and of every recurrent layer.
        output_size: Number of scores produced per input element.
        model: Recurrent cell type, ``"gru"`` or ``"lstm"`` (case-insensitive).
        n_layers: Number of stacked recurrent layers.

    Raises:
        ValueError: If ``model`` is neither ``"gru"`` nor ``"lstm"``.
    """

    def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1):
        super(CharRNN, self).__init__()
        self.model = model.lower()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        if self.model == "gru":
            self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
        elif self.model == "lstm":
            self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers)
        else:
            # Fail at construction time; otherwise self.rnn would be missing
            # and the first forward pass would die with an AttributeError.
            raise ValueError(
                "Unsupported model %r; expected 'gru' or 'lstm'" % (model,))
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the network by one time step for a whole batch.

        Args:
            input: Tensor of character indices whose first dimension is the
                batch size (one index per batch element).
            hidden: Recurrent state as produced by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(batch_size, output_size)`` and ``hidden`` is the new state.
        """
        batch_size = input.size(0)
        encoded = self.encoder(input)
        # The recurrent layer expects (seq_len, batch, features); seq_len is 1
        # because a single time step is processed per call.
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def forward2(self, input, hidden):
        """Advance the network by one time step for a single sequence.

        Variant of ``forward`` with the batch size fixed to 1; ``input`` must
        hold exactly one character index.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(1, output_size)`` and ``hidden`` is the new state.
        """
        encoded = self.encoder(input.view(1, -1))
        output, hidden = self.rnn(encoded.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        """Create an all-zero initial recurrent state.

        The state is allocated with the same dtype and on the same device as
        the model parameters, so it stays usable after the model is moved
        (for example with ``.cuda()``).

        Args:
            batch_size: Number of sequences processed in parallel.

        Returns:
            A tensor of shape ``(n_layers, batch_size, hidden_size)`` for a
            GRU, or a tuple of two such tensors for an LSTM.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_size)
        if self.model == "lstm":
            # The LSTM state is a pair of two independent tensors.
            return (reference.new_zeros(shape), reference.new_zeros(shape))
        return reference.new_zeros(shape)

In [None]:
def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False):
    """Sample text from a trained character model.

    The priming string is first fed through the model to build up its
    recurrent state; afterwards characters are sampled one at a time, each
    sampled character becoming the next input.

    Args:
        decoder: Model exposing ``init_hidden(batch_size)`` and callable as
            ``decoder(input, hidden) -> (output, hidden)``.
        prime_str: Non-empty string used to warm up the recurrent state; it
            is also the prefix of the returned text.
        predict_len: Number of characters to sample after ``prime_str``.
        temperature: Divisor applied to the output scores before sampling.
            Lower values concentrate probability on the highest-scoring
            characters, higher values flatten the distribution.
        cuda: If true, move the state and inputs to the GPU before use.

    Returns:
        ``prime_str`` followed by ``predict_len`` sampled characters.

    Raises:
        ValueError: If ``prime_str`` is empty.
    """
    if not prime_str:
        raise ValueError("prime_str must contain at least one character")

    hidden = decoder.init_hidden(1)
    prime_input = char_tensor(prime_str).unsqueeze(0)

    if cuda:
        # An LSTM state is an (h, c) tuple, which has no .cuda() method.
        if isinstance(hidden, tuple):
            hidden = tuple(state.cuda() for state in hidden)
        else:
            hidden = hidden.cuda()
        prime_input = prime_input.cuda()
    predicted = prime_str

    # Sampling never backpropagates, so skip building the autograd graph.
    with torch.no_grad():
        # Use priming string to "build up" hidden state
        for p in range(len(prime_str) - 1):
            _, hidden = decoder(prime_input[:, p], hidden)

        inp = prime_input[:, -1]

        for _ in range(predict_len):
            output, hidden = decoder(inp, hidden)

            # Sample from the network as a multinomial distribution. softmax
            # yields the same distribution as exp() of the scaled scores but
            # cannot overflow to inf when the temperature is small.
            output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
            top_i = int(torch.multinomial(output_dist, 1)[0])

            # Add predicted character to string and use as next input
            predicted_char = all_characters[top_i]
            predicted += predicted_char
            inp = char_tensor(predicted_char).unsqueeze(0)
            if cuda:
                inp = inp.cuda()

    return predicted

# Select Dataset

In [None]:
# Path of the training corpus (a relative path, so it depends on the
# directory the notebook is run from).
filename = '../../data/holmes.txt'

# `file` is the full corpus text and `file_len` its length in characters;
# random_training_set reads both as globals.
file, file_len = read_file(filename)

# Training

In [None]:
#
# Hyperparameters
#

model = 'gru'  # recurrent cell type passed to CharRNN: 'gru' or 'lstm'
n_epochs = 2000  # number of training iterations; each one draws a fresh random batch
print_every = 50  # report progress and print a sample every this many iterations
hidden_size = 100  # width of the embedding and of each recurrent layer
n_layers = 2  # number of stacked recurrent layers
learning_rate = 1e-2  # learning rate handed to the Adam optimizer
chunk_len = 150  # characters per training sequence
batch_size = 40  # sequences per training batch

In [None]:
def random_training_set(chunk_len, batch_size):
    """Draw a random batch of next-character prediction examples.

    Every example is a window of ``chunk_len + 1`` consecutive characters
    taken from the global corpus ``file``: the first ``chunk_len`` characters
    are the input and the same window shifted by one character is the target.

    Args:
        chunk_len: Number of characters in each input/target sequence.
        batch_size: Number of sequences in the batch.

    Returns:
        ``(inp, target)``, two ``torch.long`` tensors of shape
        ``(batch_size, chunk_len)``.

    Raises:
        ValueError: If the corpus has fewer than ``chunk_len + 1`` characters.
    """
    # A window spans chunk_len + 1 characters and randint() includes its
    # upper bound, so the last valid start is file_len - chunk_len - 1.
    # (Using file_len - chunk_len could produce a window one character short.)
    last_start = file_len - chunk_len - 1
    if last_start < 0:
        raise ValueError(
            "corpus has %d characters but at least %d are required"
            % (file_len, chunk_len + 1))

    inp = torch.zeros(batch_size, chunk_len, dtype=torch.long)
    target = torch.zeros(batch_size, chunk_len, dtype=torch.long)
    for bi in range(batch_size):
        start_index = random.randint(0, last_start)
        chunk = file[start_index:start_index + chunk_len + 1]
        inp[bi] = char_tensor(chunk[:-1])
        target[bi] = char_tensor(chunk[1:])
    return inp, target

def train(inp, target):
    """Run one optimization step on a batch and return its mean loss.

    The global ``decoder`` is unrolled over the sequence one character at a
    time, the global ``criterion`` is summed over all time steps, and the
    global ``decoder_optimizer`` applies a single update.

    Args:
        inp: Tensor of input character indices, shape
            ``(batch_size, chunk_len)``.
        target: Tensor of target character indices with the same shape.

    Returns:
        The summed loss divided by the sequence length, as a Python float.
    """
    # Take the sizes from the batch itself so the function keeps working if
    # it is handed a batch that differs from the global hyperparameters.
    n_sequences, n_steps = inp.size(0), inp.size(1)

    hidden = decoder.init_hidden(n_sequences)
    decoder.zero_grad()
    loss = 0

    for c in range(n_steps):
        output, hidden = decoder(inp[:, c], hidden)
        loss += criterion(output.view(n_sequences, -1), target[:, c])

    loss.backward()
    decoder_optimizer.step()

    # The loss is a 0-dim tensor; .item() extracts the Python number
    # (indexing it with [0] is an error in current PyTorch).
    return loss.item() / n_steps

# Initialize models and start training

decoder = CharRNN(
    n_characters,
    hidden_size,
    n_characters,
    model=model,
    n_layers=n_layers)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

start = time.time()
all_losses = []  # one entry per training iteration, in order
loss_avg = 0  # running sum of the per-iteration losses

try:
    # Single-argument print(...) calls behave the same under Python 2 and 3.
    print("Training for %d epochs..." % n_epochs)
    for epoch in tqdm(range(1, n_epochs + 1)):
        # Each "epoch" here is one optimization step on one random batch.
        loss = train(*random_training_set(chunk_len, batch_size))
        loss_avg += loss
        all_losses.append(loss)

        if epoch % print_every == 0:
            # Multiply by 100.0 first: under Python 2, epoch / n_epochs is
            # integer division and would report 0% until the final step.
            print('[%s (%d %d%%) %.4f]' % (
                time_since(start), epoch, 100.0 * epoch / n_epochs, loss))
            print(generate(decoder, 'Wh', 100, cuda=False))
            print('\n')
except Exception as e:
    # Report the failure but keep the losses collected so far available to
    # the evaluation cells. (`except e:` referenced an undefined name, which
    # turned every error into a NameError.)
    print(e)

# Evaluation

In [None]:
# 
# TODO: The variable all_losses contains the per-epoch losses (these are longer time units than in the last lab).
# Plot these losses. Try changing the learning rate or other hyperparameters until you get different behavior, and
# plot those losses as well. What seems to affect the rate of decrease? Do you notice a difference from last time?
#

# Placeholder: replace None with the data to plot.
plt.plot(None)

In [None]:
#
# TODO: Here is an example of sampling from the trained network. The parameter `temperature` can
# be tuned to vary the probability distribution from which we sample, like before.
# Pass in different values and print a few samples with each. What's the difference?
# Do you notice a difference in quality for the same text with similar hyperparameters
# between the LSTM / GRU and the naive implementation from last week?
#

# Example of sampling (print(...) with one argument works under Python 2 and 3):
print(generate(decoder, prime_str='A', predict_len=200, temperature=0.2))

In [None]:
#
# TODO: You can also make the `prime_str` parameter longer. Try making it a word or phrase and see if the completion
# is reasonable.
#

# References

This code is based on the following Python package:

- https://github.com/spro/char-rnn.pytorch