# 1. RNN

To understand RNNs, one should first know that some data is sequential.  
Sequential data is literally data in a sequence; that is, the order of its elements matters.

For non-sequential data types, e.g. sets, the below would be true
```python
{1, 2, 3} == {1, 3, 2}
```
But for sequential data:
```python
[1, 2, 3] != [1, 3, 2]
```
A __time series__ is a type of sequential data in which data points are recorded successively over a period of time.

## RNN Basics

In [None]:
import torch

In [None]:
seq = torch.arange(1., 16.)

print(type(seq))
print(seq)
print(seq.size())

In [None]:
# Number of consecutive data points (time steps) taken into account per sequence
seq_length = 5
# Number of whole sequences of that length that the series can be cut into
batch_size = len(seq) // seq_length
# Number of features per time step
input_size = 1

In [None]:
# Reshape the flat series into (batch_size, seq_length, input_size).
# view() needs batch_size * seq_length * input_size to equal the number of
# elements in seq.
X = seq.view(batch_size, seq_length, input_size)

print(X.size())

In [None]:
import torch.nn as nn

In [None]:
# Number of features in hidden state
hidden_size = 10
# Number of RNN layers stacked
num_layers = 1

In [None]:
singleRNN = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    nonlinearity='tanh',
    batch_first=True,
    dropout=0,
    bidirectional=False
)

In [None]:
y, h = singleRNN(X)

print(y.size())    # (batch_size, seq_length, hidden_size * num_directions)
print(h.size())    # (num_layers * num_directions, batch_size, hidden_size)

## Image Classification with RNN

In [None]:
import torchvision
import torchvision.transforms as transforms

In [None]:
transform = transforms.Compose([
    transforms.ToTensor()
])

trainset = torchvision.datasets.MNIST(root='./mnist', train=True, download=True, transform=transform)
testset = torchvision.datasets.MNIST(root='./mnist', train=False, transform=transform)

In [None]:
# Number of samples per mini-batch
batch_size = 1000
# 0 means the data is loaded in the main process (no worker subprocesses)
num_workers = 0

# Only the training data is shuffled; the test data keeps its original order.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [None]:
class ImageRNN(nn.Module):
    """RNN image classifier that reads each image as a sequence of feature rows.

    The input is reshaped to ``(batch, seq_length, input_size)`` and run through
    an ``nn.RNN``; the hidden states of all time steps are concatenated and fed
    to a linear layer that produces ``num_classes`` scores.

    Args:
        batch_size: Only stored on the instance; ``forward`` takes the actual
            batch size from its input.
        seq_length: Number of time steps per image.
        input_size: Number of features per time step.
        hidden_size: Number of features in the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        num_classes: Number of output classes.
    """

    def __init__(self, batch_size, seq_length, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes

        self.rnn = nn.RNN(self.input_size, self.hidden_size, self.num_layers, batch_first=True)
        # The classifier sees the hidden state of every time step, not only the last one.
        self.fc = nn.Linear(self.hidden_size * self.seq_length, self.num_classes)

    def forward(self, x, h0=None):
        """Return class scores of shape ``(batch, num_classes)``.

        Args:
            x: Tensor whose elements can be viewed as
                ``(batch, seq_length, input_size)``, e.g. ``(batch, 1, 28, 28)``
                image batches when ``seq_length == input_size == 28``.
            h0: Optional initial hidden state of shape
                ``(num_layers, batch, hidden_size)``. When ``None``, ``nn.RNN``
                starts from zeros.
        """
        # Use the configured sizes instead of hard-coded 28x28 so the model
        # also works for other image sizes.
        x = x.view(-1, self.seq_length, self.input_size)    # (batch, seq_length, input_size)
        out, _ = self.rnn(x, h0)    # (batch, seq_length, hidden_size)
        out = out.reshape(-1, self.seq_length * self.hidden_size)    # (batch, seq_length * hidden_size)
        return self.fc(out)    # (batch, num_classes)

In [None]:
import torch.optim as optim

In [None]:
seq_length = 28
input_size = 28
hidden_size = 50
num_layers = 1
num_classes = 10

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
model = ImageRNN(batch_size, seq_length, input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
epochs = 10

model.train()
for epoch in range(epochs):
    train_loss = 0
    train_correct = 0

    for x, y in trainloader:
        x, y = x.to(device), y.to(device)
        # Size the initial hidden state from the batch actually received, so a
        # smaller final batch (or a changed global batch_size) cannot cause a
        # shape mismatch.
        h0 = torch.zeros(num_layers, x.size(0), hidden_size, device=device)    # (num_layers * num_directions, batch_size, hidden_size)

        optimizer.zero_grad()
        outputs = model(x, h0)
        loss = criterion(outputs, y)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        train_correct += predicted.eq(y).sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    train_loss = train_loss / len(trainloader)
    train_acc = train_correct / len(trainset)

    print('[%2d] TRAIN loss: %.4f, acc: %.4f' % (epoch + 1, train_loss, train_acc))

In [None]:
test_loss = 0
test_correct = 0
test_preds = []

model.eval()
# No gradients are needed for evaluation.
with torch.no_grad():
    for x, y in testloader:
        x, y = x.to(device), y.to(device)
        # Size the initial hidden state from the batch actually received, so a
        # smaller final batch (or a changed global batch_size) cannot cause a
        # shape mismatch.
        h0 = torch.zeros(num_layers, x.size(0), hidden_size, device=device)    # (num_layers * num_directions, batch_size, hidden_size)

        outputs = model(x, h0)
        loss = criterion(outputs, y)

        test_loss += loss.item()
        _, predicted = outputs.max(1)
        test_correct += predicted.eq(y).sum().item()

        test_preds.extend(predicted.tolist())

# Loss is averaged over batches, accuracy over individual samples.
print('TEST loss: %.4f, acc: %.4f' % (test_loss/len(testloader), test_correct/len(testset)))

## Stacked RNN

In [None]:
batch_size = 3
input_size = 1
seq_length = 5
hidden_size = 10
num_layers = 4

In [None]:
stackedRNN = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True
)

In [None]:
X = seq.view(batch_size, seq_length, input_size)

In [None]:
y, h_n = stackedRNN(X)

## Bi-directional RNN

In [None]:
biRNN = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    bidirectional=True
)

In [None]:
y, h_n = biRNN(X)

In [None]:
print(y.size())    # (batch_size, seq_length, hidden_size * num_directions)
print(h_n.size())    # (num_layers * num_directions, batch_size, hidden_size)

In [None]:
# The last dimension of y holds the forward and backward hidden states side by
# side (hidden_size * 2); split it into an explicit direction axis:
# (batch_size, seq_length, num_directions, hidden_size)
y_bi = y.view(batch_size, seq_length, 2, hidden_size)

print(y_bi.size())

In [None]:
y_forward = y_bi[:,:,0,:]
y_backward = y_bi[:,:,1,:]

print(y_forward.size())
print(y_backward.size())

In [None]:
# h_n has shape (num_layers * num_directions, batch_size, hidden_size); split
# its first dimension into separate layer and direction axes:
# (num_layers, num_directions, batch_size, hidden_size)
h_n_bi = h_n.view(num_layers, 2, batch_size, hidden_size)

print(h_n_bi.size())

In [None]:
# h_n_bi is (num_layers, num_directions, batch_size, hidden_size), so the
# direction is selected on dim 1 (dim 2 is the batch dimension).
h_n_forward = h_n_bi[:, 0, :, :]     # (num_layers, batch_size, hidden_size)
h_n_backward = h_n_bi[:, 1, :, :]    # (num_layers, batch_size, hidden_size)

print(h_n_forward.size())
print(h_n_backward.size())

## LSTM


In [None]:
lstm = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    dropout=0,
    bidirectional=False
)

In [None]:
# nn.LSTM returns (output, (h_n, c_n)): unpack the final hidden state and the
# final cell state separately instead of binding the whole tuple to h_n.
y, (h_n, c_n) = lstm(X)

In [None]:
print(y.size())    # (batch_size, seq_length, hidden_size * num_directions)

## Character Prediction with RNN

In [None]:
char_set = ['d', 'e', 'h', 'l', 'o', 'r', 'w', ' ']

input_size = len(char_set)
hidden_size = 16
output_size = len(char_set)

In [None]:
# Input character indices into char_set (batch of one sequence)
x = [[2, 1, 3, 3, 4, 7, 6, 4, 5, 3]] # hello worl
# One-hot encoding of x: one row per character, one column per char_set entry
x_onehot = [[[0, 0, 1, 0, 0, 0, 0, 0],  # h
             [0, 1, 0, 0, 0, 0, 0, 0],  # e
             [0, 0, 0, 1, 0, 0, 0, 0],  # l
             [0, 0, 0, 1, 0, 0, 0, 0],  # l
             [0, 0, 0, 0, 1, 0, 0, 0],  # o
             [0, 0, 0, 0, 0, 0, 0, 1],  # (space)
             [0, 0, 0, 0, 0, 0, 1, 0],  # w
             [0, 0, 0, 0, 1, 0, 0, 0],  # o
             [0, 0, 0, 0, 0, 1, 0, 0],  # r
             [0, 0, 0, 1, 0, 0, 0, 0]]]  # l

# Target indices: the input shifted left by one character
y = [[1, 3, 3, 4, 7, 6, 4, 5, 3, 0]] # ello world

X = torch.FloatTensor(x_onehot)
Y = torch.LongTensor(y)

In [None]:
class simpleRNN(torch.nn.Module):
    """Single-layer RNN followed by a linear layer applied at every time step."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Build the recurrent layer (batch-first) and the output projection."""
        super().__init__()
        self.hidden_dim = hidden_dim
        self.rnn = torch.nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a batch-first input sequence to per-step outputs of size ``output_dim``."""
        # Only the per-step outputs are needed; the final hidden state is discarded.
        step_outputs, _ = self.rnn(x)
        return self.fc(step_outputs)

In [None]:
model = simpleRNN(input_size, hidden_size, output_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), 0.1)

In [None]:
import numpy as np

In [None]:
epochs = 5

model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X)    # (batch_size, seq_length, output_size)
    # Flatten batch and time so every character is one classification sample.
    # The logits have output_size columns (it merely happens to equal input_size here).
    loss = criterion(outputs.view(-1, output_size), Y.view(-1))

    loss.backward()
    optimizer.step()

    # detach() instead of .data, and torch's argmax instead of a numpy
    # round-trip, so this also works when the model lives on a GPU.
    predicted = outputs.detach().argmax(dim=2)    # (batch_size, seq_length)
    # The batch holds a single sequence; decode it back to text.
    prediction = ''.join(char_set[c] for c in predicted.squeeze(0).tolist())
    print('[%2d] TRAIN loss: %.4f, pred: %s' % (epoch + 1, loss.item(), prediction))

## Gender Classification with RNN

In [None]:
char_set = ['a', 'd', 'e', 'h', 'i', 'n', 'o', 'p', 'r', 's', 'w']
input_size = len(char_set)
hidden_size = 22
output_size = 1

In [None]:
x = [[0, 5, 1, 8, 2, 10], # andrew,
     [9, 6, 7, 3, 4, 0]]  # sophia

x_onehot = [[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # a
             [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # n
             [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # d
             [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],  # r
             [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # e
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], # w
            
            [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # s
             [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # o
             [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],  # p
             [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],  # h
             [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],  # i
             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]] # a

y = [[0],  # Male
     [1]]  # Female

X = torch.FloatTensor(x_onehot)
Y = torch.FloatTensor(y)

In [None]:
class simpleRNN(torch.nn.Module):
    """Single-layer RNN followed by a linear layer applied at every time step."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Build the recurrent layer (batch-first) and the output projection."""
        super().__init__()
        self.rnn = torch.nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a batch-first input sequence to per-step outputs of size ``output_dim``."""
        # Only the per-step outputs are needed; the final hidden state is discarded.
        step_outputs, _ = self.rnn(x)
        return self.fc(step_outputs)

In [None]:
model = simpleRNN(input_size, hidden_size, output_size)
criterion = torch.nn.MSELoss()
optimizer = optim.Adam(model.parameters(), 0.1)

In [None]:
epochs = 20

model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X)    # (batch_size, seq_length, output_size)
    # Score each name from the output at its last character only.
    # squeeze(-1) removes just the size-1 output dimension, so a batch with a
    # single name still yields a 1-D tensor.
    last_step = outputs[:, -1, :].squeeze(-1)    # (batch_size,)
    loss = criterion(last_step, Y.view(-1))

    loss.backward()
    optimizer.step()

    # Scores below 0.5 are read as label 0 (Male), everything else as label 1 (Female).
    predicted = ["Male" if score < 0.5 else "Female" for score in last_step.tolist()]
    print('[%2d] TRAIN loss: %.4f, pred: %s' % (epoch + 1, loss.item(), predicted))

## Seq2Seq

In [None]:
raw = ["I feel hungry.	나는 배가 고프다.",
       "Pytorch is very easy.	파이토치는 매우 쉽다.",
       "Pytorch is a framework for deep learning.	파이토치는 딥러닝을 위한 프레임워크이다.",
       "Pytorch is very clear to use.	파이토치는 사용하기 매우 직관적이다."]

In [None]:
# Fix token for "start of sentence" and "end of sentence"
SOS_token = 0
EOS_token = 1

In [None]:
# Class for vocabulary related information of data
class Vocab:
    """Word <-> index mapping with per-word occurrence counts.

    The vocabulary starts with the reserved "<SOS>" and "<EOS>" tokens, mapped
    to ``SOS_token`` and ``EOS_token``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # word -> index
        self.vocab2index = {"<SOS>": SOS_token, "<EOS>": EOS_token}
        # index -> word
        self.index2vocab = {SOS_token: "<SOS>", EOS_token: "<EOS>"}
        # word -> number of occurrences seen by add_vocab
        self.vocab_count = {}
        # number of known words; also the index given to the next new word
        self.n_vocab = len(self.vocab2index)

    def add_vocab(self, sentence):
        """Register every space-separated word of ``sentence`` and count it.

        New words get the next free index. Words that are already known,
        including the reserved tokens (which start without a count), only
        have their count increased.
        """
        for word in sentence.split(" "):
            if word not in self.vocab2index:
                self.vocab2index[word] = self.n_vocab
                self.index2vocab[self.n_vocab] = word
                self.n_vocab += 1
            # .get() covers words without a count entry yet (new words and the
            # reserved tokens), which previously raised KeyError for the latter.
            self.vocab_count[word] = self.vocab_count.get(word, 0) + 1

In [None]:
# Filter out the long sentence from source and target data
def filter_pair(pair, source_max_length, target_max_length):
    """Return True when both sentences of ``pair`` are shorter than their limits.

    Length is the number of space-separated words; ``pair[0]`` is checked
    against ``source_max_length`` and ``pair[1]`` against ``target_max_length``.
    """
    # Reject on the source side first; the target is only inspected afterwards.
    if len(pair[0].split(" ")) >= source_max_length:
        return False
    return len(pair[1].split(" ")) < target_max_length

In [None]:
# Read and preprocess the corpus data
def preprocess(corpus, source_max_length, target_max_length):
    """Turn raw tab-separated lines into sentence pairs plus two vocabularies.

    Every line is stripped, lower-cased and split on tabs. Pairs rejected by
    ``filter_pair`` are dropped, and the words of the remaining pairs are
    added to a source and a target ``Vocab``. Progress is printed along the way.

    Returns:
        A tuple ``(pairs, source_vocab, target_vocab)``.
    """
    print("...Reading corpus...")
    pairs = [line.strip().lower().split("\t") for line in corpus]
    print("Read {} sentence pairs".format(len(pairs)))

    # Keep only the pairs whose sentences are short enough.
    kept_pairs = []
    for candidate in pairs:
        if filter_pair(candidate, source_max_length, target_max_length):
            kept_pairs.append(candidate)
    pairs = kept_pairs
    print("Trimmed to {} sentence pairs".format(len(pairs)))

    source_vocab = Vocab()
    target_vocab = Vocab()

    print("...Counting words...")
    for kept in pairs:
        source_sentence, target_sentence = kept[0], kept[1]
        source_vocab.add_vocab(source_sentence)
        target_vocab.add_vocab(target_sentence)
    print("source vocab size =", source_vocab.n_vocab)
    print("target vocab size =", target_vocab.n_vocab)

    return pairs, source_vocab, target_vocab

In [None]:
class Encoder(nn.Module):
    """GRU encoder that consumes one token per call.

    ``input_size`` is the number of embedding rows (source vocabulary size);
    ``hidden_size`` is both the embedding width and the GRU state size.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, x, hidden):
        """Embed token ``x``, advance the GRU by one step, return ``(output, hidden)``."""
        embedded = self.embedding(x)
        # Present the embedding to the GRU as one time step of one sequence.
        step_input = embedded.view(1, 1, -1)
        output, next_hidden = self.gru(step_input, hidden)
        return output, next_hidden

In [None]:
class Decoder(nn.Module):
    """GRU decoder that emits log-probabilities for one token per call.

    ``hidden_size`` is both the embedding width and the GRU state size;
    ``output_size`` is the number of embedding rows and of output scores
    (target vocabulary size).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Advance one step from token ``x``; return ``(log_probs, hidden)``.

        ``log_probs`` has shape ``(1, output_size)``.
        """
        embedded = self.embedding(x)
        # Present the embedding to the GRU as one time step of one sequence.
        step_input = embedded.view(1, 1, -1)
        gru_output, next_hidden = self.gru(step_input, hidden)
        # Drop the time axis before projecting onto the vocabulary.
        scores = self.out(gru_output[0])
        log_probs = self.softmax(scores)
        return log_probs, next_hidden

In [None]:
# Convert sentence to the index tensor
def tensorize(vocab, sentence):
    """Convert ``sentence`` to a column tensor of word indices ending in "<EOS>".

    Args:
        vocab: Object whose ``vocab2index`` maps words to integer indices.
        sentence: Space-separated words; every word must be in the vocabulary.

    Returns:
        An int64 tensor of shape ``(number_of_words + 1, 1)`` on ``device``.

    Raises:
        KeyError: If a word is missing from ``vocab.vocab2index``.
    """
    indexes = [vocab.vocab2index[word] for word in sentence.split(" ")]
    indexes.append(vocab.vocab2index["<EOS>"])
    # Build the int64 tensor directly on the target device rather than going
    # through a float32 CPU tensor, which cannot represent large indices exactly.
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

In [None]:
# Training seq2seq
def train(pairs, source_vocab, target_vocab, encoder, decoder, n_iter, print_every=1000, learning_rate=0.01):
    """Train the encoder/decoder on ``n_iter`` randomly drawn sentence pairs.

    Each iteration encodes one source sentence token by token, then decodes
    the target with teacher forcing, summing the NLL loss over target tokens
    and updating both models with SGD. Every ``print_every`` iterations the
    mean per-token loss since the last report is printed.
    """
    running_loss = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # Draw all training samples up front and convert them to index tensors.
    sampled_pairs = [random.choice(pairs) for _ in range(n_iter)]
    source_tensors = [tensorize(source_vocab, sample[0]) for sample in sampled_pairs]
    target_tensors = [tensorize(target_vocab, sample[1]) for sample in sampled_pairs]

    criterion = nn.NLLLoss()

    for step, (source_tensor, target_tensor) in enumerate(zip(source_tensors, target_tensors), start=1):
        encoder_hidden = torch.zeros([1, 1, encoder.hidden_size]).to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        target_length = target_tensor.size(0)

        # Feed the source sentence one token at a time; only the final hidden
        # state is carried over to the decoder.
        for source_token in source_tensor:
            _, encoder_hidden = encoder(source_token, encoder_hidden)

        decoder_input = torch.Tensor([[SOS_token]]).long().to(device)
        decoder_hidden = encoder_hidden

        loss = 0
        for target_token in target_tensor:
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_token)
            # Teacher forcing: the ground-truth token is the next decoder input.
            decoder_input = target_token

        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        # Track the loss per target token so sentences of different length compare.
        running_loss += loss.item() / target_length

        if step % print_every == 0:
            loss_avg = running_loss / print_every
            running_loss = 0
            print("[{} - {}%] loss = {:05.4f}".format(step, step / n_iter * 100, loss_avg))

In [None]:
# Insert given sentence to check the training
def evaluate(pairs, source_vocab, target_vocab, encoder, decoder, target_max_length):
    """Greedily decode the source sentence of every pair and print the result.

    For each pair this prints the source sentence ("> "), the reference
    target ("= ") and the decoded sentence ("< "). Decoding stops at the
    first "<EOS>" prediction or after ``target_max_length`` steps.

    Args:
        pairs: Iterable of ``[source_sentence, target_sentence]`` pairs.
        source_vocab: Vocabulary used to index the source sentences.
        target_vocab: Vocabulary used to map predicted indices back to words.
        encoder: Encoder module exposing ``hidden_size``.
        decoder: Decoder module returning ``(scores, hidden)`` per step.
        target_max_length: Maximum number of decoding steps per sentence.
    """
    # Inference only: disable autograd so no graph is built for the decoding steps.
    with torch.no_grad():
        for pair in pairs:
            print(">", pair[0])
            print("=", pair[1])
            source_tensor = tensorize(source_vocab, pair[0])
            source_length = source_tensor.size(0)
            encoder_hidden = torch.zeros([1, 1, encoder.hidden_size]).to(device)

            for ei in range(source_length):
                _, encoder_hidden = encoder(source_tensor[ei], encoder_hidden)

            decoder_input = torch.tensor([[SOS_token]], dtype=torch.long, device=device)
            decoder_hidden = encoder_hidden
            decoded_words = []

            for di in range(target_max_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                # Greedy decoding: take the single most likely token.
                _, top_index = decoder_output.topk(1)
                if top_index.item() == EOS_token:
                    decoded_words.append("<EOS>")
                    break
                decoded_words.append(target_vocab.index2vocab[top_index.item()])

                # The predicted token becomes the next decoder input.
                decoder_input = top_index.squeeze()

            predict_sentence = " ".join(decoded_words)
            print("<", predict_sentence)
            print("")

In [None]:
# Declare max length for sentence
SOURCE_MAX_LENGTH = 10
TARGET_MAX_LENGTH = 12

In [None]:
import random

In [None]:
# Preprocess the corpus
load_pairs, load_source_vocab, load_target_vocab = preprocess(raw, SOURCE_MAX_LENGTH, TARGET_MAX_LENGTH)
print(random.choice(load_pairs))

In [None]:
# Declare the encoder and the decoder
enc_hidden_size = 16
dec_hidden_size = enc_hidden_size
enc = Encoder(load_source_vocab.n_vocab, enc_hidden_size).to(device)
dec = Decoder(dec_hidden_size, load_target_vocab.n_vocab).to(device)

In [None]:
# Train seq2seq model
train(load_pairs, load_source_vocab, load_target_vocab, enc, dec, 5000, print_every=1000)

In [None]:
# Check the model with given data
evaluate(load_pairs, load_source_vocab, load_target_vocab, enc, dec, TARGET_MAX_LENGTH)