<a href="https://colab.research.google.com/github/dcheney01/DeepLearning/blob/main/Sequence_to_Sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Description:
This is my version of [Karpathy's char-rnn model](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). This is a recurrent neural network that is trained probabilistically on sequences of characters, and that can then be used to sample new sequences that are like the original.

I implemented my own version of PyTorch's GRU and trained this model on the [SciFi Stories Text Corpus](https://www.kaggle.com/jannesklaas/scifi-stories-text-corpus). The model ended up learning punctuation pretty well, but the spelling needs some work.

I used the [PyTorch sequence-to-sequence tutorial](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) and [Understanding LSTM Networks](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) as resources for this project.

In [None]:
! tar -xzf text_files.tar.gz
! pip install unidecode
! pip install torch

import unidecode
import string
import random
import re

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time

from tqdm.notebook import tqdm
import pdb
 
# Character vocabulary: a character's position in this string is the integer
# id used by char_tensor() for encoding and by evaluate() for decoding.
all_characters = string.printable
# Vocabulary size; passed to RNN as both the input and the output size.
n_characters = len(all_characters)

tar (child): text_files.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now
Collecting unidecode
  Downloading Unidecode-1.3.3-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 12.4 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.3


In [None]:
# Turn a string into a 1-D LongTensor of character indices
def char_tensor(string):
  """Encode ``string`` as a 1-D ``torch.long`` tensor of character ids.

  Each character is replaced by its position in the module-level
  ``all_characters`` vocabulary.

  Raises:
    ValueError: if a character does not occur in ``all_characters``.
  """
  char_ids = [all_characters.index(symbol) for symbol in string]
  return torch.tensor(char_ids, dtype=torch.long)

In [None]:
# grab random chunk from text dataset
def random_chunk(file, chunk_len):
  """Return a random slice of ``file`` that is exactly ``chunk_len + 1`` long.

  The extra element lets the caller build an input sequence and a target
  sequence of length ``chunk_len`` that are shifted by one position.

  Args:
    file: the full corpus (a string or any sliceable sequence with a length).
    chunk_len: number of input/target positions the caller wants.

  Returns:
    ``file[start:start + chunk_len + 1]`` for a uniformly random valid start.

  Raises:
    ValueError: if ``file`` holds fewer than ``chunk_len + 1`` elements.
  """
  # random.randint is inclusive on both ends, so the largest legal start is
  # the one whose chunk ends on the final element. Allowing one more start
  # (as len(file) - chunk_len would) yields a chunk that is one element short.
  last_start = len(file) - chunk_len - 1
  if last_start < 0:
    raise ValueError(
        'file has %d elements but a chunk needs %d' % (len(file), chunk_len + 1))
  start_index = random.randint(0, last_start)
  return file[start_index:start_index + chunk_len + 1]

def random_training_set(file, chunk_len):
  """Draw one random (input, target) training pair from ``file``.

  A chunk is taken with ``random_chunk``; the input is the chunk without its
  last character and the target is the chunk without its first character, so
  ``target[i]`` is the character that follows ``inp[i]``. Both are encoded
  with ``char_tensor``.
  """
  text = random_chunk(file, chunk_len)
  return char_tensor(text[:-1]), char_tensor(text[1:])

In [None]:
class GRU(nn.Module):
  """Hand-written multi-layer GRU that advances one time step per call.

  Every layer owns six linear maps: input-to-gate (``ir``, ``iz``, ``i_n``)
  and hidden-to-gate (``hr``, ``hz``, ``hn``) for the reset gate, the update
  gate and the candidate state. The gate equations follow ``torch.nn.GRU``.

  Args:
    input_size: feature width of the tensor passed to ``forward`` as input.
    hidden_size: feature width of every layer's hidden state.
    num_layers: number of stacked layers.
  """

  def __init__(self, input_size, hidden_size, num_layers):
    super(GRU, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # Only the first layer sees the external input. Every deeper layer is fed
    # the hidden state produced by the layer below it, so its input-to-gate
    # maps must accept hidden_size features.
    layer_inputs = [input_size if layer == 0 else hidden_size
                    for layer in range(num_layers)]

    def input_maps():
      return nn.ModuleList([nn.Linear(width, hidden_size) for width in layer_inputs])

    def hidden_maps():
      # Hidden-to-gate maps always act on a hidden state of width hidden_size.
      return nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_layers)])

    # Creation order is kept (ir, hr, iz, hz, i_n, hn) so that parameter
    # initialisation consumes the random stream in the same order as before.
    self.ir = input_maps()
    self.hr = hidden_maps()

    self.iz = input_maps()
    self.hz = hidden_maps()

    self.i_n = input_maps()
    self.hn = hidden_maps()

  def forward(self, inputs, hidden):
    """Run one time step through all layers.

    Args:
      inputs: input features for this step; the last dimension must be
        ``input_size``.
      hidden: previous hidden states, indexed by layer along dimension 0;
        the last dimension must be ``hidden_size``.

    Returns:
      A tuple ``(output, new_hidden)``: ``output`` is the new hidden state of
      the top layer and ``new_hidden`` concatenates the new hidden state of
      every layer along a fresh leading dimension.
    """
    xt = inputs
    hidden_list = []

    for layer in range(self.num_layers):
      ht = hidden[layer]

      reset_gate = torch.sigmoid(self.ir[layer](xt) + self.hr[layer](ht))
      update_gate = torch.sigmoid(self.iz[layer](xt) + self.hz[layer](ht))
      # The reset gate scales only the hidden contribution to the candidate.
      n_t = torch.tanh(self.i_n[layer](xt) + (reset_gate * self.hn[layer](ht)))
      # Blend candidate and previous state; update_gate == 1 keeps the old state.
      ht_new = (1 - update_gate) * n_t + (update_gate * ht)

      # The new hidden state of this layer is the input of the next layer.
      xt = ht_new
      hidden_list.append(ht_new.unsqueeze(0))

    return ht_new, torch.cat(hidden_list, dim=0)

In [None]:
class RNN(nn.Module):
  """Character-level model: embedding -> stacked GRU -> linear read-out.

  Args:
    input_size: number of distinct input symbols (rows of the embedding).
    hidden_size: width of the embedding and of the GRU hidden state.
    output_size: number of output classes scored by the read-out layer.
    n_layers: number of stacked GRU layers.
  """

  def __init__(self, input_size, hidden_size, output_size, n_layers=1):
    super(RNN, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers

    # The embedding is indexed by *input* symbols, so its table needs
    # input_size rows (it was previously sized by output_size).
    self.embedding = nn.Embedding(input_size, hidden_size)
    # The GRU consumes the embedding output, which is hidden_size wide,
    # not input_size wide.
    self.gru = GRU(hidden_size, hidden_size, n_layers)
    # Translates from a hidden layer to actual output
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, input_char, hidden):
    """Score the next character given one input character.

    Args:
      input_char: index tensor for the current character; the embedding is
        reshaped to ``(1, 1, -1)``, so this is expected to hold one index.
      hidden: GRU hidden state, e.g. from ``init_hidden`` or a previous call.

    Returns:
      ``(scores, hidden)``: unnormalised class scores from the read-out layer
      and the updated hidden state.
    """
    embedded = F.relu(self.embedding(input_char).view(1, 1, -1))
    gru_out, hidden = self.gru(embedded, hidden)
    # Drop the leading singleton dimension before the read-out layer.
    scores = self.out(gru_out[0])
    return scores, hidden

  def init_hidden(self):
    """Return an all-zero hidden state of shape (n_layers, 1, hidden_size).

    The tensor is created on the device that holds the model's parameters.
    """
    return torch.zeros(self.n_layers, 1, self.hidden_size,
                       device=self.out.weight.device)

In [None]:
def train(decoder, decoder_optimizer, inp, target):
  """Run one optimisation step on a single (input, target) sequence pair.

  The decoder is stepped through the sequence one position at a time,
  starting from a fresh hidden state. The module-level ``criterion`` is
  evaluated at every position and the per-position losses are summed (not
  averaged) before back-propagation.

  Args:
    decoder: model exposing ``init_hidden()`` and ``decoder(char, hidden)``.
    decoder_optimizer: optimiser over the decoder's parameters.
    inp: tensor of input character ids, indexed along dimension 0.
    target: tensor of target character ids, indexed along dimension 0.

  Returns:
    The summed loss over the sequence as a Python float.
  """
  decoder_optimizer.zero_grad()

  hidden_state = decoder.init_hidden()
  total_loss = 0

  for step in range(target.size(0)):
    scores, hidden_state = decoder(inp[step], hidden_state)
    total_loss += criterion(scores, target[step].unsqueeze(dim=0))

  total_loss.backward()
  decoder_optimizer.step()

  return total_loss.item()

In [None]:
def sample_outputs(output, temperature):
    """Sample one index from unnormalised log-weights scaled by a temperature.

    Args:
      output: tensor of unnormalised scores (logits); the last dimension
        ranges over the candidates.
      temperature: divisor applied to the scores before normalisation. Values
        below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
      The tensor returned by ``torch.multinomial`` with one sampled index.
    """
    # softmax(x) is exp(x) normalised, i.e. the same distribution that
    # multinomial would derive from raw exp(x). But softmax subtracts the
    # maximum first, so large scores or small temperatures cannot overflow to
    # inf and make multinomial reject the weights.
    probabilities = torch.softmax(output / temperature, dim=-1)
    return torch.multinomial(probabilities, 1)

def evaluate(decoder, prime_str='A', predict_len=100, temperature=0.8):
  """Generate text from ``decoder`` after priming it with ``prime_str``.

  Args:
    decoder: model exposing ``init_hidden()`` and ``decoder(char, hidden)``.
    prime_str: non-empty seed text; it is included at the start of the result.
    predict_len: number of characters to generate after the seed.
    temperature: sampling temperature forwarded to ``sample_outputs``.

  Returns:
    ``prime_str`` followed by ``predict_len`` sampled characters.
  """
  with torch.no_grad():
    decoder_hidden = decoder.init_hidden()

    # Build up the hidden state from every priming character except the last.
    # The last one is the first input of the generation loop below, so feeding
    # it here as well would show it to the network twice.
    for char in prime_str[:-1]:
      _, decoder_hidden = decoder(char_tensor(char), decoder_hidden)

    decoder_input = char_tensor(prime_str[-1])
    decoded_words = [prime_str]

    for _ in range(predict_len):
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      # The sampled index is both the next emitted character and the next input.
      decoder_input = sample_outputs(decoder_output.squeeze(dim=0).squeeze(dim=0), temperature)
      decoded_words.append(all_characters[decoder_input.item()])

  return ''.join(decoded_words)

In [None]:
def train_model(decoder, decoder_optimizer, n_epochs, predict_len, file, chunk_len, print_every):
  """Train ``decoder`` on random chunks of ``file`` and report progress.

  Each "epoch" here is a single optimisation step on one random chunk.

  Args:
    decoder: the model to train.
    decoder_optimizer: optimiser over the decoder's parameters.
    n_epochs: number of training steps to run.
    predict_len: length of the sample text printed at each report.
    file: the training corpus.
    chunk_len: length of each random training chunk.
    print_every: print the loss and a generated sample every this many steps.

  Returns:
    A list with the mean loss of each completed group of 10 steps.
  """
  epoch_progress = tqdm(total=n_epochs, position=0, leave=False)
  start = time.time()

  all_losses = []
  loss_avg = 0

  try:
    for epoch in range(1, n_epochs + 1):
      loss_ = train(decoder, decoder_optimizer, *random_training_set(file, chunk_len))
      loss_avg += loss_

      if epoch % print_every == 0:
        print('[%s (%d %d%%) %.4f]' % (time.time() - start, epoch, epoch / n_epochs * 100, loss_))
        print(evaluate(decoder, 'Wh', predict_len), '\n')

      if epoch % 10 == 0:
        all_losses.append(loss_avg / 10)
        loss_avg = 0

      epoch_progress.set_description('epoch:{} loss:{:.4f} '.format(epoch, loss_))
      epoch_progress.update(1)
  finally:
    # A manually created progress bar must be closed explicitly, including
    # when training is interrupted by an exception.
    epoch_progress.close()

  print("Done Training, output below:")
  # Hand the loss history back so callers can plot or inspect it; it was
  # previously computed and then discarded.
  return all_losses

def output_model(decoder, predict_len):
    """Print ten generated samples, each primed with a random start string.

    For every sample the chosen start string is printed first, followed by
    the text produced by ``evaluate`` and a blank line.
    """
    start_strings = [" Th", " wh", " he", " I ", " ca", " G", " lo", "r", "T", "J", "B", "lo", "we", "po"]
    last_index = len(start_strings) - 1
    for _ in range(10):
        seed = start_strings[random.randint(0, last_index)]
        print(seed)
        print(evaluate(decoder, seed, predict_len), '\n')

In [None]:
# Run for SciFi dataset
# Read the corpus inside a context manager so the file handle is closed as
# soon as the text is in memory; unidecode then transliterates it to ASCII.
with open('/content/internet_archive_scifi_v3.txt') as corpus_file:
  scifi_file = unidecode.unidecode(corpus_file.read())

#               RNN(input_size,   hidden_size, output_size,  n_layers=1):
scifi_decoder = RNN(n_characters, 100,         n_characters, 3)

scifi_optimizer = torch.optim.Adam(scifi_decoder.parameters(), lr=0.001)
# train() reads this module-level loss function by name.
criterion = nn.CrossEntropyLoss()
 
# train_model(decoder,       decoder_optimizer,  n_epochs, predict_len, file,       chunk_len, print_every)
train_model(  scifi_decoder, scifi_optimizer,    10000,     100,         scifi_file, 200,       10000)

# output_model(decoder,       predict_len):
output_model(  scifi_decoder, predict_len=200)

  0%|          | 0/10000 [00:00<?, ?it/s]

[5316.354083299637 (10000 100%) 331.7824]
Whus and so the rate thace. About the direction one alter prout are gam. # Nical Russivise of Enjumped 

Done Training, output below:
r
rular or slaunded on his cautiously. "We don't yes -- there peed." "What have ruck back doing." "Let's seet of the beds to be bread things peridable. # Khy down and then wowen in the picked astonding a 

T
Tips of his desk to the selled about so which was all and on the great had endless to cold the liveral agail chad loares aptain to my change should do seour that she top stars that becail, good man bee 

we
wen back, he could exchand, someone this this some your told him on the stared you gill bit for through the seeting to even only the risons one before I heard strend many your flament your and study fou 

 he
 hel study plass of the must corrary manuse also down and reading his fast. He said the colds were in the hand by helping peristical soul time of the very love his barracing out the man ready levers s