# LSTM text generator

[Original video](https://youtu.be/WujVlF_6h5A)

In [None]:
# Install Unidecode, which provides the `unidecode` module imported below
!pip install Unidecode

# Download example_names.txt and names.txt (names.txt is the text read for training below)
!wget https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Projects/text_generation_babynames/data/example_names.txt
!wget https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Projects/text_generation_babynames/data/names.txt

# Print the example file to the cell output
!cat example_names.txt

Collecting Unidecode
  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
     |█▍                              | 10kB 16.7MB/s eta 0:00:01
     |██▊                             | 20kB 20.2MB/s eta 0:00:01
     |████                            | 30kB 15.9MB/s eta 0:00:01
     |█████▍                          | 40kB 14.5MB/s eta 0:00:01
     |██████▉                         | 51kB 11.9MB/s eta 0:00:01
     |████████▏                       | 61kB 13.6MB/s eta 0:00:01
     |█████████▌                      | 71kB 10.4MB/s eta 0:00:01
     |██████████▉                     | 81kB 11.4MB/s eta 0:00:01
     |████████████▏                   | 92kB 11.1MB/s eta 0:00:01
     |█████████████▋                  | 102kB 11.0MB/s eta 0:00:01
     |███████████████                 | 112kB 11.0MB/s eta 0:00:01
     |████████████████▎               | 12

In [None]:
import os
import sys
import torch
import string
import random
import unidecode
import torch.nn as nn

from torch.utils.tensorboard import SummaryWriter

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Vocabulary: the characters of string.printable; a character's id is its
# position in this string
all_characters = string.printable
n_characters = len(all_characters)

# Read the training text and transliterate it to ASCII with unidecode.
# The `with` block closes the file handle as soon as the text is read.
with open('names.txt') as names_file:
    file = unidecode.unidecode(names_file.read())

In [None]:
# Run TensorBoard

# Delete the previous logs dir, if there is one
log_dir = 'runs/names0'
if os.path.exists(log_dir):
    !rm -rf $log_dir

# Workaround for the following error, raised when both PyTorch and TensorFlow
# are installed:
# AttributeError: module 'tensorflow._api.v2.io.gfile' has no attribute 'get_filesystem'
# The fix points tf.io.gfile at TensorBoard's own gfile stub.
import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Start TensorBoard before training to monitor it in progress
%tensorboard --logdir $log_dir

# Reload the TensorBoard notebook extension
%reload_ext tensorboard

In [None]:
class RNN(nn.Module):
    """Embedding -> stacked LSTM -> linear head, stepped one token at a time.

    Args:
        input_size: number of distinct token ids accepted by the embedding.
        hidden_size: width of the embedding and of every LSTM layer.
        num_layers: number of stacked LSTM layers.
        output_size: number of scores produced per step by the linear head.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embed = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Advance the LSTM by a single step.

        Args:
            x: 1-D tensor of token ids, one per batch element.
            hidden: LSTM hidden state, as returned by `init_hidden` or by a
                previous call.
            cell: LSTM cell state, same layout as `hidden`.

        Returns:
            `(scores, (hidden, cell))` where `scores` has one row per batch
            element and `output_size` columns, and the states are the
            updated LSTM states.
        """
        embedded = self.embed(x)
        # The LSTM is batch_first, so insert a sequence axis of length 1.
        out, (hidden, cell) = self.lstm(embedded.unsqueeze(1), (hidden, cell))
        # Fold the length-1 sequence axis away before the linear head.
        out = self.fc(out.reshape(out.shape[0], -1))
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zeroed `(hidden, cell)` states for `batch_size` sequences.

        The states are created on the same device and with the same dtype as
        the model's own parameters, so they stay compatible with the LSTM
        after the model is moved (`.to(...)`) or cast (`.double()`), without
        relying on any module-level device setting.
        """
        reference = self.fc.weight
        hidden = torch.zeros(
            self.num_layers, batch_size, self.hidden_size,
            device=reference.device, dtype=reference.dtype,
        )
        # Separate tensor, not an alias of `hidden`.
        cell = torch.zeros_like(hidden)
        return hidden, cell


class Generator(nn.Module):
    """Trains a character-level `RNN` on the module-level text and samples from it.

    Relies on names defined elsewhere in this file: `file` (training text),
    `all_characters` / `n_characters` (vocabulary), `device`, `log_dir`,
    `RNN` and `SummaryWriter`.

    NOTE(review): `train()` below shadows `nn.Module.train(mode)`. PyTorch's
    `Module.eval()` is implemented as `self.train(False)`, so calling
    `.eval()` on a `Generator` is expected to fail. The name is kept
    because existing callers use `Generator().train()`.
    """

    def __init__(self):
        super().__init__()
        self.chunk_len = 250      # characters per training sequence
        self.num_epochs = 2000    # number of optimisation steps (one random batch each)
        self.batch_size = 32      # sequences per batch
        self.print_every = 100    # print loss and a sample every N steps
        self.hidden_size = 256    # embedding / LSTM width
        self.num_layers = 2       # stacked LSTM layers
        self.lr = 0.003           # Adam learning rate

    def char_tensor(self, string):
        """Encode `string` as a 1-D long tensor of indices into `all_characters`.

        Raises:
            ValueError: if a character is not in `all_characters`.
        """
        return torch.tensor(
            [all_characters.index(ch) for ch in string], dtype=torch.long
        )

    def get_random_batch(self):
        """Sample `batch_size` random windows of the training text.

        Returns:
            `(text_input, text_target)`, two long tensors with `batch_size`
            rows and `chunk_len` columns. `text_target` is `text_input`
            shifted one character to the right in the source text.

        Raises:
            ValueError: if the text is shorter than `chunk_len + 1` characters.
        """
        # Each window needs chunk_len + 1 characters (input plus the shifted
        # target). random.randint includes both end points, so the last
        # valid start is len(file) - chunk_len - 1; starting one later would
        # produce a window that is one character short.
        max_start = len(file) - self.chunk_len - 1
        if max_start < 0:
            raise ValueError(
                f'training text has {len(file)} characters, '
                f'need at least {self.chunk_len + 1}'
            )

        text_input = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)
        text_target = torch.zeros_like(text_input)

        for row in range(self.batch_size):
            start_idx = random.randint(0, max_start)
            window = self.char_tensor(file[start_idx:start_idx + self.chunk_len + 1])
            text_input[row] = window[:-1]
            text_target[row] = window[1:]

        return text_input, text_target

    # generate some names
    def generate(self, initial_str='Ab', prediction_len=100, temperature=0.85):
        """Sample `prediction_len` characters that continue `initial_str`.

        Requires `self.rnn`, which is created by `train()`.

        Args:
            initial_str: non-empty seed text; it is fed through the network
                first and forms the start of the result.
            prediction_len: number of characters to sample.
            temperature: divisor applied to the scores before sampling;
                lower values make the choice more peaked.

        Returns:
            `initial_str` followed by the sampled characters.
        """
        initial_input = self.char_tensor(initial_str)
        pieces = [initial_str]

        # Sampling never back-propagates, so skip autograd bookkeeping.
        with torch.no_grad():
            hidden, cell = self.rnn.init_hidden(1)

            # Warm up the state on every seed character except the last,
            # which becomes the first input of the sampling loop.
            for char_id in initial_input[:-1]:
                _, (hidden, cell) = self.rnn(char_id.view(1).to(device),
                                             hidden, cell)
            last_char = initial_input[-1]

            for _ in range(prediction_len):
                output, (hidden, cell) = self.rnn(last_char.view(1).to(device),
                                                  hidden, cell)
                # Unnormalised weights; torch.multinomial accepts them as-is.
                output_dist = output.view(-1).div(temperature).exp()
                top_char = int(torch.multinomial(output_dist, 1)[0])
                predicted_char = all_characters[top_char]
                pieces.append(predicted_char)
                last_char = self.char_tensor(predicted_char)

        return ''.join(pieces)

    def train(self):  # train RNN
        """Create `self.rnn` and train it for `num_epochs` steps.

        Each step draws one random batch, unrolls the network over
        `chunk_len` characters, sums the per-character cross-entropy and
        takes one Adam step. The mean per-character loss is written to
        TensorBoard under 'Training loss' at every step, and printed together
        with a generated sample every `print_every` steps.
        """
        self.rnn = RNN(input_size=n_characters,
                       hidden_size=self.hidden_size,
                       num_layers=self.num_layers,
                       output_size=n_characters).to(device)
        optimizer = torch.optim.Adam(self.rnn.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()
        writer = SummaryWriter(log_dir)  # for TensorBoard
        print('=> Starting training')

        try:
            for epoch in range(1, self.num_epochs + 1):
                inputs, targets = self.get_random_batch()
                hidden, cell = self.rnn.init_hidden(self.batch_size)
                inputs = inputs.to(device)
                targets = targets.to(device)
                loss = 0

                # Feed the batch one character position at a time.
                for step in range(self.chunk_len):
                    output, (hidden, cell) = self.rnn(inputs[:, step], hidden, cell)
                    loss += criterion(output, targets[:, step])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Report the mean loss per character, as a plain float.
                loss = loss.item() / self.chunk_len

                if epoch % self.print_every == 0:
                    print(f'loss: {loss}')
                    print(self.generate())

                writer.add_scalar('Training loss', loss, global_step=epoch)
        finally:
            # Flush pending events and release the log file even on error.
            writer.close()

In [None]:
# Build a generator with its default hyperparameters and train it; the loss
# and a generated sample are printed every `print_every` steps
gennames = Generator()
gennames.train()

=> Starting training
loss: 1.9460123291015625
Abdra
Brastin
Decina
Doma
Tiell
Elis
Edwend
Leara
Alia
Olonte
Reanna
Denala
Mimmy
Avenney
Bronnith
Res
loss: 1.6196993408203124
Ablece
Derine
Catheris
Sherice
Walen
Denabette
Bernardine
Takalie
Deannol
Nicholas
Jeona
Wolla
Shanna

loss: 1.45254443359375
Abralian
Brendan
Grewvin
Dony
Derry
Candyn
Jean
Joshan
Junilyn
Julio
Tony
Tommy
Matmy
Rita
Terrence
Em
loss: 1.332224609375
Abel
Angela
Aranna
Austina
Elma
Cemarline
Denise
Dyanna
Amarie
Alexandra
Eleza
Florah
Francince
Gabrie
loss: 1.189349853515625
Abriella
Ann
Oliver
Angelica
Abriana
Auzie
Anahi
Baby
Elisa
Carlie
Carolyn
Joel
Kristina
Lakea
Lucy
Ro
loss: 1.1640946044921876
Abra
Caroline
Christen
Crystal
Darin
Danielle
Dornes
Ellino
Erin
Hannah
Jessie
Josephine
Kathy
Leonar

loss: 1.170986083984375
Abeth
Alexandra
Aurelia
Beatrice
Brea
Brittney
Carli
Carolyne
Cherie
Christin
Christina
Charlotte
Cori
loss: 1.1580269775390626
Abraley
Adelia
Alisha
Alan
Aleen
Avry
Allison
Ariza
Ashley
Aniyah
