Based on the tutorial by Sean Robertson <a href="https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html">here</a>.

In [1]:
import glob
import os
import random
import string
import unicodedata
from io import open

import torch
import torch.nn as nn

In [2]:
# Root data directory, relative to the notebook's working directory; the
# per-category name files are globbed from '<DATA_DIR>/names/*.txt'.
DATA_DIR = '../../data'

In [3]:
# Alphabet the model works over: ASCII letters plus the punctuation marks
# kept in names. Every character must appear exactly once, otherwise the
# duplicate gets an index that str.find() can never return and the model
# carries a dead input/output dimension.
all_letters = string.ascii_letters + " .,;'-"
# Vocabulary size; the extra last index (n_letters - 1) is the EOS marker.
n_letters = len(all_letters) + 1  # + 1 for EOS

In [4]:
def find_files(path):
    '''Return the list of paths matching the glob pattern `path`.'''
    matches = glob.glob(path)
    return matches

In [5]:
def unicode_to_ascii(s):
    '''
    Reduce a Unicode string to the characters of `all_letters`.

    The string is NFD-decomposed so accents become separate combining
    marks; those marks (category 'Mn') and any character outside
    `all_letters` are dropped.
    '''
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark split off a base letter by NFD.
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

In [6]:
def read_lines(filename):
    '''
    Read a UTF-8 text file and return its lines converted to ASCII.

    The whole file is read, surrounding whitespace is stripped, and the
    text is split on newlines; each line is passed through
    `unicode_to_ascii`. The file is opened in a `with` block so the handle
    is closed even if reading or decoding fails.
    '''
    with open(filename, encoding='utf-8') as handle:
        raw_lines = handle.read().strip().split('\n')
    return [unicode_to_ascii(line) for line in raw_lines]

In [7]:
# Build the per-category corpus used by the rest of the notebook:
#   category_lines: category name -> list of ASCII-converted lines
#   all_categories: category names, in the order the files were found
#   n_categories:   number of categories loaded
category_lines = {}
all_categories = []
for filename in find_files('%s/names/*.txt' % DATA_DIR):
    # The category name is the file name without directory or extension.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines
# NOTE(review): find_files wraps glob.glob and the result is not sorted
# here, so the category order (and thus each category's index) follows
# whatever order the glob returns.
n_categories = len(all_categories)
if n_categories == 0:
    # No files matched the glob pattern under DATA_DIR.
    raise RuntimeError('Data not found')
print(f'# categories: {n_categories} {all_categories}')
# Quick sanity check of the accent stripping.
print(unicode_to_ascii("O'Néàl"))

# categories: 18 ['Czech', 'German', 'Arabic', 'Japanese', 'Chinese', 'Vietnamese', 'Russian', 'French', 'Irish', 'English', 'Spanish', 'Greek', 'Italian', 'Portuguese', 'Scottish', 'Dutch', 'Korean', 'Polish']
O'Neal


In [8]:
# Dropout probability handed to nn.Dropout inside the RNN module.
DROPOUT = 0.1

In [9]:
class RNN(nn.Module):
    '''
    Single-step recurrent cell conditioned on a category vector.

    At every step the category, the current input and the previous hidden
    state are concatenated and fed to two linear layers: `i2h` produces
    the next hidden state and `i2o` an intermediate output. Both results
    are concatenated and passed through `o2o`, dropout and log-softmax.

    Args:
        input_size: width of the per-step input vector.
        hidden_size: width of the hidden state.
        output_size: width of the output distribution.

    The width of the category vector comes from the module-level
    `n_categories`, and the dropout probability from `DROPOUT`.
    '''

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        combined_size = n_categories + input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(DROPOUT)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, input, hidden):
        '''
        Run one time step.

        All three arguments are concatenated along dim 1, so they must
        share the same size in dim 0.

        Returns:
            (output, hidden): the log-probabilities over the output
            dimension and the next hidden state.
        '''
        input_combined = torch.cat((category, input, hidden), 1)
        hidden = self.i2h(input_combined)
        # The intermediate output must come from i2o (not i2h) so that the
        # concatenation below has width hidden_size + output_size, which
        # is what o2o was built for.
        output = self.i2o(input_combined)
        output_combined = torch.cat((hidden, output), 1)
        output = self.o2o(output_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        '''Return an all-zero initial hidden state of shape (1, hidden_size).'''
        return torch.zeros(1, self.hidden_size)

In [10]:
def random_choice(lst):
    '''Return one uniformly chosen element of the sequence `lst`.'''
    position = random.randrange(len(lst))
    return lst[position]

In [17]:
def get_random_training_pair():
    '''
    Pick a random category, then a random line belonging to it.

    Returns the pair (category, line).
    '''
    chosen_category = random_choice(all_categories)
    chosen_line = random_choice(category_lines[chosen_category])
    return chosen_category, chosen_line

In [18]:
def make_category_tensor(category):
    '''Build a 1 x n_categories one-hot row marking `category`.'''
    position = all_categories.index(category)
    one_hot = torch.zeros(1, n_categories)
    one_hot[0, position] = 1
    return one_hot

In [19]:
def make_input_tensor(line):
    '''
    One-hot encode every letter of `line` (no EOS is appended).

    The result has shape (len(line), 1, n_letters). Each letter's column
    is its position in `all_letters`; for a character that is not in
    `all_letters`, str.find returns -1 and the last column is set.
    '''
    encoded = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        encoded[position, 0, all_letters.find(letter)] = 1
    return encoded

In [21]:
def make_target_tensor(line):
    '''
    Build the LongTensor of target indices for `line`.

    The targets are the `all_letters` positions of the second through
    last letters, followed by the EOS index (n_letters - 1).
    '''
    eos_index = n_letters - 1
    targets = [all_letters.find(letter) for letter in line[1:]]
    targets.append(eos_index)
    return torch.LongTensor(targets)

In [22]:
def random_training_example():
    '''
    Make category, input, and target tensors from a random category and
    line pair.

    Returns the tuple (category_tensor, input_line_tensor,
    target_line_tensor) built by make_category_tensor, make_input_tensor
    and make_target_tensor respectively.
    '''
    # The sampling helper defined in this file is get_random_training_pair;
    # the previous call to random_training_pair raised NameError.
    category, line = get_random_training_pair()
    category_tensor = make_category_tensor(category)
    input_line_tensor = make_input_tensor(line)
    target_line_tensor = make_target_tensor(line)
    return category_tensor, input_line_tensor, target_line_tensor

# Training the Network