In [1]:
import numpy as np
import glob
import unicodedata
import string
import os
from io import open

import torch
import torch.nn as nn

In [2]:
def unicode_to_ascii(s):
    """Unicode string to plain ASCII.
    
    """
    decoded = "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
        and c in all_letters
    )
    return decoded


def read_lines(filename):
    """Read a file and split into lines.
    
    """
    lines = open(filename, encoding="utf-8").read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]


def letter_to_index(letter):
    """Find letter index from all_letters.
    
    """
    return all_letters.find(letter)


def letter_to_tensor(letter):
    """Create 1-hot encoding of letters.
    
    """
    tensor = torch.zeros(1, n_letters)
    tensor[0][letter_to_index(letter)] = 1
    return tensor


def line_to_tensor(line):
    """Convert line of text into (line, 1, n_letters) tensor.
    
    Extra dimension in middle is so a lines can be passed into
    network 1-by-1 as a 2D vector of size (1, n_letters). 
    
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for i, letter in enumerate(line):
        tensor[i][0][letter_to_index(letter)] = 1
    return tensor


def cat_from_output(output):
    """Get category by finding argmax of output.
    
    """
    _, max_i = output.max(1)
    category_i = max_i.item()
    return all_categories[category_i], category_i

In [3]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        
        self.h_t1_fc = nn.Linear(input_size + hidden_size, hidden_size)
        self.y_t1_fc = nn.Linear(hidden_size, output_size)
        
        self.tanh = nn.Tanh()
        self.log_softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, h_t0):
        xh_t0 = torch.cat((x, h_t0), 1)
        
        h_t1 = self.h_t1_fc(xh_t0)
        h_t1 = self.tanh(h_t1)
        
        y = self.y_t1_fc(h_t1)
        y = self.log_softmax(y)
        
        return y, h_t1
    
    def zero_hidden(self):
        return torch.zeros(1, self.hidden_size)

In [4]:
name_files = glob.glob("../data/rnn_char_class_data/names/*.txt")
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [5]:
print(unicode_to_ascii('Ślusàrski'))

Slusarski


In [6]:
# category_lines dict. -- a list of names per language
category_lines = {}
all_categories = []

for filename in name_files:
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

In [7]:
print(
    np.mean([len(v) for k, v in category_lines.items()])
)

print(
    np.sum([len(v) for k, v in category_lines.items()])
)

print(category_lines["Italian"][:5])

1115.2222222222222
20074
['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


In [8]:
print(letter_to_tensor("J"))
print(line_to_tensor("Jones").size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])


In [9]:
n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

In [10]:
# Single-character tensor
with torch.no_grad():
    x = letter_to_tensor("A")
    h_0 = rnn.zero_hidden()
    y, h_1 = rnn(x, h_0)

In [11]:
print(y)

tensor([[-2.8451, -2.8543, -2.8373, -3.0047, -2.8773, -2.9277, -2.9962, -2.8439,
         -2.9241, -2.8222, -2.8192, -2.8884, -2.9292, -2.9389, -2.9840, -2.8788,
         -2.8788, -2.8085]])


In [12]:
# Full line tensor
with torch.no_grad():
    x = line_to_tensor("Albert")
    h_0 = rnn.zero_hidden()
    y, h1 = rnn(x[0], h_0)

In [13]:
print(y)

tensor([[-2.8451, -2.8543, -2.8373, -3.0047, -2.8773, -2.9277, -2.9962, -2.8439,
         -2.9241, -2.8222, -2.8192, -2.8884, -2.9292, -2.9389, -2.9840, -2.8788,
         -2.8788, -2.8085]])


In [14]:
print(cat_from_output(y))

('Italian', 17)


In [28]:
def random_training_ex():
    category = np.random.choice(all_categories)
    line = np.random.choice(category_lines[category])
    category_tensor = torch.tensor(
        [all_categories.index(category)],
        dtype=torch.long
    )
    line_tensor = line_to_tensor(line)
    return category, line, category_tensor, line_tensor

In [29]:
random_training_ex()

('Korean',
 'Ma',
 tensor([0]),
 tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0.]],
 
         [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0.]]]))

In [49]:
for i in range(10):
    category, line, category_tensor, line_tensor = random_training_ex()
    print(f"Category: {category} / Line: {line}")

Category: Vietnamese / Line: Vo
Category: Korean / Line: Gu
Category: Korean / Line: Chin
Category: Korean / Line: Yun
Category: Arabic / Line: Isa
Category: Spanish / Line: Gutierrez
Category: Czech / Line: Kenzel
Category: Korean / Line: San
Category: Scottish / Line: Wright
Category: English / Line: Dunne
