In [1]:
import glob
import os
import unicodedata
import string
import torch
import torch.nn as nn
import random

In [2]:
all_letters = string.ascii_letters + " .,;'-"
n_letters = len(all_letters) + 1 

def unicode_to_ascii(name):
    return ''.join(
        c for c in unicodedata.normalize('NFD', name)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [3]:
lang_names = dict()

for filename in glob.glob('data/names/*.txt'):
    lang = os.path.basename(filename).strip('.txt')
    names = open(filename).read().strip().split('\n')
    names = [unicode_to_ascii(name) for name in names]
    lang_names[lang] = names
    
all_langs = list(lang_names.keys())
n_langs = len(all_langs)

In [53]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(n_langs + input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(n_langs + input_size + hidden_size, output_size)
        
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        
        self.dropout = nn.Dropout(0,1)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, lang, input, hidden):
        input_combined = torch.cat((lang, input, hidden), 1)
        
        hidden = self.i2h(input_combined)
        
        output = self.i2o(input_combined)
        output_combined = torch.cat((hidden, output),1)
        
        output = self.o2o(output_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

In [54]:
# One-hot tensors

def make_lang_tensor(lang):
    i = all_langs.index(lang)
    tensor = torch.zeros(1,n_langs)
    tensor[0][i] = 1
    return tensor
    
def make_input_tensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for i in range(len(name)):
        letter = name[i]
        tensor[i][0][all_letters.find(letter)] = 1
    return tensor

def make_target_tensor(name):
    letter_indexes = [all_letters.find(name[i]) for i in range(1, len(name))]
    letter_indexes.append(n_letters - 1)
    return torch.LongTensor(letter_indexes)


In [55]:
def random_training():
    random_lang = random.choice(all_langs)
    random_name = random.choice(lang_names[random_lang])
    
    lang_tensor = make_lang_tensor(random_lang)
    input_tensor = make_input_tensor(random_name)
    target_tensor = make_target_tensor(random_name)
    
    return lang_tensor, input_tensor, target_tensor

In [87]:
criterion = nn.NLLLoss()

learning_rate = 0.0005

def train(lang_tensor, input_tensor, target_tensor):
    
    target_tensor.unsqueeze_(-1)
    hidden = rnn.init_hidden()
    
    rnn.zero_grad()
    
    loss = 0
    
    for i in range(input_tensor.size(0)):
        output, hidden = rnn(lang_tensor, input_tensor[i], hidden)
        l = criterion(output, target_tensor[i])
        loss += l

    loss.backward()
    
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)
    
    return output, loss.item() / input_tensor.size(0)
    

In [88]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [89]:
rnn = RNN(n_letters, 128, n_letters)

n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0 # Reset every plot_every iters

start = time.time()

for iter in range(1, n_iters + 1):
    output, loss = train(*random_training())
    total_loss += loss

    if iter % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

    if iter % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0

0m 9s (5000 5%) 2.5079
0m 19s (10000 10%) 2.4134
0m 28s (15000 15%) 2.3434
0m 37s (20000 20%) 1.8466
0m 46s (25000 25%) 2.1607
0m 55s (30000 30%) 2.1970
1m 5s (35000 35%) 2.7081
1m 16s (40000 40%) 2.4509
1m 26s (45000 45%) 2.8063
1m 35s (50000 50%) 3.3423
1m 45s (55000 55%) 2.0338
1m 54s (60000 60%) 2.7576
2m 4s (65000 65%) 3.1084
2m 13s (70000 70%) 2.4004
2m 23s (75000 75%) 2.4036
2m 34s (80000 80%) 2.2284
2m 44s (85000 85%) 1.2419
2m 53s (90000 90%) 2.0227
3m 3s (95000 95%) 2.2170
3m 13s (100000 100%) 2.2876


In [90]:
max_length = 20

In [110]:
def sample(lang, start_letter='A'):
    with torch.no_grad():
        lang_tensor = make_lang_tensor(lang)
        input_tensor = make_input_tensor(start_letter)
        hidden = rnn.init_hidden()
        
        output_name = start_letter
        
        for i in range(max_length):
            output, hidden = rnn(lang_tensor, input_tensor[0], hidden)
            topv, topi = output.topk(1)
            topi = topi[0][0]
            
            if topi == n_letters - 1:
                break
            
            else:
                letter = all_letters[topi]
                output_name += letter
            
            input_tensor = make_input_tensor(letter)
        
        return output_name
            

In [113]:
sample('Irish', 'G')

'Gangan'

In [95]:
print(all_langs)

['Portuguese', 'English', 'French', 'Korean', 'Vietnamese', 'Russian', 'Arabic', 'Dutch', 'Japanese', 'Scottish', 'Chinese', 'Greek', 'Spanish', 'Italian', 'German', 'Irish', 'Czech', 'Polish']
