In [1]:
import numpy as np
import pandas as pd
import glob
import unicodedata
import string
import os
import time
from io import open

import torch
import torch.nn as nn

In [9]:
def unicode_to_ascii(s):
    """Unicode string to plain ASCII.
    
    """
    decoded = "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
        and c in all_letters
    )
    return decoded


def read_lines(filename):
    """Read a file and split into lines.
    
    """
    lines = open(filename, encoding="utf-8").read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]


def random_ex(train=True):
    """Select a random training example and return category/line contents.
    
    """
    category = np.random.choice(all_categories)
    # 0 = train, 1 = val
    if train:
        line_ps = (1 - category_lines_tv_split[category]) / (1 - category_lines_tv_split[category]).sum()
    else:
        line_ps = category_lines_tv_split[category] / category_lines_tv_split[category].sum()
    
    line = np.random.choice(category_lines[category], p=line_ps)
    return category, line


def category_tensor(category):
    """Get 1-hot encoding for a category 
    
    """
    cat_i = all_categories.index(category)
    tensor = torch.zeros(1, n_categories)
    tensor[0][cat_i] = 1
    return tensor

In [3]:
name_files = glob.glob("../data/rnn_char_class_data/names/*.txt")
all_letters = string.ascii_letters + " .,;'-"
# +1 is for EOS marker
n_letters = len(all_letters) + 1

In [4]:
# category_lines dict. -- a list of lines per category
category_lines = {}
all_categories = []

for filename in name_files:
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)
category_lines_tv_split = {k: np.random.choice((0, 1), p=(0.8, 0.2), size=len(v)) for k, v in category_lines.items()}

In [5]:
category_lines_tv_split["Korean"]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0])

In [6]:
print('# categories:', n_categories, all_categories)
print(unicode_to_ascii("O'Néàl"))

# categories: 18 ['Korean', 'Vietnamese', 'Czech', 'French', 'Greek', 'Dutch', 'Irish', 'Japanese', 'Portuguese', 'Polish', 'Spanish', 'Scottish', 'German', 'Russian', 'English', 'Arabic', 'Chinese', 'Italian']
O'Neal


In [7]:
class RNN(nn.Module):
    def __init__(self, category_size, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        
        self.h1_fc = nn.Linear(category_size + input_size + hidden_size, hidden_size)
        self.y1_fc = nn.Linear(category_size + input_size + hidden_size, output_size)
        self.y2_fc = nn.Linear(hidden_size + output_size, output_size)
        
        self.tanh = nn.Tanh()
        self.log_softmax = nn.LogSoftmax(dim=1)
                
    def forward(self, c, x, h0):
        cxh0 = torch.cat((c, x, h0), dim=1)
        
        h1 = self.h1_fc(cxh0)
        h1 = self.tanh(h1)
        
        y = self.y1_fc(cxh0)
        y = self.tanh(y)
        
        yh1 = torch.cat((y, h1), dim=1)
        y = self.y2_fc(yh1)
        y = self.log_softmax(y)
        
        return y, h1
    
    def zero_hidden(self):
        return torch.zeros(1, self.hidden_size)

In [12]:
random_ex(train=True)

('English', 'Cliff')

In [13]:
category_tensor("Japanese")

tensor([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])