# Classifying names with RNN

Classify few thousand surnames from 18 languages and predict the language based on the spelling

Data: https://download.pytorch.org/tutorial/data.zip

In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_default_device(device)

print(f"[DEVICE] {torch.get_default_device()}")

[DEVICE] cuda:0


## Preparing the data

In [3]:
import string
import unicodedata

allowed_characters = string.ascii_letters + ".,;'" + "_"
n_letters = len(allowed_characters)

# convert to unicode to ascii to limit the RNN input parameters
# eg. using one-hot encoding you need 128 parameters with ASCII
# while thousands parameters with Unicode
# So, the RNN training is faster and simpler
def unicodeToAscii(s):
    return ''.join(
        # decompose the combined chars
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in allowed_characters
    )


In [4]:
word_test = "Ślusàrski"
print(f"[UNICODE] {word_test}")
print(f"[ASCII] {unicodeToAscii(word_test)}")


[UNICODE] Ślusàrski
[ASCII] Slusarski


# Name to tensor
Using one hot encoding, we represent chars as tensors, so tensors will have the size of the vocabulary

In [18]:
# get the letter index
def letterToIndex(letter):
    # if letter in unknown
    if letter not in allowed_characters:
        return allowed_characters.find('_')
    else:
        return allowed_characters.find(letter)

# word to tensor
def lineToTensor(line):
    # RNN, GRU and LTSM have this expected input shape
    # (seq_len, batch, input_size), where:
    # seq_len: number of steps in the network, 1 step is 1 letter
    # batch: sequences in parallel, 1 is for 1 letter at a time
    # input_size: size of vocabulary
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [19]:
print(f"[TEST] a:")
print(lineToTensor("a"))

print(f"[TEST] Ahn:")
print(lineToTensor("Ahn"))

[TEST] a:
tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]]], device='cuda:0')
[TEST] Ahn:
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

## Prepare data

In [23]:
from io import open
import glob
import os
import time

import torch
from torch.utils.data import Dataset

class NamesDataset(Dataset):
    def __init__(self, data_dir):
        # metadata
        self.data_dir = data_dir
        self.load_time = time.localtime
        # set of all languages
        labels_set = set()
        # Names and languages by strings and tensors
        self.data = []
        self.data_tensors = []
        self.labels = []
        self.labels_tensors = []
        # read data from .txt
        text_files = glob.glob(os.path.join(data_dir, "*.txt"))
        for filename in text_files:
            label = os.path.splitext(os.path.basename(filename))[0]
            labels_set.add(label)
            lines = open(filename, encoding="utf-8").read().strip().split("\n")
            for name in lines:
                self.data.append(name)
                self.data_tensors.append(lineToTensor(name))
                self.labels.append(label)
        # labels to tensor
        self.labels_uniq = list(labels_set)
        for idx in range(len(self.labels)):
            # the tensor of the label is its index in the list of unique labels
            temp_tensor = torch.tensor([self.labels_uniq.index(self.labels[idx])], dtype=torch.long)
            self.labels_tensors.append(temp_tensor)
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        data_item = self.data[idx]
        data_label = self.labels[idx]
        data_tensor = self.data_tensors[idx]
        label_tensor = self.labels_tensors[idx]

        return label_tensor, data_tensor, data_label, data_item

In [24]:
alldata = NamesDataset("./data/names")
print(f"[DATA] Loaded {len(alldata)} names")
print(f"[EXAMPLE]")
print(alldata[0])

[DATA] Loaded 20074 names
[EXAMPLE]
(tensor([17], device='cuda:0'), tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 1.,

In [25]:
# here we are using Generator to set the device as gpu
train_set, test_set = torch.utils.data.random_split(
    alldata,
    [0.85, 0.15],
    generator=torch.Generator(device=device).manual_seed(2025)
)

print(f"[TRAIN] {len(train_set)} examples")
print(f"[TEST] {len(test_set)} examples")


[TRAIN] 17063 examples
[TEST] 3011 examples


## Create the network