In [1]:
# data: https://download.pytorch.org/tutorial/data.zip
import torch.nn as nn
import torch
from pdb import set_trace
import io
import os
import unicodedata
import string
import glob
import random
import mlflow

In [2]:
# alphabet small + capital letters + " .,;'"
ALL_LETTERS = string.ascii_letters + " .,;'"
N_LETTERS = len(ALL_LETTERS)

In [3]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in ALL_LETTERS
    )

In [4]:
def load_data():
    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []
    
    def find_files(path):
        return glob.glob(path)
    
    # Read a file and split into lines
    def read_lines(filename):
        lines = io.open(filename, encoding='utf-8').read().strip().split('\n')
        return [unicode_to_ascii(line) for line in lines]
    
    for filename in find_files('names/*.txt'):
        from pdb import set_trace
        category = os.path.splitext(os.path.basename(filename))[0]
        all_categories.append(category)
        
        lines = read_lines(filename)
        category_lines[category] = lines
        
    return category_lines, all_categories

In [5]:
"""
To represent a single letter, we use a “one-hot vector” of 
size <1 x n_letters>. A one-hot vector is filled with 0s
except for a 1 at index of the current letter, e.g. "b" = <0 1 0 0 0 ...>.

To make a word we join a bunch of those into a
2D matrix <line_length x 1 x n_letters>.

That extra 1 dimension is because PyTorch assumes
everything is in batches - we’re just using a batch size of 1 here.
"""

'\nTo represent a single letter, we use a “one-hot vector” of \nsize <1 x n_letters>. A one-hot vector is filled with 0s\nexcept for a 1 at index of the current letter, e.g. "b" = <0 1 0 0 0 ...>.\n\nTo make a word we join a bunch of those into a\n2D matrix <line_length x 1 x n_letters>.\n\nThat extra 1 dimension is because PyTorch assumes\neverything is in batches - we’re just using a batch size of 1 here.\n'

In [6]:
# Find letter index from all_letters, e.g. "a" = 0
def letter_to_index(letter):
    return ALL_LETTERS.find(letter)

In [7]:
# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letter_to_tensor(letter):
    tensor = torch.zeros(1, N_LETTERS)
    tensor[0][letter_to_index(letter)] = 1
    return tensor

In [8]:
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, N_LETTERS)
    for i, letter in enumerate(line):
        tensor[i][0][letter_to_index(letter)] = 1
    return tensor

In [9]:
def random_training_example(category_lines, all_categories):
    
    def random_choice(a):
        random_idx = random.randint(0, len(a) - 1)
        return a[random_idx]
    
    category = random_choice(all_categories)
    line = random_choice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = line_to_tensor(line)
    return category, line, category_tensor, line_tensor

In [10]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        
        self.hidden_size = hidden_size
        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        # Fully connected layer for output
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input_tensor, hidden_state):
        # LSTM expects input in the shape (batch, seq_len, input_size)
        # Reshape input_tensor to include batch and seq_len if needed
        input_tensor = input_tensor.unsqueeze(1)  # Adding seq_len=1 dimension
        output, (hidden, cell) = self.lstm(input_tensor, hidden_state)
        
        # Pass the LSTM's hidden state through the output layer
        output = self.fc(output.squeeze(1))  # Remove the seq_len dimension
        return output, (hidden, cell)
    
    def init_hidden(self):
        # Initialize both hidden and cell states as zeros
        return (torch.zeros(1, 1, self.hidden_size),  # Hidden state
                torch.zeros(1, 1, self.hidden_size))  # Cell state


In [11]:
category_lines, all_categories = load_data()
n_categories = len(all_categories)

In [12]:
def category_from_output(output):
    category_idx = torch.argmax(output).item()
    return all_categories[category_idx]

In [13]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
mdl  = LSTM(57, 100, 1)
optimizer = torch.optim.Adam(mdl.parameters(), lr=learning_rate)

def train(line_tensor, category_tensor):
    hidden = mdl.init_hidden()
    
    for i in range(line_tensor.size()[0]):
        output, hidden = mdl(line_tensor[i], hidden)
        
    loss = criterion(output, category_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return output, loss.item()

## Make it better

In [14]:
import os

def read_lines_from_files(directory):
    results = []  # To store the tuples
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):  # Process only .txt files
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    line_content = line.strip()  # Remove leading/trailing whitespace
                    results.append((line_content, filename.replace('.txt', '')))
    return results

directory_path = 'names'
lines_with_files = read_lines_from_files(directory_path)

In [27]:
type(lines_with_files)

list

In [15]:
class NamesDataset:

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        name, language = self.data[idx]
        return name, language

In [16]:
from torch.utils.data import DataLoader, Dataset

In [17]:
ds = NamesDataset(lines_with_files)

In [18]:
dl = DataLoader(ds, batch_size=1, shuffle=True)

In [19]:
n = NamesDataset(lines_with_files)

In [20]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input_tensor, hidden_state):
        # LSTM expects input in the shape (batch, seq_len, input_size)
        # Reshape input_tensor to include batch and seq_len if needed
        input_tensor = input_tensor.unsqueeze(1)  # Adding seq_len=1 dimension
        output, (hidden, cell) = self.lstm(input_tensor, hidden_state)
        
        # Pass the LSTM's hidden state through the output layer
        output = self.fc(output.squeeze(1))  # Remove the seq_len dimension
        return output, (hidden, cell)
    
    def init_hidden(self):
        # Initialize both hidden and cell states as zeros
        return (torch.zeros(1, 1, self.hidden_size),  # Hidden state
                torch.zeros(1, 1, self.hidden_size))  # Cell state


In [25]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
mdl  = LSTM(57, 256, 18)
optimizer = torch.optim.Adam(mdl.parameters(), lr=learning_rate)

def train(line_tensor, category_tensor, hidden):
    for i in range(line_tensor.size()[0]):
        output, hidden = mdl(line_tensor[i], hidden)
    # set_trace()
    loss = criterion(output, category_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return output, loss.item()

In [26]:
%%time

exp = mlflow.set_experiment(experiment_name='lstm')

with mlflow.start_run(experiment_id=exp.experiment_id, run_name='hidden_256'):
    j = 0
    accurate = 0
    wrong = 0
    current_loss = 0
    all_losses = []
    plot_steps, print_steps = 1000, 5000
    num_epochs = 1
    for i in range(num_epochs):
        hidden = mdl.init_hidden()
        for name, language in dl:
            language = language[0]
            name = name[0]
            category_tensor = torch.tensor([all_categories.index(language)], dtype=torch.long)
            line_tensor = line_to_tensor(name)
            output, loss = train(line_tensor, category_tensor, hidden)
            current_loss += loss 

            guess = category_from_output(output)
            correct = "CORRECT" if guess == language else f"WRONG ({language})"
            
            if guess == language:
                accurate +=1
            else:
                wrong +=1
            
            j+=1
            if (j+1) % plot_steps == 0:
                print('name, guess and label', name, guess, language)
                if guess == language:
                    print('CORRECT')
                else:
                    print('WRONG')
                # print(f"{i+1} {loss:.4f} {line} / {guess} {correct}")
    # mlflow.log_param('accuracy', accurate)
    # mlflow.log_param('wrong', wrong)
    accuracy = accurate / (accurate + wrong)
    mlflow.log_param('accuracy', accuracy)


name, guess and label Ugaki Russian Japanese
WRONG
name, guess and label Shichirobei Japanese Japanese
CORRECT
name, guess and label Sanchez English Spanish
WRONG
name, guess and label Garrett English English
CORRECT
name, guess and label Shalnov Russian Russian
CORRECT
name, guess and label Bazzi Japanese Arabic
WRONG
name, guess and label Vyucheisky Russian Russian
CORRECT
name, guess and label Gianakopulos Russian Greek
WRONG
name, guess and label Usynin Russian Russian
CORRECT
name, guess and label Chikhanchin Russian Russian
CORRECT
name, guess and label Inson English English
CORRECT
name, guess and label Damhan Russian Irish
WRONG
name, guess and label Marmazov Russian Russian
CORRECT
name, guess and label Elcock English English
CORRECT
name, guess and label Taidhg Russian Irish
WRONG
name, guess and label Bagretsoff Russian Russian
CORRECT
name, guess and label Rainbagin Russian Russian
CORRECT
name, guess and label Mawson English English
CORRECT
name, guess and label Kirkbright

In [171]:
accurate, wrong

(15938, 4136)

In [173]:
# plt.plot(all_losses)

In [175]:
output, _ = mdl(line_tensor[i], hidden)
guess = category_from_output(output)
guess

'Chinese'

In [56]:
def predict(input_line):
    with torch.no_grad():
        line_tensor = line_to_tensor(input_line)
            
        for i in range(line_tensor.size()[0]):
            output, _ = mdl(line_tensor[i], hidden)
            guess = category_from_output(output)
            guess
        
        return guess

In [143]:
# def predict(input_line):
#     with torch.no_grad():
#         line_tensor = line_to_tensor(input_line)
            
#         for i in range(line_tensor.size()[0]):
#             output, hidden = mdl(line_tensor[i], hidden)
        
#         guess = category_from_output(output)
#         return guess

In [57]:
names = [
    "Ritchie",
    "Rozinek",
    "Ruba",
    "Ruda",
    "Rumisek",
    "Ruzicka",
    "Rypka",
    "Rebka",
    "Rzehak",
    "Sabol",
    "Safko",
    "Samz",
    "Sankovsky",
    "Sappe",
    "Sappe",
    "Sarna",
    "Satorie",
    "Savchak",
    "Svotak",
    "Swatchak",
    "Svocak",
    "Svotchak",
    "Schallom",
    "Schenk",
    "Schlantz",
    "Schmeiser",
    "Schneider",
    "Schmied",
    "Schubert",
    "Schwarz",
    "Schwartz",
    "Sedmik"
]


In [58]:
arabic_names = [
    "Gaber",
    "Haddad",
    "Rahal",
    "Koury",
    "Harb",
    "Mikhail",
    "Dagher",
    "Shadid",
    "Boutros",
    "Mikhail",
    "Khouri",
    "Nader",
    "Issa",
    "Harb",
    "Dagher",
    "Gerges",
    "Morcos",
    "Essa",
    "Fakhoury",
    "Tuma",
    "Kattan",
    "Totah",
    "Qureshi",
    "Nahas",
    "Bitar",
    "Tahan",
    "Daher",
    "Shammas",
    "Kouri",
    "Ganim",
    "Daher",
    "Awad",
    "Malouf"
]


In [60]:
english_names = [
    "Yallop",
    "Yang",
    "Yapp",
    "Yard",
    "Yardley",
    "Yarker",
    "Yarlett",
    "Yarnall",
    "Yarnold",
    "Yarwood",
    "Yasmin",
    "Yates",
    "Yeadon",
    "Yeardley",
    "Yeardsley",
    "Yeates",
    "Yeatman",
    "Yeldon",
    "Yeoman",
    "Yeomans",
    "Yetman",
    "Yeung",
    "Yoman",
    "Yomkins",
    "York",
    "Yorke",
    "Yorston",
    "Youlden",
    "Young",
    "Younge",
    "Younis",
    "Youssouf",
    "Yule",
    "Yusuf",
    "Zaoui",
]

In [64]:
russain_names = [
    "Umko",
    "Tumolsky",
    "Tumov",
    "Tumunbayarov",
    "Tundykov",
    "Tuneev",
    "Tunev",
    "Tungusov",
    "Tuniev",
    "Tunik",
    "Tunkin",
    "Tunnikov",
    "Tupalo",
    "Tupihin",
    "Tupikhin",
    "Tupikin",
    "Tupikov",
    "Tupolev",
    "Tuporshin",
    "Tur",
    "Turaev",
    "Turanov",
    "Turarov",
    "Turashev",
    "Turatbekov",
    "Turbai",
    "Turbanov",
    "Turbin",
    "Turchak",
    "Turchaninov",
    "Turchenko",
    "Turchin",
    "Turetskov",
    "Turetsky",
    "Turgenev",
    "Turik",
    "Turintsev",
    "Turischev",
    "Turiyansky",
    "Turkestanov",
    "Turkevich",
    "Turkin",
    "Turko",
    "Turkov",
    "Turkul",
    "Turlak",
    "Turlapov",
    "Turlov",
    "Turmanov",
    "Turmilov",
    "Turmov",
    "Turno",
    "Turov",
    "Turoverov",
    "Turovsky",
    "Turovtsev",
    "Turpaev",
    "Turpyatko",
    "Tursky",
    "Tursunov",
    "Turta",
    "Turtsevich",
    "Turtygin",
    "Turubanov",
    "Turuhin",
    "Turukhin",
    "Turulo",
    "Turunov",
    "Turupanov",
    "Turushev",
    "Turusin",
    "Turusov",
    "Turutin",
    "Turyanov",
    "Turyansky",
    "Tuvin",
    "Tuzin",
]

In [68]:
true = 0
wrong = 0
for name in russain_names:
    res = predict(name)
    if res == "Russian":
        true +=1
    else:
        wrong +=1
acc = true / (true + wrong)
acc

0.5324675324675324

In [181]:
predict('Wasem')

'English'

In [None]:
# while True:
#     sentence = input("Input:")
#     if sentence == "quit":
#         break
    
#     predict(sentence)

Antonopoulos
Antonopoulos
Arvanitoyannis
Avgerinos
Banos
Batsakis
Bekyros
Belesis
Bertsimas
Bilias
Blades
Bouloukos
Brisimitzaki