In [1]:
# data: https://download.pytorch.org/tutorial/data.zip
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch
from pdb import set_trace
import io
import os
import unicodedata
import string
import glob
import matplotlib.pyplot as plt
import random

In [2]:
# alphabet small + capital letters + " .,;'"
ALL_LETTERS = string.ascii_letters + " .,;'"
N_LETTERS = len(ALL_LETTERS)

In [3]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in ALL_LETTERS
    )

In [4]:
def load_data():
    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []
    
    def find_files(path):
        return glob.glob(path)
    
    # Read a file and split into lines
    def read_lines(filename):
        lines = io.open(filename, encoding='utf-8').read().strip().split('\n')
        return [unicode_to_ascii(line) for line in lines]
    
    for filename in find_files('names/*.txt'):
        from pdb import set_trace
        category = os.path.splitext(os.path.basename(filename))[0]
        all_categories.append(category)
        
        lines = read_lines(filename)
        category_lines[category] = lines
        
    return category_lines, all_categories

In [5]:
"""
To represent a single letter, we use a “one-hot vector” of 
size <1 x n_letters>. A one-hot vector is filled with 0s
except for a 1 at index of the current letter, e.g. "b" = <0 1 0 0 0 ...>.

To make a word we join a bunch of those into a
2D matrix <line_length x 1 x n_letters>.

That extra 1 dimension is because PyTorch assumes
everything is in batches - we’re just using a batch size of 1 here.
"""

'\nTo represent a single letter, we use a “one-hot vector” of \nsize <1 x n_letters>. A one-hot vector is filled with 0s\nexcept for a 1 at index of the current letter, e.g. "b" = <0 1 0 0 0 ...>.\n\nTo make a word we join a bunch of those into a\n2D matrix <line_length x 1 x n_letters>.\n\nThat extra 1 dimension is because PyTorch assumes\neverything is in batches - we’re just using a batch size of 1 here.\n'

In [6]:
# Find letter index from all_letters, e.g. "a" = 0
def letter_to_index(letter):
    return ALL_LETTERS.find(letter)

In [7]:
# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letter_to_tensor(letter):
    tensor = torch.zeros(1, N_LETTERS)
    tensor[0][letter_to_index(letter)] = 1
    return tensor

In [8]:
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, N_LETTERS)
    for i, letter in enumerate(line):
        tensor[i][0][letter_to_index(letter)] = 1
    return tensor

In [9]:
def random_training_example(category_lines, all_categories):
    
    def random_choice(a):
        random_idx = random.randint(0, len(a) - 1)
        return a[random_idx]
    
    category = random_choice(all_categories)
    line = random_choice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = line_to_tensor(line)
    return category, line, category_tensor, line_tensor

In [10]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        
        self.hidden_size = hidden_size
        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        # Fully connected layer for output
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input_tensor, hidden_state):
        # LSTM expects input in the shape (batch, seq_len, input_size)
        # Reshape input_tensor to include batch and seq_len if needed
        input_tensor = input_tensor.unsqueeze(1)  # Adding seq_len=1 dimension
        output, (hidden, cell) = self.lstm(input_tensor, hidden_state)
        
        # Pass the LSTM's hidden state through the output layer
        output = self.fc(output.squeeze(1))  # Remove the seq_len dimension
        return output, (hidden, cell)
    
    def init_hidden(self):
        # Initialize both hidden and cell states as zeros
        return (torch.zeros(1, 1, self.hidden_size),  # Hidden state
                torch.zeros(1, 1, self.hidden_size))  # Cell state


In [11]:
category_lines, all_categories = load_data()
n_categories = len(all_categories)

In [12]:
def category_from_output(output):
    category_idx = torch.argmax(output).item()
    return all_categories[category_idx]

In [13]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
mdl  = LSTM(57, 100, 1)
optimizer = torch.optim.Adam(mdl.parameters(), lr=learning_rate)

In [14]:
def train(line_tensor, category_tensor):
    hidden = mdl.init_hidden()
    
    for i in range(line_tensor.size()[0]):
        output, hidden = mdl(line_tensor[i], hidden)
        
    loss = criterion(output, category_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return output, loss.item()

## Make it better

In [17]:
def random_training_example(category_lines, all_categories):
    
    def random_choice(a):
        random_idx = random.randint(0, len(a) - 1)
        return a[random_idx]

In [18]:
def read_lines_from_files(directory):
    results = []  # To store the tuples
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):  # Process only .txt files
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    line_content = line.strip()  # Remove leading/trailing whitespace
                    results.append((line_content, filename.replace('.txt', '')))
    return results

In [19]:
directory_path = 'nlp/names'
lines_with_files = read_lines_from_files(directory_path)

In [20]:
class NamesDataset:

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        name, language = self.data[idx]
        return name, language

In [21]:
for name, lang in lines_with_files:
    if lang == 'Russian':
        print(lang)
        break

In [22]:
ds = NamesDataset(lines_with_files)

In [43]:
dl = DataLoader(ds, batch_size=64, shuffle=True)

In [44]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input_tensor, hidden_state):
        # LSTM expects input in the shape (batch, seq_len, input_size)
        # Reshape input_tensor to include batch and seq_len
        input_tensor = input_tensor.unsqueeze(1)
        output, (hidden, cell) = self.lstm(input_tensor, hidden_state)
        
        # Pass the LSTM's hidden state through the output layer
        output = self.fc(output.squeeze(1))  # Remove the seq_len dimension
        return output, (hidden, cell)
    
    def init_hidden(self):
        # Initialize both hidden and cell states as zeros
        return (torch.zeros(1, 1, self.hidden_size),  # Hidden state
                torch.zeros(1, 1, self.hidden_size))  # Cell state

In [61]:
num_of_letters = len(ALL_LETTERS)

In [62]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
mdl = LSTM(num_of_letters, 100, 18)
optimizer = torch.optim.Adam(mdl.parameters(), lr=learning_rate)

In [74]:
def train(line_tensor, category_tensor, hidden):
    print(line_tensor[0])
    raise Exception('a')
    for i in range(line_tensor.size()[0]):
        output, hidden = mdl(line_tensor[i], hidden)
    loss = criterion(output, category_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return output, loss.item()

In [79]:
%%time
j = 0
accurate = 0
wrong = 0
current_loss = 0
all_losses = []
plot_steps = 500
num_epochs = 10
for i in range(num_epochs):
    hidden = mdl.init_hidden()
    for name, language in dl:
        language = language[0]
        name = name[0]
        print(name)
        category_tensor = torch.tensor([all_categories.index(language)], dtype=torch.long)
        line_tensor = line_to_tensor(name)
        output, loss = train(line_tensor, category_tensor, hidden)
        current_loss += loss 

        guess = category_from_output(output)
        correct = "CORRECT" if guess == language else f"WRONG ({language})"
        
        if guess == language:
            accurate +=1
        else:
            wrong +=1    
        
    if (i+1) % plot_steps == 0:
        all_losses.append(current_loss / plot_steps)
        current_loss = 0
        j+=1
        if (j+1) % plot_steps == 100:
            print(correct)
            print('name, guess and label', name, guess, language)
            print('loss', loss)
            # print(f"{i+1} {loss:.4f} {line} / {guess} {correct}")
        # raise Exception()

Inao
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])


Exception: a

In [73]:
accurate, wrong

(611, 489)

In [70]:
accurate / (accurate + wrong)

0.47363636363636363

In [154]:
def predict(input_line):
    with torch.no_grad():
        line_tensor = line_to_tensor(input_line)
            
        for i in range(line_tensor.size()[0]):
            output, _ = mdl(line_tensor[i], hidden)
            guess = category_from_output(output)
            guess
        
        return guess

In [227]:
def predict(input_line):
    for i in range(input_line.size()[0]):
        output, hidden = mdl(line_tensor[i], hidden)
        return output


In [228]:
arabic_names = [
    "Gaber",
    "Haddad",
    "Rahal",
    "Koury",
    "Harb",
    "Mikhail",
    "Dagher",
    "Shadid",
    "Boutros",
    "Mikhail",
    "Khouri",
    "Nader",
    "Issa",
    "Harb",
    "Dagher",
    "Gerges",
    "Morcos",
    "Essa",
    "Fakhoury",
    "Tuma",
    "Kattan",
    "Totah",
    "Qureshi",
    "Nahas",
    "Bitar",
    "Tahan",
    "Daher",
    "Shammas",
    "Kouri",
    "Ganim",
    "Daher",
    "Awad",
    "Malouf"
]

In [247]:
# arabic_names = [
#     "Khoury", "Nahas", "Daher", "Gerges", "Nazari", "Maalouf", "Gerges", "Naifeh", "Guirguis", 
#     "Baba", "Sabbagh", "Attia", "Tahan", "Haddad", "Aswad", "Najjar", "Dagher", "Maloof", "Isa", 
#     "Asghar", "Nader", "Gaber", "Abboud", "Maalouf", "Zogby", "Srour", "Bahar", "Mustafa", 
#     "Hanania", "Daher", "Tuma", "Nahas", "Saliba", "Shamoon", "Handal", "Baba", "Amari", "Bahar", 
#     "Atiyeh", "Said", "Khouri", "Tahan", "Baba", "Mustafa", "Guirguis", "Sleiman", "Seif", "Dagher", 
#     "Bahar", "Gaber", "Harb", "Seif", "Asker", "Nader", "Antar", "Awad", "Srour", "Shadid", "Hajjar", 
#     "Hanania", "Kalb", "Shadid", "Bazzi", "Mustafa", "Masih", "Ghanem", "Haddad", "Isa", "Antoun", 
#     "Sarraf", "Sleiman", "Dagher", "Najjar", "Malouf", "Nahas", "Naser", "Saliba", "Shamon", "Malouf", 
#     "Kalb", "Daher", "Maalouf", "Wasem", "Kanaan", "Naifeh", "Boutros", "Moghadam", "Masih", "Sleiman", 
#     "Aswad", "Cham", "Assaf", "Quraishi", "Shalhoub", "Sabbag", "Mifsud", "Gaber", "Shammas", "Tannous", 
#     "Sleiman", "Bazzi", "Quraishi", "Rahal", "Cham", "Ghanem", "Ghanem", "Naser", "Baba", "Shamon", 
#     "Almasi", "Basara", "Quraishi", "Bata", "Wasem", "Shamoun", "Deeb", "Touma", "Asfour", "Deeb", 
#     "Hadad", "Naifeh", "Touma", "Bazzi", "Shamoun", "Nahas", "Haddad", "Arian", "Kouri", "Deeb", 
#     "Toma", "Halabi", "Nazari", "Saliba", "Fakhoury", "Hadad", "Baba", "Mansour", "Sayegh", "Antar", 
#     "Deeb", "Morcos", "Shalhoub", "Sarraf", "Amari", "Wasem", "Ganim", "Tuma", "Fakhoury", "Hadad", 
#     "Hakimi", "Nader", "Said", "Ganim", "Daher", "Ganem", "Tuma", "Boutros", "Aswad", "Sarkis", "Daher", 
#     "Toma", "Boutros", "Kanaan", "Antar", "Gerges", "Kouri", "Maroun", "Wasem", "Dagher", "Naifeh", 
#     "Bishara", "Ba", "Cham", "Kalb", "Bazzi", "Bitar", "Hadad", "Moghadam", "Sleiman", "Shamoun", 
#     "Antar", "Atiyeh", "Koury", "Nahas", "Kouri", "Maroun", "Nassar", "Sayegh", "Haik", "Ghanem", 
#     "Sayegh", "Salib", "Cham", "Bata", "Touma", "Antoun", "Antar", "Bata", "Botros", "Shammas", "Ganim", 
#     "Sleiman", "Seif", "Moghadam", "Ba", "Tannous", "Bazzi", "Seif", "Salib", "Hadad", "Quraishi", 
#     "Halabi", "Essa", "Bahar", "Kattan", "Boutros", "Nahas", "Sabbagh", "Kanaan", "Sayegh", "Said", 
#     "Botros", "Najjar", "Toma", "Bata", "Atiyeh", "Halabi", "Tannous", "Kouri", "Shamoon", "Kassis", 
#     "Haddad", "Tuma", "Mansour", "Antar", "Kassis", "Kalb", "Basara", "Rahal", "Mansour", "Handal", 
#     "Morcos", "Fakhoury", "Hadad", "Morcos", "Kouri", "Quraishi", "Almasi", "Awad", "Naifeh", "Koury", 
#     "Asker", "Maroun", "Fakhoury", "Sabbag", "Sarraf", "Shamon", "Assaf", "Boutros", "Malouf", "Nassar", 
#     "Qureshi", "Ghanem", "Srour", "Almasi", "Qureshi", "Ghannam", "Mustafa", "Najjar", "Kassab", "Shadid", 
#     "Shamoon", "Morcos", "Atiyeh", "Isa", "Ba", "Baz", "Asker", "Seif", "Asghar", "Hajjar", "Deeb", 
#     "Essa", "Qureshi", "Abboud", "Ganem", "Haddad", "Koury", "Nassar", "Abadi", "Toma", "Tannous", 
#     "Harb", "Issa", "Khouri", "Mifsud", "Kalb", "Gaber", "Ganim", "Boulos", "Samaha", "Haddad", 
#     "Sabbag", "Wasem", "Dagher", "Rahal", "Atiyeh", "Antar", "Asghar", "Mansour", "Awad", "Boulos", 
#     "Sarraf", "Deeb", "Abadi", "Nazari", "Daher", "Gerges", "Shamoon", "Gaber", "Amari", "Sarraf", 
#     "Nazari", "Saliba", "Naifeh", "Nazari", "Hakimi", "Shamon", "Abboud", "Quraishi", "Tahan", "Safar", 
#     "Hajjar", "Srour", "Gaber", "Shalhoub", "Attia", "Safar", "Said", "Ganem", "Nader", "Asghar", 
#     "Mustafa", "Said", "Antar", "Botros", "Nader", "Ghannam", "Asfour", "Tahan", "Mansour", "Attia", 
#     "Touma", "Najjar", "Kassis", "Abboud", "Bishara", "Bazzi", "Shalhoub", "Shalhoub", "Safar", "Khoury", 
#     "Nazari", "Sabbag", "Sleiman", "Atiyeh", "Kouri", "Bitar", "Zogby", "Ghanem", "Assaf", "Abadi", 
#     "Arian", "Shalhoub", "Khoury", "Morcos", "Shamon", "Wasem", "Abadi", "Antoun", "Baz", "Naser", 
#     "Assaf", "Saliba", "Nader", "Mikhail", "Naser", "Daher", "Morcos", "Awad", "Nahas", "Sarkis", 
#     "Malouf", "Mustafa", "Fakhoury", "Ghannam", "Shadid", "Gaber", "Koury", "Atiyeh", "Shamon", "Boutros", 
#     "Sarraf", "Arian", "Fakhoury", "Abadi", "Kassab", "Nahas", "Quraishi", "Mansour", "Samaha", "Wasem", 
#     "Seif", "Fakhoury", "Saliba", "Cham", "Bahar", "Shamoun", "Essa", "Shamon", "Asfour", "Bitar", 
#     "Cham", "Tahan", "Tannous", "Daher", "Khoury", "Shamon", "Bahar", "Quraishi", "Ghannam", "Kassab", 
#     "Zogby", "Basara", "Shammas", "Arian", "Sayegh", "Naifeh", "Mifsud", "Sleiman", "Arian", "Kassis", 
#     "Shamoun", "Kassis", "Harb", "Mustafa", "Boulos", "Asghar", "Shamon", "Kanaan", "Atiyeh", "Kassab", 
#     "Tahan", "Bazzi", "Kassis", "Qureshi", "Basara", "Shalhoub", "Sayegh", "Haik", "Attia", "Maroun", 
#     "Kassis", "Sarkis", "Harb", "Assaf", "Kattan", "Antar", "Sleiman", "Touma", "Sarraf", "Bazzi", 
#     "Boulos", "Baz", "Issa", "Shamon", "Shadid", "Deeb", "Sabbag", "Wasem", "Awad", "Mansour", "Saliba", 
#     "Fakhoury", "Arian", "Bishara", "Dagher", "Bishara", "Koury", "Fakhoury", "Naser", "Nader", "Antar", 
#     "Gerges", "Handal", "Hanania", "Shadid", "Gerges", "Kassis", "Essa", "Assaf", "Shadid", "Seif", 
#     "Shalhoub", "Shamoun", "Hajjar", "Baba", "Sayegh", "Mustafa", "Sabbagh", "Isa", "Najjar", "Tannous", 
#     "Hanania", "Ganem", "Gerges", "Fakhoury", "Mifsud", "Nahas", "Bishara", "Bishara", "Abadi", "Sarkis", 
#     "Masih", "Isa", "Attia", "Kalb", "Essa", "Boulos", "Basara", "Halabi", "Halabi", "Dagher", "Attia", 
#     "Kassis", "Tuma", "Gerges", "Ghannam", "Toma", "Baz", "Asghar", "Zogby", "Aswad", "Hadad", "Dagher", 
#     "Naser", "Shadid", "Atiyeh", "Zogby", "Abboud", "Tannous", "Khouri", "Atiyeh", "Ganem", "Maalouf", 
#     "Isa", "Maroun", "Issa", "Khouri", "Harb", "Nader", "Awad", "Nahas", "Said", "Baba", "Totah", "Ganim", 
#     "Handal", "Mansour", "Basara", "Malouf", "Said", "Botros", "Samaha", "Safar", "Tahan", "Botros", 
#     "Shamoun", "Handal", "Sarraf", "Malouf", "Bishara", "Aswad", "Khouri", "Baz", "Asker", "Toma", 
#     "Koury", "Gerges", "Bishara", "Boulos", "Najjar", "Aswad", "Shamon", "Kouri", "Srour", "Assaf", 
#     "Tannous", "Attia", "Mustafa", "Kattan", "Asghar", "Amari", "Shadid", "Said", "Bazzi", "Masih", 
#     "Antar", "Fakhoury", "Shadid", "Masih", "Handal", "Sarraf", "Kassis", "Salib", "Hajjar", "Totah", 
#     "Koury", "Totah", "Mustafa", "Sabbagh", "Moghadam", "Toma", "Srour", "Almasi", "Totah", "Maroun", 
#     "Kattan", "Naifeh", "Sarkis", "Mikhail", "Nazari", "Boutros", "Guirguis", "Gaber", "Kassis", "Masih", 
#     "Hanania", "Maloof", "Quraishi", "Cham", "Hadad", "Tahan", "Bitar", "Arian", "Gaber", "Baz", 
#     "Mansour", "Kalb", "Sarkis", "Attia", "Antar", "Asfour", "Said", "Essa", "Koury", "Hadad", "Tuma", 
#     "Moghadam", "Sabbagh", "Amari", "Dagher", "Srour", "Antoun", "Sleiman", "Maroun", "Tuma", "Nahas", 
#     "Hanania", "Sayegh", "Amari", "Sabbagh", "Said", "Cham", "Asker", "Nassar", "Bitar", "Said", "Dagher", 
#     "Safar", "Khouri", "Totah", "Khoury", "Salib", "Basara", "Abboud", "Baz", "Isa", "Cham", "Amari", 
#     "Mifsud", "Hadad", "Rahal", "Khoury", "Bazzi", "Basara", "Totah", "Ghannam", "Koury", "Malouf", 
#     "Zogby", "Zogby", "Boutros", "Nassar", "Handal", "Hajjar", "Maloof", "Abadi", "Maroun", "Mifsud", 
#     "Kalb", "Amari", "Hakimi", "Boutros", "Masih", "Kattan", "Haddad", "Arian", "Nazari", "Assaf", 
#     "Attia", "Wasem", "Gerges", "Asker", "Tahan", "Fakhoury", "Shadid", "Sarraf", "Attia", "Naifeh", 
#     "Aswad", "Deeb", "Tannous", "Totah", "Cham", "Baba", "Najjar", "Hajjar", "Shamoon", "Handal", 
#     "Awad", "Guirguis", "Awad", "Ganem", "Naifeh", "Khoury", "Hajjar", "Moghadam", "Mikhail", "Ghannam", 
#     "Guirguis", "Tannous", "Kanaan", "Handal", "Khoury", "Kalb", "Qureshi", "Najjar", "Atiyeh", "Gerges", 
#     "Nassar", "Tahan", "Hadad", "Fakhoury", "Salib", "Wasem", "Bitar", "Fakhoury", "Attia", "Awad", 
#     "Totah", "Deeb", "Touma", "Botros", "Nazari", "Nahas", "Kouri", "Ghannam", "Assaf", "Asfour", 
#     "Sarraf", "Naifeh", "Toma", "Asghar", "Abboud", "Issa", "Sabbag", "Sabbagh", "Isa", "Koury", 
#     "Kattan", "Shamoon", "Rahal", "Kalb", "Naser", "Masih", "Sayegh", "Dagher", "Asker", "Maroun", 
#     "Dagher", "Sleiman", "Botros", "Sleiman", "Harb", "Tahan", "Tuma", "Said", "Hadad", "Samaha", 
#     "Harb", "Cham", "Atiyeh", "Haik", "Malouf", "Bazzi", "Harb", "Malouf", "Ghanem", "Cham", "Asghar", 
#     "Samaha", "Khouri", "Nassar", "Rahal", "Baz", "Kalb", "Rahal", "Gerges", "Cham", "Sayegh", 
#     "Shadid", "Morcos", "Shamoon", "Hakimi", "Shamoon", "Qureshi", "Ganim", "Shadid", "Khoury", 
#     "Boutros", "Hanania", "Antoun", "Naifeh", "Deeb", "Samaha", "Awad", "Asghar", "Awad", "Saliba", 
#     "Shamoun", "Mikhail", "Hakimi", "Mikhail", "Cham", "Halabi", "Sarkis", "Kattan", "Nazari", 
#     "Safar", "Morcos", "Khoury", "Essa", "Nassar", "Haik", "Shadid", "Fakhoury", "Najjar", "Arian", 
#     "Botros", "Daher"
# ]


In [248]:
# french_names = [
#     "Abel", "Abraham", "Adam", "Albert", "Allard", "Archambault", "Armistead", "Arthur", "Augustin", 
#     "Babineaux", "Baudin", "Beauchene", "Beaulieu", "Beaumont", "Bélanger", "Bellamy", "Bellerose", 
#     "Belrose", "Berger", "Béringer", "Bernard", "Bertrand", "Bisset", "Bissette", "Blaise", "Blanc", 
#     "Blanchet", "Blanchett", "Bonfils", "Bonheur", "Bonhomme", "Bonnaire", "Bonnay", "Bonner", 
#     "Bonnet", "Borde", "Bordelon", "Bouchard", "Boucher", "Brisbois", "Brodeur", "Bureau", "Caron", 
#     "Cavey", "Chaput", "Charbonneau", "Charpentier", "Charron", "Chastain", "Chevalier", "Chevrolet", 
#     "Cloutier", "Colbert", "Comtois", "Cornett", "Coté", "Coupe", "Courtemanche", "Cousineau", 
#     "Couture", "Daniau", "D'aramitz", "Daviau", "David", "Deforest", "Degarmo", "Delacroix", 
#     "De la fontaine", "Deniau", "Deniaud", "Deniel", "Denis", "De sauveterre", "Deschamps", 
#     "Descoteaux", "Desjardins", "Desrochers", "Desrosiers", "Dubois", "Duchamps", "Dufort", "Dufour", 
#     "Duguay", "Dupond", "Dupont", "Durand", "Durant", "Duval", "Émile", "Eustis", "Fabian", "Fabre", 
#     "Fabron", "Faucher", "Faucheux", "Faure", "Favager", "Favre", "Favreau", "Fay", "Félix", "Firmin", 
#     "Fontaine", "Forest", "Forestier", "Fortier", "Foss", "Fournier", "Gage", "Gagne", "Gagnier", 
#     "Gagnon", "Garcon", "Gardinier", "Germain", "Géroux", "Giles", "Girard", "Giroux", "Glaisyer", 
#     "Gosse", "Gosselin", "Granger", "Guérin", "Guillory", "Hardy", "Harman", "Hébert", "Herbert", 
#     "Herriot", "Jacques", "Janvier", "Jordan", "Joubert", "Labelle", "Lachance", "Lachapelle", 
#     "Lamar", "Lambert", "Lane", "Langlais", "Langlois", "Lapointe", "Larue", "Laurent", "Lavigne", 
#     "Lavoie", "Leandres", "Lebeau", "Leblanc", "Leclair", "Leclerc", "Lécuyer", "Lefebvre", 
#     "Lefévre", "Lefurgey", "Legrand", "Lemaire", "Lémieux", "Leon", "Leroy", "Lesauvage", 
#     "Lestrange", "Lévêque", "Lévesque", "Linville", "Lyon", "Lyon", "Maçon", "Marchand", "Marie", 
#     "Marion", "Martel", "Martel", "Martin", "Masson", "Masson", "Mathieu", "Mercier", "Merle", 
#     "Michaud", "Michel", "Monet", "Monette", "Montagne", "Moreau", "Moulin", "Mullins", "Noel", 
#     "Oliver", "Olivier", "Page", "Paget", "Palomer", "Pan", "Pape", "Paquet", "Paquet", "Parent", 
#     "Paris", "Parris", "Pascal", "Patenaude", "Paternoster", "Paul", "Pelletier", "Perrault", 
#     "Perreault", "Perrot", "Petit", "Pettigrew", "Pierre", "Plamondon", "Plourde", "Poingdestre", 
#     "Poirier", "Porcher", "Poulin", "Proulx", "Renaud", "Rey", "Reyer", "Richard", "Richelieu", 
#     "Robert", "Roche", "Rome", "Romilly", "Rose", "Rousseau", "Roux", "Roy", "Royer", "Salomon", 
#     "Salvage", "Samson", "Samuel", "Sargent", "Sarkozi", "Sarkozy", "Sartre", "Sault", "Sauvage", 
#     "Sauvageau", "Sauvageon", "Sauvageot", "Sauveterre", "Savatier", "Segal", "Sergeant", 
#     "Séverin", "Simon", "Solomon", "Soucy", "St martin", "St pierre", "Tailler", "Tasse", 
#     "Thayer", "Thibault", "Thomas", "Tobias", "Tolbert", "Traver", "Travere", "Travers", "Traverse", 
#     "Travert", "Tremblay", "Tremble", "Victor", "Victors", "Villeneuve", "Vincent", "Vipond", 
#     "Voclain", "Yount"
# ]


In [244]:
def predict(name):
    for i in range(name.size()[0]):
        output, _ = mdl(name[i], hidden)
    return output

In [245]:
true = 0
wrong = 0
for name in french_names:
    res = predict(name)
    if res == 'French':
        true +=1
    else:
        wrong +=1
print(true, wrong)

AttributeError: 'str' object has no attribute 'size'

In [246]:
print(true/(true+wrong))

ZeroDivisionError: division by zero

In [240]:
predict('Wasem')

tensor([[ 0.0383, -0.1426, -0.2375,  0.2879,  1.2915,  0.4320, -1.3325, -0.4204,
         -0.3168, -1.2845, -0.1247, -0.3004, -0.1146, -0.4133, -0.3022, -0.1207,
          0.9196,  0.0189]], grad_fn=<AddmmBackward0>)

In [241]:
predict('Samuel')

tensor([[ 0.0383, -0.1426, -0.2375,  0.2879,  1.2915,  0.4320, -1.3325, -0.4204,
         -0.3168, -1.2845, -0.1247, -0.3004, -0.1146, -0.4133, -0.3022, -0.1207,
          0.9196,  0.0189]], grad_fn=<AddmmBackward0>)

In [None]:
# while True:
#     sentence = input("Input:")
#     if sentence == "quit":
#         break
    
#     predict(sentence)

Antonopoulos
Antonopoulos
Arvanitoyannis
Avgerinos
Banos
Batsakis
Bekyros
Belesis
Bertsimas
Bilias
Blades
Bouloukos
Brisimitzaki