In [32]:
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import io
import os
import unicodedata
import string
import glob
import matplotlib.pyplot as plt
import random

In [33]:
ALL_LETTERS = string.ascii_letters + " .,;'"
N_LETTERS = len(ALL_LETTERS)

In [34]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in ALL_LETTERS
    )

In [35]:
PATH_TO_DATA = '../input/names'

In [36]:
def find_files(path):
    return glob.glob(path)

In [37]:
def load_data():
    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []
    
    # Read a file and split into lines
    def read_lines(filename):
        lines = io.open(filename, encoding='utf-8').read().strip().split('\n')
        return [unicode_to_ascii(line) for line in lines]
    
    for filename in find_files(f'{PATH_TO_DATA}/*.txt'):
        category = os.path.splitext(os.path.basename(filename))[0]
        all_categories.append(category)
        
        lines = read_lines(filename)
        category_lines[category] = lines
        
    return category_lines, all_categories

To represent a single letter, we use a “one-hot vector” of size 1 x n_letters. A one-hot vector is filled with 0s except for a 1 at index of the current letter, e.g. "b" = <0 1 0 0 0 ...>.

To make a word we join a bunch of those into a 2D matrix <line_length x 1 x n_letters>.

That extra 1 dimension is because PyTorch assumes everything is in batches - we’re just using a batch size of 1 here.


In [38]:
def letter_to_index(letter):
    return ALL_LETTERS.find(letter)

In [39]:
def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, N_LETTERS)
    for i, letter in enumerate(line):
        tensor[i][0][letter_to_index(letter)] = 1
    return tensor

In [40]:
def category_from_output(output):
    category_idx = torch.argmax(output).item()
    return all_categories[category_idx]

In [41]:
def random_training_example(category_lines, all_categories):
    
    def random_choice(a):
        random_idx = random.randint(0, len(a) - 1)
        return a[random_idx]

In [42]:
def read_lines_from_files(directory):
    results = []  # To store the tuples
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):  # Process only .txt files
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    line_content = line.strip()  # Remove leading/trailing whitespace
                    results.append((line_content, filename.replace('.txt', '')))
    return results

In [43]:
category_lines, all_categories = load_data()
n_categories = len(all_categories)

In [44]:
lines_with_files = read_lines_from_files(PATH_TO_DATA)

In [45]:
class NamesDataset:

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        name, language = self.data[idx]
        return name, language

In [46]:
ds = NamesDataset(lines_with_files)

In [47]:
dl = DataLoader(ds, batch_size=1, shuffle=True)

In [116]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = 'cpu'

In [117]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input_tensor, hidden_state):
        # LSTM expects input in the shape (batch, seq_len, input_size)
        # Reshape input_tensor to include batch and seq_len
        input_tensor = input_tensor.unsqueeze(1)
        output, (hidden, cell) = self.lstm(input_tensor, hidden_state)
        
        # Pass the LSTM's hidden state through the output layer
        output = self.fc(output.squeeze(1))  # Remove the seq_len dimension
        return output, (hidden, cell)
    
    def init_hidden(self):
        # Initialize both hidden and cell states as zeros
        return (torch.zeros(1, 1, self.hidden_size, device=device),  # Hidden state
                torch.zeros(1, 1, self.hidden_size, device=device))  # Cell state

In [118]:
def train_helper(line_tensor, category_tensor, mdl):
    hidden = mdl.init_hidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = mdl(line_tensor[i], hidden)
    loss = criterion(output, category_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return output, loss.item()

In [119]:
num_of_letters = len(ALL_LETTERS)

In [120]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
num_epochs=10

In [121]:
mdl = LSTM(num_of_letters, 100, n_categories).to(device)
optimizer = torch.optim.Adam(mdl.parameters(), lr=learning_rate)

In [122]:
def train(mdl):
    counter = 0
    losses = []
    num_correct = 0
    num_wrong = 0

    for i in range(num_epochs):
        for name, language in dl:
            language = language[0]
            name = name[0]
            category_tensor = torch.tensor([all_categories.index(language)], dtype=torch.long).to(device)
            line_tensor = line_to_tensor(name).to(device)

            output, loss = train_helper(line_tensor, category_tensor, mdl)
            # current_loss += loss 
            counter +=1
            if counter % 1000 == 0:
                guess = category_from_output(output)
                correct_or_wrong = f"CORRECT {name, language}" if guess == language else f"WRONG ({language})"
                print(correct_or_wrong)
                if guess == language:
                    num_correct +=1
                else:
                    num_wrong+=1    
        print('loss is', loss)
    return num_correct, num_wrong

In [133]:
%%time
num_correct, num_wrong = train(mdl)
num_correct / (num_correct + num_wrong)

CORRECT ('Pietri', 'Italian')
WRONG (Korean)
CORRECT ('Otomo', 'Japanese')
CORRECT ('Teunissen', 'Dutch')
loss is 0.003699840744957328
CORRECT ('Zangari', 'Italian')
CORRECT ('Novotny', 'Czech')
CORRECT ('Wirth', 'German')
CORRECT ('Amadori', 'Italian')
CORRECT ('De la cruz', 'Spanish')
loss is 0.38489726185798645
CORRECT ('Kerner', 'German')
CORRECT ('Luc', 'Vietnamese')
CORRECT ('Qing', 'Chinese')
CORRECT ('Schlusser', 'German')
WRONG (Dutch)
loss is 0.02932055853307247
CORRECT ('Jones', 'Scottish')
CORRECT ('Tomasek', 'Czech')
CORRECT ('Egami', 'Japanese')
CORRECT ('Sarti', 'Italian')
CORRECT ('Paquet', 'French')
loss is 0.07192995399236679
CORRECT ('Atshushi', 'Japanese')
CORRECT ('Gim', 'Chinese')
CORRECT ('Schultze', 'German')
CORRECT ('Schuchard', 'German')
CORRECT ('Kalbfleisch', 'German')
loss is 2.95634672511369e-05
CORRECT ('Phi', 'Vietnamese')
CORRECT ('Noel', 'French')
CORRECT ('Merta', 'Czech')
CORRECT ('Okazawaya', 'Japanese')
CORRECT ('Vlach', 'Czech')
loss is 0.0597529

0.9183673469387755

In [127]:
def predict(input_line, verbose=False):
    with torch.no_grad():
        line_tensor = line_to_tensor(input_line).to(device)
        
        hidden = mdl.init_hidden()
    
        for i in range(line_tensor.size()[0]):
            output, hidden = mdl(line_tensor[i], hidden)
        
        guess = category_from_output(output)
        if verbose is True:
            print(guess)
        return guess

In [128]:
predict('Jaskulski', True)

Polish


'Polish'

In [129]:
arabic_names = [
    "Khoury", "Nahas", "Daher", "Gerges", "Nazari", "Maalouf", "Gerges", "Naifeh", "Guirguis", 
    "Baba", "Sabbagh", "Attia", "Tahan", "Haddad", "Aswad", "Najjar", "Dagher", "Maloof", "Isa", 
    "Asghar", "Nader", "Gaber", "Abboud", "Maalouf", "Zogby", "Srour", "Bahar", "Mustafa", 
    "Hanania", "Daher", "Tuma", "Nahas", "Saliba", "Shamoon", "Handal", "Baba", "Amari", "Bahar", 
    "Atiyeh", "Said", "Khouri", "Tahan", "Baba", "Mustafa", "Guirguis", "Sleiman", "Seif", "Dagher", 
    "Bahar", "Gaber", "Harb", "Seif", "Asker", "Nader", "Antar", "Awad", "Srour", "Shadid", "Hajjar", 
    "Hanania", "Kalb", "Shadid", "Bazzi", "Mustafa", "Masih", "Ghanem", "Haddad", "Isa", "Antoun", 
    "Sarraf", "Sleiman", "Dagher", "Najjar", "Malouf", "Nahas", "Naser", "Saliba", "Shamon", "Malouf", 
    "Kalb", "Daher", "Maalouf", "Wasem", "Kanaan", "Naifeh", "Boutros", "Moghadam", "Masih", "Sleiman", 
    "Aswad", "Cham", "Assaf", "Quraishi", "Shalhoub", "Sabbag", "Mifsud", "Gaber", "Shammas", "Tannous", 
    "Sleiman", "Bazzi", "Quraishi", "Rahal", "Cham", "Ghanem", "Ghanem", "Naser", "Baba", "Shamon", 
    "Almasi", "Basara", "Quraishi", "Bata", "Wasem", "Shamoun", "Deeb", "Touma", "Asfour", "Deeb", 
    "Hadad", "Naifeh", "Touma", "Bazzi", "Shamoun", "Nahas", "Haddad", "Arian", "Kouri", "Deeb", 
    "Toma", "Halabi", "Nazari", "Saliba", "Fakhoury", "Hadad", "Baba", "Mansour", "Sayegh", "Antar", 
    "Deeb", "Morcos", "Shalhoub", "Sarraf", "Amari", "Wasem", "Ganim", "Tuma", "Fakhoury", "Hadad", 
    "Hakimi", "Nader", "Said", "Ganim", "Daher", "Ganem", "Tuma", "Boutros", "Aswad", "Sarkis", "Daher", 
    "Toma", "Boutros", "Kanaan", "Antar", "Gerges", "Kouri", "Maroun", "Wasem", "Dagher", "Naifeh", 
    "Bishara", "Ba", "Cham", "Kalb", "Bazzi", "Bitar", "Hadad", "Moghadam", "Sleiman", "Shamoun", 
    "Antar", "Atiyeh", "Koury", "Nahas", "Kouri", "Maroun", "Nassar", "Sayegh", "Haik", "Ghanem", 
    "Sayegh", "Salib", "Cham", "Bata", "Touma", "Antoun", "Antar", "Bata", "Botros", "Shammas", "Ganim", 
    "Sleiman", "Seif", "Moghadam", "Ba", "Tannous", "Bazzi", "Seif", "Salib", "Hadad", "Quraishi", 
    "Halabi", "Essa", "Bahar", "Kattan", "Boutros", "Nahas", "Sabbagh", "Kanaan", "Sayegh", "Said", 
    "Botros", "Najjar", "Toma", "Bata", "Atiyeh", "Halabi", "Tannous", "Kouri", "Shamoon", "Kassis", 
    "Haddad", "Tuma", "Mansour", "Antar", "Kassis", "Kalb", "Basara", "Rahal", "Mansour", "Handal", 
    "Morcos", "Fakhoury", "Hadad", "Morcos", "Kouri", "Quraishi", "Almasi", "Awad", "Naifeh", "Koury", 
    "Asker", "Maroun", "Fakhoury", "Sabbag", "Sarraf", "Shamon", "Assaf", "Boutros", "Malouf", "Nassar", 
    "Qureshi", "Ghanem", "Srour", "Almasi", "Qureshi", "Ghannam", "Mustafa", "Najjar", "Kassab", "Shadid", 
    "Shamoon", "Morcos", "Atiyeh", "Isa", "Ba", "Baz", "Asker", "Seif", "Asghar", "Hajjar", "Deeb", 
    "Essa", "Qureshi", "Abboud", "Ganem", "Haddad", "Koury", "Nassar", "Abadi", "Toma", "Tannous", 
    "Harb", "Issa", "Khouri", "Mifsud", "Kalb", "Gaber", "Ganim", "Boulos", "Samaha", "Haddad", 
    "Sabbag", "Wasem", "Dagher", "Rahal", "Atiyeh", "Antar", "Asghar", "Mansour", "Awad", "Boulos", 
    "Sarraf", "Deeb", "Abadi", "Nazari", "Daher", "Gerges", "Shamoon", "Gaber", "Amari", "Sarraf", 
    "Nazari", "Saliba", "Naifeh", "Nazari", "Hakimi", "Shamon", "Abboud", "Quraishi", "Tahan", "Safar", 
    "Hajjar", "Srour", "Gaber", "Shalhoub", "Attia", "Safar", "Said", "Ganem", "Nader", "Asghar", 
    "Mustafa", "Said", "Antar", "Botros", "Nader", "Ghannam", "Asfour", "Tahan", "Mansour", "Attia", 
    "Touma", "Najjar", "Kassis", "Abboud", "Bishara", "Bazzi", "Shalhoub", "Shalhoub", "Safar", "Khoury", 
    "Nazari", "Sabbag", "Sleiman", "Atiyeh", "Kouri", "Bitar", "Zogby", "Ghanem", "Assaf", "Abadi", 
    "Arian", "Shalhoub", "Khoury", "Morcos", "Shamon", "Wasem", "Abadi", "Antoun", "Baz", "Naser", 
    "Assaf", "Saliba", "Nader", "Mikhail", "Naser", "Daher", "Morcos", "Awad", "Nahas", "Sarkis", 
    "Malouf", "Mustafa", "Fakhoury", "Ghannam", "Shadid", "Gaber", "Koury", "Atiyeh", "Shamon", "Boutros", 
    "Sarraf", "Arian", "Fakhoury", "Abadi", "Kassab", "Nahas", "Quraishi", "Mansour", "Samaha", "Wasem", 
    "Seif", "Fakhoury", "Saliba", "Cham", "Bahar", "Shamoun", "Essa", "Shamon", "Asfour", "Bitar", 
    "Cham", "Tahan", "Tannous", "Daher", "Khoury", "Shamon", "Bahar", "Quraishi", "Ghannam", "Kassab", 
    "Zogby", "Basara", "Shammas", "Arian", "Sayegh", "Naifeh", "Mifsud", "Sleiman", "Arian", "Kassis", 
    "Shamoun", "Kassis", "Harb", "Mustafa", "Boulos", "Asghar", "Shamon", "Kanaan", "Atiyeh", "Kassab", 
    "Tahan", "Bazzi", "Kassis", "Qureshi", "Basara", "Shalhoub", "Sayegh", "Haik", "Attia", "Maroun", 
    "Kassis", "Sarkis", "Harb", "Assaf", "Kattan", "Antar", "Sleiman", "Touma", "Sarraf", "Bazzi", 
    "Boulos", "Baz", "Issa", "Shamon", "Shadid", "Deeb", "Sabbag", "Wasem", "Awad", "Mansour", "Saliba", 
    "Fakhoury", "Arian", "Bishara", "Dagher", "Bishara", "Koury", "Fakhoury", "Naser", "Nader", "Antar", 
    "Gerges", "Handal", "Hanania", "Shadid", "Gerges", "Kassis", "Essa", "Assaf", "Shadid", "Seif", 
    "Shalhoub", "Shamoun", "Hajjar", "Baba", "Sayegh", "Mustafa", "Sabbagh", "Isa", "Najjar", "Tannous", 
    "Hanania", "Ganem", "Gerges", "Fakhoury", "Mifsud", "Nahas", "Bishara", "Bishara", "Abadi", "Sarkis", 
    "Masih", "Isa", "Attia", "Kalb", "Essa", "Boulos", "Basara", "Halabi", "Halabi", "Dagher", "Attia", 
    "Kassis", "Tuma", "Gerges", "Ghannam", "Toma", "Baz", "Asghar", "Zogby", "Aswad", "Hadad", "Dagher", 
    "Naser", "Shadid", "Atiyeh", "Zogby", "Abboud", "Tannous", "Khouri", "Atiyeh", "Ganem", "Maalouf", 
    "Isa", "Maroun", "Issa", "Khouri", "Harb", "Nader", "Awad", "Nahas", "Said", "Baba", "Totah", "Ganim", 
    "Handal", "Mansour", "Basara", "Malouf", "Said", "Botros", "Samaha", "Safar", "Tahan", "Botros", 
    "Shamoun", "Handal", "Sarraf", "Malouf", "Bishara", "Aswad", "Khouri", "Baz", "Asker", "Toma", 
    "Koury", "Gerges", "Bishara", "Boulos", "Najjar", "Aswad", "Shamon", "Kouri", "Srour", "Assaf", 
    "Tannous", "Attia", "Mustafa", "Kattan", "Asghar", "Amari", "Shadid", "Said", "Bazzi", "Masih", 
    "Antar", "Fakhoury", "Shadid", "Masih", "Handal", "Sarraf", "Kassis", "Salib", "Hajjar", "Totah", 
    "Koury", "Totah", "Mustafa", "Sabbagh", "Moghadam", "Toma", "Srour", "Almasi", "Totah", "Maroun", 
    "Kattan", "Naifeh", "Sarkis", "Mikhail", "Nazari", "Boutros", "Guirguis", "Gaber", "Kassis", "Masih", 
    "Hanania", "Maloof", "Quraishi", "Cham", "Hadad", "Tahan", "Bitar", "Arian", "Gaber", "Baz", 
    "Mansour", "Kalb", "Sarkis", "Attia", "Antar", "Asfour", "Said", "Essa", "Koury", "Hadad", "Tuma", 
    "Moghadam", "Sabbagh", "Amari", "Dagher", "Srour", "Antoun", "Sleiman", "Maroun", "Tuma", "Nahas", 
    "Hanania", "Sayegh", "Amari", "Sabbagh", "Said", "Cham", "Asker", "Nassar", "Bitar", "Said", "Dagher", 
    "Safar", "Khouri", "Totah", "Khoury", "Salib", "Basara", "Abboud", "Baz", "Isa", "Cham", "Amari", 
    "Mifsud", "Hadad", "Rahal", "Khoury", "Bazzi", "Basara", "Totah", "Ghannam", "Koury", "Malouf", 
    "Zogby", "Zogby", "Boutros", "Nassar", "Handal", "Hajjar", "Maloof", "Abadi", "Maroun", "Mifsud", 
    "Kalb", "Amari", "Hakimi", "Boutros", "Masih", "Kattan", "Haddad", "Arian", "Nazari", "Assaf", 
    "Attia", "Wasem", "Gerges", "Asker", "Tahan", "Fakhoury", "Shadid", "Sarraf", "Attia", "Naifeh", 
    "Aswad", "Deeb", "Tannous", "Totah", "Cham", "Baba", "Najjar", "Hajjar", "Shamoon", "Handal", 
    "Awad", "Guirguis", "Awad", "Ganem", "Naifeh", "Khoury", "Hajjar", "Moghadam", "Mikhail", "Ghannam", 
    "Guirguis", "Tannous", "Kanaan", "Handal", "Khoury", "Kalb", "Qureshi", "Najjar", "Atiyeh", "Gerges", 
    "Nassar", "Tahan", "Hadad", "Fakhoury", "Salib", "Wasem", "Bitar", "Fakhoury", "Attia", "Awad", 
    "Totah", "Deeb", "Touma", "Botros", "Nazari", "Nahas", "Kouri", "Ghannam", "Assaf", "Asfour", 
    "Sarraf", "Naifeh", "Toma", "Asghar", "Abboud", "Issa", "Sabbag", "Sabbagh", "Isa", "Koury", 
    "Kattan", "Shamoon", "Rahal", "Kalb", "Naser", "Masih", "Sayegh", "Dagher", "Asker", "Maroun", 
    "Dagher", "Sleiman", "Botros", "Sleiman", "Harb", "Tahan", "Tuma", "Said", "Hadad", "Samaha", 
    "Harb", "Cham", "Atiyeh", "Haik", "Malouf", "Bazzi", "Harb", "Malouf", "Ghanem", "Cham", "Asghar", 
    "Samaha", "Khouri", "Nassar", "Rahal", "Baz", "Kalb", "Rahal", "Gerges", "Cham", "Sayegh", 
    "Shadid", "Morcos", "Shamoon", "Hakimi", "Shamoon", "Qureshi", "Ganim", "Shadid", "Khoury", 
    "Boutros", "Hanania", "Antoun", "Naifeh", "Deeb", "Samaha", "Awad", "Asghar", "Awad", "Saliba", 
    "Shamoun", "Mikhail", "Hakimi", "Mikhail", "Cham", "Halabi", "Sarkis", "Kattan", "Nazari", 
    "Safar", "Morcos", "Khoury", "Essa", "Nassar", "Haik", "Shadid", "Fakhoury", "Najjar", "Arian", 
    "Botros", "Daher"
]


In [130]:
french_names = [
    "Abel", "Abraham", "Adam", "Albert", "Allard", "Archambault", "Armistead", "Arthur", "Augustin", 
    "Babineaux", "Baudin", "Beauchene", "Beaulieu", "Beaumont", "Bélanger", "Bellamy", "Bellerose", 
    "Belrose", "Berger", "Béringer", "Bernard", "Bertrand", "Bisset", "Bissette", "Blaise", "Blanc", 
    "Blanchet", "Blanchett", "Bonfils", "Bonheur", "Bonhomme", "Bonnaire", "Bonnay", "Bonner", 
    "Bonnet", "Borde", "Bordelon", "Bouchard", "Boucher", "Brisbois", "Brodeur", "Bureau", "Caron", 
    "Cavey", "Chaput", "Charbonneau", "Charpentier", "Charron", "Chastain", "Chevalier", "Chevrolet", 
    "Cloutier", "Colbert", "Comtois", "Cornett", "Coté", "Coupe", "Courtemanche", "Cousineau", 
    "Couture", "Daniau", "D'aramitz", "Daviau", "David", "Deforest", "Degarmo", "Delacroix", 
    "De la fontaine", "Deniau", "Deniaud", "Deniel", "Denis", "De sauveterre", "Deschamps", 
    "Descoteaux", "Desjardins", "Desrochers", "Desrosiers", "Dubois", "Duchamps", "Dufort", "Dufour", 
    "Duguay", "Dupond", "Dupont", "Durand", "Durant", "Duval", "Émile", "Eustis", "Fabian", "Fabre", 
    "Fabron", "Faucher", "Faucheux", "Faure", "Favager", "Favre", "Favreau", "Fay", "Félix", "Firmin", 
    "Fontaine", "Forest", "Forestier", "Fortier", "Foss", "Fournier", "Gage", "Gagne", "Gagnier", 
    "Gagnon", "Garcon", "Gardinier", "Germain", "Géroux", "Giles", "Girard", "Giroux", "Glaisyer", 
    "Gosse", "Gosselin", "Granger", "Guérin", "Guillory", "Hardy", "Harman", "Hébert", "Herbert", 
    "Herriot", "Jacques", "Janvier", "Jordan", "Joubert", "Labelle", "Lachance", "Lachapelle", 
    "Lamar", "Lambert", "Lane", "Langlais", "Langlois", "Lapointe", "Larue", "Laurent", "Lavigne", 
    "Lavoie", "Leandres", "Lebeau", "Leblanc", "Leclair", "Leclerc", "Lécuyer", "Lefebvre", 
    "Lefévre", "Lefurgey", "Legrand", "Lemaire", "Lémieux", "Leon", "Leroy", "Lesauvage", 
    "Lestrange", "Lévêque", "Lévesque", "Linville", "Lyon", "Lyon", "Maçon", "Marchand", "Marie", 
    "Marion", "Martel", "Martel", "Martin", "Masson", "Masson", "Mathieu", "Mercier", "Merle", 
    "Michaud", "Michel", "Monet", "Monette", "Montagne", "Moreau", "Moulin", "Mullins", "Noel", 
    "Oliver", "Olivier", "Page", "Paget", "Palomer", "Pan", "Pape", "Paquet", "Paquet", "Parent", 
    "Paris", "Parris", "Pascal", "Patenaude", "Paternoster", "Paul", "Pelletier", "Perrault", 
    "Perreault", "Perrot", "Petit", "Pettigrew", "Pierre", "Plamondon", "Plourde", "Poingdestre", 
    "Poirier", "Porcher", "Poulin", "Proulx", "Renaud", "Rey", "Reyer", "Richard", "Richelieu", 
    "Robert", "Roche", "Rome", "Romilly", "Rose", "Rousseau", "Roux", "Roy", "Royer", "Salomon", 
    "Salvage", "Samson", "Samuel", "Sargent", "Sarkozi", "Sarkozy", "Sartre", "Sault", "Sauvage", 
    "Sauvageau", "Sauvageon", "Sauvageot", "Sauveterre", "Savatier", "Segal", "Sergeant", 
    "Séverin", "Simon", "Solomon", "Soucy", "St martin", "St pierre", "Tailler", "Tasse", 
    "Thayer", "Thibault", "Thomas", "Tobias", "Tolbert", "Traver", "Travere", "Travers", "Traverse", 
    "Travert", "Tremblay", "Tremble", "Victor", "Victors", "Villeneuve", "Vincent", "Vipond", 
    "Voclain", "Yount"
]


In [131]:
true = 0
wrong = 0
for name in arabic_names:
    res = predict(name)
    if res == 'Arabic':
        true +=1
    else:
        wrong +=1
        
print((true / (true+wrong)))

0.0


In [132]:
true = 0
wrong = 0
for name in french_names:
    res = predict(name)
    if res == 'French':
        true +=1
    else:
        wrong +=1
        
print((true / (true+wrong)))

0.8916967509025271


In [135]:
PATH_TO_DATA

'../input/names'

In [137]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Read the data
folder_path = "path_to_your_folder"  # Replace with your folder path

data = []
labels = []

for file_name in os.listdir(PATH_TO_DATA):
    if file_name.endswith(".txt"):
        language = file_name.replace(".txt", "")
        with open(os.path.join(PATH_TO_DATA, file_name), 'r', encoding='utf-8') as file:
            for line in file:
                data.append(line.strip())
                labels.append(language)

# Step 2: Prepare the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)
y = labels

# Step 3: Train the logistic regression model
model = LogisticRegression(max_iter=3000)
model.fit(X, y)

# Step 4: Evaluate the model (using the same data for testing)
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 61.42%


In [139]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Read the data
folder_path = "path_to_your_folder"  # Replace with your folder path

data = []
labels = []

for file_name in os.listdir(PATH_TO_DATA):
    if file_name.endswith(".txt"):
        language = file_name.replace(".txt", "")
        with open(os.path.join(PATH_TO_DATA, file_name), 'r', encoding='utf-8') as file:
            for line in file:
                data.append(line.strip())
                labels.append(language)

# Step 2: Prepare the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)
y = labels

# Step 3: Train the random forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Step 4: Evaluate the model (using the same data for testing)
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 95.38%
