# 성으로 국가 맞추기

# Load Data

In [8]:
import glob

path = 'data/names'

filenames = glob.glob(path + '/*.txt')
print(filenames)
print(len(filenames))

['data/names\\Arabic.txt', 'data/names\\Chinese.txt', 'data/names\\Czech.txt', 'data/names\\Dutch.txt', 'data/names\\English.txt', 'data/names\\French.txt', 'data/names\\German.txt', 'data/names\\Greek.txt', 'data/names\\Irish.txt', 'data/names\\Italian.txt', 'data/names\\Japanese.txt', 'data/names\\Korean.txt', 'data/names\\Polish.txt', 'data/names\\Portuguese.txt', 'data/names\\Russian.txt', 'data/names\\Scottish.txt', 'data/names\\Spanish.txt', 'data/names\\Vietnamese.txt']
18


# 이름 사전 만들기
{국가1 : [이름1, 이름2], 국가2 : [이름1, 이름2]...}

In [220]:
import os

lan_name_dict = {}
lan_list = []

for filename in filenames:
    lan = os.path.splitext(os.path.basename(filename))[0]
    lan_list.append(lan)
    names = open(filename, encoding = 'utf-8').read().strip().split()
    lan_name_dict[lan] = names
    
len(lan_list)

18

# Ascii -> Unicode

In [233]:
import unicodedata

def Ascii_to_Unicode(name):
    unicode_name = ''.join([c for n in name for c in unicodedata.normalize('NFD', n)
                           if unicodedata.category(c) != 'Mn' and c in letters])
    return unicode_name

for lan in lan_list:
    names = lan_name_dict[lan]
    new_names = [Ascii_to_Unicode(name) for name in names]
    lan_name_dict[lan] = new_names

lan_name_dict['Italian'][:10]

['Abandonato',
 'Abatangelo',
 'Abatantuono',
 'Abate',
 'Abategiovanni',
 'Abatescianni',
 'Abba',
 'Abbadelli',
 'Abbascia',
 'Abbatangelo']

# Tensor 만들기

In [239]:
import torch
import string

letters = string.ascii_letters + " .,;"
letters_n = len(letters)

# Lookup -> 나중에 embedding도 도전


letter_tensor_dict = {}
for i, letter in enumerate(letters):
    tensor = torch.zeros(1, letters_n)
    tensor[0][i] = 1
    letter_tensor_dict[letter] = tensor

def Name_to_Tensor(name):
    name_tensor = torch.zeros(len(name), 1, letters_n)
    for i, n in enumerate(name):
        name_tensor[i][0] = letter_tensor_dict[n]
        
    return name_tensor

print(letter_tensor_dict['a'])
print(Name_to_Tensor('justin').size())

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])
torch.Size([6, 1, 56])


In [240]:
import torch.nn as nn

input_size = letters_n
hidden_size = 128
output_size = len(lan_list)

class RNN(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim = 1)
        
    def forward(self, inputs, hidden):
        
        combined = torch.cat([inputs, hidden], 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        
        return output, hidden
    
    def initHidden(self):
        
        return torch.zeros(1, self.hidden_size)
    
model = RNN(input_size, hidden_size, output_size)

In [241]:
import torch.optim as optim

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.05)

In [242]:
import random

def RandomTrainExample():
    
    random_n = random.randint(0, len(lan_list)-1)
    random_lan = lan_list[random_n]
    random_name_n = random.randint(0, len(random_lan)-1)
    random_name = lan_name_dict[random_lan][random_name_n]
    lan_tensor = torch.tensor([lan_list.index(random_lan)]).long()
    name_tensor = Name_to_Tensor(random_name)
    
    return random_lan, random_name, lan_tensor, name_tensor

RandomTrainExample()

('Polish',
 'Adamczyk',
 tensor([12]),
 tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0.]],
 
         [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [238]:
epoch_n = 1000

loss_list = []
for epoch in range(1, epoch_n+1):
    hidden = model.initHidden()
    loss_avg = 0
    optimizer.zero_grad()
    
    lan, name, lan_tensor, name_tensor = RandomTrainExample()
    for i in range(name_tensor.size()[0]):
        output, hidden = model(name_tensor[i], hidden)
        
    loss = loss_function(output, lan_tensor)
    loss.backward()
    loss += loss
    
    if epoch % (epoch_n / 10) == 0:
        

Abbott
Ajdrna
Stewart
Alt
Adamczyk
Maalouf
Achterberg
Huynh
Abel
Brown
Abel
Bernard
Adamidis
Alexandropoulos
Abaidulin
Nahas
Aalst
Adamczak
Abt
Almeida
Abel
Aihara
Abrahams
Barros
Abascal
Chang
Abategiovanni
Araullo
Albert
Antonowitsch
Aalsburg
Ban
Andrysiak
Alt
Vu
Ang
Abaimoff
Abaimoff
Aihara
Achteren
Adamou
Abatangelo
Aodha
Anderson
Bang
Bernard
Archambault
Abba
Baik
Araullo
Achterberg
Andrysiak
Bang
Abarca
Adamidis
Abbey
Ahn
Alves
Khoury
Thomson
Agelakos
Ababko
Brown
Achteren
Brown
Thomson
Ban
Auttenberg
Ababko
Adamou
Aalsburg
Chang
Abbing
Abt
Barros
Aodh
Campbell
Abraham
Alves
Aalst
Abbas
Ahn
Abukara
Abasolo
Huynh
Adamczak
Adsit
Abba
Abrahams
Tron
Cabral
Archambault
Adamczyk
Cabral
Abatangelo
Gerges
Aalsburg
Abraham
Abaroa
Abel
Araujo
Aalsburg
Bao
Nazari
Robertson
Aizawa
Byon
Ang
Agelakos
Adamczyk
Vo
Cabral
Adam
Abate
Abandonato
Aihara
Abandonato
Vo
Abbing
Ajibana
Aonghuis
Abaimoff
Abatantuono
Adam
Barros
Maalouf
Vo
Maalouf
Anderson
Abel
Abl
Alexandropoulos
Alexandropoulos
Gerges
K