[View in Colaboratory](https://colab.research.google.com/github/chokkan/deeplearning/blob/master/notebook/name.ipynb)

In [0]:
!wget https://download.pytorch.org/tutorial/data.zip

--2018-07-22 01:09:46--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 13.32.80.22, 13.32.80.66, 13.32.80.97, ...
Connecting to download.pytorch.org (download.pytorch.org)|13.32.80.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2018-07-22 01:09:47 (4.96 MB/s) - ‘data.zip’ saved [2882130/2882130]



In [0]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating: data/names/Vietnamese.txt  


In [0]:
import string
import unicodedata

# Alphabet [a-zA-Z .,;']
alphabet = set(string.ascii_letters + " .,;'")

def normalize(s):
    # Apply canonical decomposition, and ignore non-alphabet symbols.
    return ''.join(
        c for c in unicodedata.normalize('NFD', s) if c in all_letters
        )

In [0]:
normalize('Ślusàrski')

'Slusarski'

In [0]:
import glob
import json
import os

data = {}
srcs = glob.glob('data/names/*.txt')
for src in srcs:
    lang = os.path.basename(src)[:-4]
    names = [normalize(s.strip('\n')) for s in open(src)]
    data[lang] = names
    
with open('names.json', 'w') as fo:
    json.dump(data, fo)

In [0]:
!pip install torch torchvision

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/69/43/380514bd9663f1bf708abeb359b8b48d3fabb1c8e95bb3427a980a064c57/torch-0.4.0-cp36-cp36m-manylinux1_x86_64.whl (484.0MB)
[K    100% |████████████████████████████████| 484.0MB 20kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x5be9a000 @  0x7f56db78d1c4 0x46d6a4 0x5fcbcc 0x4c494d 0x54f3c4 0x553aaf 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54e4c8
[?25hCollecting torchvision
[?25l  Downloading https://files.pythonhosted.org/packages/ca/0d/f00b2885711e08bd71242ebe7b96561e6f6d01fdb4b9dcf4d37e2e13c5e1/torchvision-0.2.1-py2.py3-none-any.whl (54kB)
[K    100% |████████████████████████████████| 61kB 11.6MB/s 
Collecting pillow>=4.1.1 (from torchvision)
[?25l  Downloading https://files.pythonhosted.org/packages/d1/24/f53ff6b61b3d728b90934bd

In [0]:
import json
data = json.load(open('names.json'))

In [0]:
def build_vocabulary(data):
    V = set()
    for lang, names in data.items():
        for name in names:
            for c in name:
                V.add(c)
    return sorted(V)

def build_labels(data):
    return data.keys()

def build_mapping(items):
    M = {}
    for item in items:
        M.setdefault(item, len(M))
    return M

In [0]:
V = build_vocabulary(data)
Vmap = build_mapping(V)
Y = build_labels(data)
Ymap = build_mapping(Y)

In [0]:
def build_dataset(data, Vmap, Ymap):
    D = []
    for lang, names in data.items():
        for name in names:
            D.append(([Vmap[c] for c in name], Ymap[lang]))
    return D


In [0]:
dataset = build_dataset(data, Vmap, Ymap)

In [0]:
dataset[0]

([3, 30, 33, 40], 0)

In [0]:
dataset[1]

([3, 30, 46, 29, 36, 29, 41], 0)

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

dtype = torch.float

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=1)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input, hidden):
        output, hidden = self.rnn(input, hidden)
        output = self.fc(output[-1])
        return output
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

def label_to_tensor(l):
    tensor = torch.zeros(1, dtype=torch.long)
    tensor[0] = l
    return tensor

def seq_to_tensor(seq):
    tensor = torch.zeros(len(seq), 1, len(V), dtype=dtype)
    for i, l in enumerate(seq):
        tensor[i][0][l] = 1
    return tensor

model = SimpleRNN(len(V), 128, len(Y))
loss_fn = nn.CrossEntropyLoss(size_average=False)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for t in range(10):
    train_loss = 0.
    train_correct = 0
    random.shuffle(dataset)
    
    # Training loop for every instance.
    for (x, y) in dataset:
        input = seq_to_tensor(x)
        y = label_to_tensor(y)
        hidden = model.initHidden()
        
        # Make predictions with the current parameters.
        y_pred = model(input, hidden)
        _, predicted = torch.max(y_pred.data, 1)
        train_correct += (predicted == y).sum().item()
        
        # Compute the loss value.
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()
        
        # Update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(t, train_loss, float(train_correct) / len(dataset))

0 33434.097751282156 0.5104612932151041
1 26835.434397691162 0.6079007671615024
2 23343.724399088707 0.6673806914416658
3 21579.962789889745 0.6916907442462887
4 20401.376432524296 0.7064860017933645
5 19530.10094793685 0.7160007970509116
6 18727.49529565179 0.7282056391351998
7 17974.368901541908 0.73737172461891
8 17453.31132408297 0.7436983162299492
9 16904.26970829886 0.7527149546677294


In [0]:
name = 'Okazaki'
x = seq_to_tensor([Vmap[c] for c in name])
hidden = model.initHidden()
y_pred = model(x, hidden)
scores = []
for lang, index in Ymap.items():
    scores.append((lang, float(y_pred[0][index])))

In [0]:
scores

[('French', -0.9161937236785889),
 ('Scottish', -2.1994919776916504),
 ('Italian', 0.7535350918769836),
 ('Portuguese', -1.298393726348877),
 ('English', -2.2725605964660645),
 ('Russian', 4.2647833824157715),
 ('Dutch', -0.439115047454834),
 ('German', -1.0358805656433105),
 ('Chinese', -1.7894344329833984),
 ('Arabic', 0.5966547131538391),
 ('Polish', 2.590327501296997),
 ('Japanese', 6.525790214538574),
 ('Czech', 2.0998802185058594),
 ('Irish', -0.326322078704834),
 ('Greek', -0.5004627704620361),
 ('Korean', -2.530669927597046),
 ('Spanish', -0.40204185247421265),
 ('Vietnamese', -1.2539485692977905)]