# Classifing last names with character-level RNN

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset
`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`

`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`

In [2]:
PATH = Path("/data2/yinterian/name_dataset/")
list(PATH.iterdir())

[PosixPath('/data2/yinterian/name_dataset/names_test.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv.gz'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_test.csv.gz')]

In [3]:
! head /data2/yinterian/name_dataset/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


### Processing data

In [4]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [5]:
# getting a vocabulary of characters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [6]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [7]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [8]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_len, dtype=np.int32)
    n = min(seq_len, x.shape[0])
    z[seq_len - n:] = x[0:n]
    return z

In [9]:
x = pad_seq("aabbb")
x

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29, 29, 30, 30, 30],
      dtype=int32)

In [10]:
# one hot encoding
def seq2matrix(x, vocab_len=55):
    z = np.zeros((x.shape[0], vocab_len))
    z[np.arange(len(x)), x] = 1
    return z

In [11]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        x = seq2matrix(x, self.vocab_len)
        return x, self.y[idx]

In [12]:
train = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
test = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [13]:
batch_size = 2000
n = len(test)
train_dl = DataLoader(train, batch_size=batch_size)
test_dl = DataLoader(test, batch_size=n)

In [14]:
len(train), len(test)

(13374, 13374)

In [15]:
x,y = train[0]
print(x.shape,y)

(15, 55) 2


## Model

In [16]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CharRNN, self).__init__()

        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        combined = torch.cat((x, hidden), 1)
        hidden = F.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Debugging model

In [17]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [18]:
x, y = next(iter(train_dl))

In [19]:
x.shape, y.shape

(torch.Size([2000, 15, 55]), torch.Size([2000]))

In [20]:
batch = x.shape[0]
h = model.initHidden(batch).cuda()
x = x.cuda().float()
y = y.cuda().long()

In [21]:
torch.cat((x[:,0], h), 1).size()

torch.Size([2000, 155])

In [22]:
for ei in range(x.shape[1]):
    x_t, h = model(x[:,ei], h)

In [23]:
loss = F.cross_entropy(x_t, y)
loss

tensor(2.9183, device='cuda:0')

## Training

In [24]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [25]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [26]:
def train(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = x.shape[0]
        h = model.initHidden(batch).cuda()
        loss = 0
        x = x.cuda().float()
        y = y.cuda().long()
        
        for t in range(x.shape[1]):
            out, h = model(x[:,t], h)
        
        loss = F.cross_entropy(out, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [27]:
def predict(model, test_dl):
    model.eval()
    x, y = next(iter(test_dl))
    x = x.cuda().float()
    y = y.cuda().long()
    N = x.shape[0]
    h = model.initHidden(N).cuda()
    for t in range(x.shape[1]):
        out, h = model(x[:,t], h)
    loss = F.cross_entropy(out, y)
    _, pred = torch.max(out, 1)
    acc = pred.eq(y).sum().float()/N
    print("test loss %.3f and accuracy %.3f" % (loss.item(), acc.item()))

In [28]:
vocab_size = 55
hidden_size = 80
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [29]:
def train_loop(model, lr, epochs=20):
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        if i%5 == 1: print("train loss %.3f" % loss)
    predict(model, test_dl)

In [40]:
train_loop(model, lr=0.01, epochs=20)

train loss 2.417
train loss 2.292
train loss 1.947
train loss 1.838
test loss 1.592 and accuracy 0.474


In [41]:
train_loop(model, lr=0.001, epochs=20)

train loss 1.597
train loss 1.548
train loss 1.518
train loss 1.486
test loss 1.429 and accuracy 0.557


In [42]:
train_loop(model, lr=0.001, epochs=20)

train loss 1.472
train loss 1.424
train loss 1.405
train loss 1.383
test loss 1.332 and accuracy 0.585


In [43]:
train_loop(model, lr=0.001, epochs=40)

train loss 1.391
train loss 1.339
train loss 1.328
train loss 1.311
train loss 1.295
train loss 1.279
train loss 1.265
train loss 1.250
test loss 1.205 and accuracy 0.623


In [44]:
train_loop(model, lr=0.001, epochs=40)

train loss 1.282
train loss 1.221
train loss 1.213
train loss 1.203
train loss 1.192
train loss 1.181
train loss 1.170
train loss 1.159
test loss 1.116 and accuracy 0.661


In [45]:
train_loop(model, lr=0.001, epochs=40)

train loss 1.190
train loss 1.137
train loss 1.129
train loss 1.120
train loss 1.110
train loss 1.100
train loss 1.091
train loss 1.081
test loss 1.040 and accuracy 0.680


In [46]:
train_loop(model, lr=0.001, epochs=40)

train loss 1.069
train loss 1.053
train loss 1.054
train loss 1.050
train loss 1.041
train loss 1.035
train loss 1.024
train loss 1.022
test loss 0.980 and accuracy 0.701


# Model with character embeddings 

In [47]:
class NameDatasetEmb(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]

In [48]:
train_2 = NameDatasetEmb(PATH/"names_train.csv", vocab2id, label2id)
test_2 = NameDatasetEmb(PATH/"names_test.csv", vocab2id, label2id)

In [49]:
batch_size = 2000
n = len(test)
train_2_dl = DataLoader(train_2, batch_size=batch_size)
test_2_dl = DataLoader(test_2, batch_size=n)

In [50]:
train_2[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 32, 47, 37, 48],
       dtype=int32), 2)

In [51]:
class CharEmbRNN(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size):
        super(CharEmbRNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(emb_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        x = x.long()
        x = self.emb(x)
        combined = torch.cat((x, hidden), 1)
        hidden = F.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Train 

In [52]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs):
        loss = train(model, optim, train_2_dl)
        if i%5 == 1: print("train loss %.3f" % loss)
    predict(model, test_2_dl)

In [53]:
vocab_size = 55
emb_size = 30
hidden_size = 80
n_classes = 18
model = CharEmbRNN(vocab_size, emb_size, hidden_size, n_classes).cuda()
optim = get_optimizer(model, lr =0.01, wd = 0.0)

In [54]:
train_loop(model, epochs=40, lr=0.01, wd=0.0)

train loss 1.975
train loss 1.207
train loss 1.025
train loss 0.936
train loss 0.831
train loss 0.757
train loss 0.695
train loss 0.644
test loss 0.572 and accuracy 0.826


In [55]:
train_loop(model, epochs=20, lr=0.01, wd=0.0)

train loss 0.842
train loss 0.624
train loss 0.552
train loss 0.509
test loss 0.445 and accuracy 0.866


In [56]:
train_loop(model, epochs=20, lr=0.001, wd=0.0)

train loss 0.448
train loss 0.439
train loss 0.434
train loss 0.428
test loss 0.419 and accuracy 0.874


In [57]:
train_loop(model, epochs=40, lr=0.001, wd=0.00)

train loss 0.423
train loss 0.416
train loss 0.411
train loss 0.405
train loss 0.399
train loss 0.394
train loss 0.388
train loss 0.383
test loss 0.375 and accuracy 0.886


In [58]:
train_loop(model, epochs=20, lr=0.001, wd=0.0)
train_loop(model, epochs=20, lr=0.001, wd=0.0)

train loss 0.379
train loss 0.372
train loss 0.367
train loss 0.362
test loss 0.354 and accuracy 0.891
train loss 0.359
train loss 0.352
train loss 0.348
train loss 0.343
test loss 0.335 and accuracy 0.898


In [59]:
train_loop(model, epochs=20, lr=0.001, wd=0.0)
train_loop(model, epochs=20, lr=0.001, wd=0.0)

train loss 0.340
train loss 0.333
train loss 0.329
train loss 0.325
test loss 0.317 and accuracy 0.904
train loss 0.322
train loss 0.316
train loss 0.312
train loss 0.307
test loss 0.300 and accuracy 0.911


In [60]:
train_loop(model, epochs=20, lr=0.001, wd=0.0)
train_loop(model, epochs=20, lr=0.001, wd=0.0)

train loss 0.306
train loss 0.299
train loss 0.295
train loss 0.291
test loss 0.284 and accuracy 0.917
train loss 0.291
train loss 0.283
train loss 0.280
train loss 0.276
test loss 0.269 and accuracy 0.922


In [61]:
train_loop(model, epochs=20, lr=0.001, wd=0.0)
train_loop(model, epochs=20, lr=0.001, wd=0.0)

train loss 0.276
train loss 0.269
train loss 0.265
train loss 0.261
test loss 0.255 and accuracy 0.927
train loss 0.263
train loss 0.255
train loss 0.251
train loss 0.248
test loss 0.242 and accuracy 0.931


## Exercise
Change the first model to learn a character language model that generates last names.

# References
This notebook is a modified version of this tutorial
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html. Here I implement vanilla RNNs.