# Implement name classifier
1. With `GPU`
2. With `data parallel`
3. Use `pad-pack`

In [1]:
import typing
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from utils import *

torch.manual_seed(1249583)

# See the details in `Dataset` section
SEQUENCE_LENGTH = 19
COUNTRY_LENGTH = 18

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

In [2]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def train(model, device, train_loader, optimizer, epoch, criterion):
    """
    This function has one line different from the ordinary `train()` function
    It has `make_variables()` to convert tuple of names to be a tensor
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        import ipdb; ipdb.set_trace()

        data, lengths = ordered_batch(data)
        
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        tmp = output.view(-1, COUNTRY_LENGTH)
        loss = criterion(tmp, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 1000 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    y_test = []
    y_pred = []
    with torch.no_grad():
        for data, target in tqdm(test_loader):
            """
            Will be next
            """
            data = make_variables(data)
            data, target = data.to(device), target.to(device)
            output = model(data)
            tmp = output.view(-1, COUNTRY_LENGTH)
            
            test_loss += criterion(tmp, target).item() # sum up batch loss
            pred = tmp.max(1, keepdim=True)[1] # get the index of the max log-probability

            pred_tmp = pred.view(-1)
            pred_list = pred_tmp.tolist()
            target_list = target.tolist()
            
            y_test += target_list
            y_pred += pred_list
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    # Confusion matrix
    confusion_mtx = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(confusion_mtx, classes=countries, normalize=True,
                          title='Confusion matrix')


In [8]:
tmp, lengths = ordered_batch(['Tiras', 'Ann', 'El'])
tmp, lengths

(tensor([[116, 105, 114,  97, 115],
         [ 97, 110, 110,   0,   0],
         [101, 108,   0,   0,   0]]), [5, 3, 2])

In [9]:
emb = nn.Embedding(128, 5)

In [10]:
embedding = emb(tmp)
embedding

tensor([[[-0.3663, -1.6736, -1.5389, -0.7287,  0.2120],
         [ 0.4269, -1.8379,  1.8960, -0.8565,  0.3572],
         [-0.9517, -1.1371,  1.5963,  0.3610,  0.7760],
         [ 0.0619,  2.4133,  2.6724,  0.5250, -0.1814],
         [-0.0545, -1.4245,  1.9233,  0.6525,  0.3656]],

        [[ 0.0619,  2.4133,  2.6724,  0.5250, -0.1814],
         [-0.6600,  0.4255, -0.4647,  0.7076,  0.8170],
         [-0.6600,  0.4255, -0.4647,  0.7076,  0.8170],
         [ 0.3030, -0.2241,  1.9057,  1.2253,  0.0273],
         [ 0.3030, -0.2241,  1.9057,  1.2253,  0.0273]],

        [[-0.3473,  1.7434,  1.8095,  0.2949, -0.2356],
         [ 0.0537,  2.5934,  0.3567,  1.0987,  1.1411],
         [ 0.3030, -0.2241,  1.9057,  1.2253,  0.0273],
         [ 0.3030, -0.2241,  1.9057,  1.2253,  0.0273],
         [ 0.3030, -0.2241,  1.9057,  1.2253,  0.0273]]],
       grad_fn=<EmbeddingBackward>)

In [11]:
embedding.size()

torch.Size([3, 5, 5])

In [13]:
from pprint import pprint

pps_in = torch.nn.utils.rnn.pack_padded_sequence(embedding, batch_first=True, lengths=lengths)
pps_in

PackedSequence(data=tensor([[-0.3663, -1.6736, -1.5389, -0.7287,  0.2120],
        [ 0.0619,  2.4133,  2.6724,  0.5250, -0.1814],
        [-0.3473,  1.7434,  1.8095,  0.2949, -0.2356],
        [ 0.4269, -1.8379,  1.8960, -0.8565,  0.3572],
        [-0.6600,  0.4255, -0.4647,  0.7076,  0.8170],
        [ 0.0537,  2.5934,  0.3567,  1.0987,  1.1411],
        [-0.9517, -1.1371,  1.5963,  0.3610,  0.7760],
        [-0.6600,  0.4255, -0.4647,  0.7076,  0.8170],
        [ 0.0619,  2.4133,  2.6724,  0.5250, -0.1814],
        [-0.0545, -1.4245,  1.9233,  0.6525,  0.3656]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([3, 3, 2, 1, 1]))

In [14]:
pps_out = torch.nn.utils.rnn.pad_packed_sequence(pps_in, batch_first=True)
pps_out

(tensor([[[-0.3663, -1.6736, -1.5389, -0.7287,  0.2120],
          [ 0.4269, -1.8379,  1.8960, -0.8565,  0.3572],
          [-0.9517, -1.1371,  1.5963,  0.3610,  0.7760],
          [ 0.0619,  2.4133,  2.6724,  0.5250, -0.1814],
          [-0.0545, -1.4245,  1.9233,  0.6525,  0.3656]],
 
         [[ 0.0619,  2.4133,  2.6724,  0.5250, -0.1814],
          [-0.6600,  0.4255, -0.4647,  0.7076,  0.8170],
          [-0.6600,  0.4255, -0.4647,  0.7076,  0.8170],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
 
         [[-0.3473,  1.7434,  1.8095,  0.2949, -0.2356],
          [ 0.0537,  2.5934,  0.3567,  1.0987,  1.1411],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
        grad_fn=<TransposeBackward0>), tensor([5, 3, 2]))

In [15]:
gru = nn.GRU(5, 18, 2)

In [22]:
pps_out_n_size, ht = gru(pps_in)

In [23]:
result = torch.nn.utils.rnn.pad_packed_sequence(pps_out_n_size, batch_first=True)

In [25]:
result

(tensor([[[ 0.1267,  0.0813, -0.1555,  0.1041, -0.1072, -0.1833, -0.0070,
            0.1165, -0.1294,  0.0291,  0.0284,  0.1789,  0.0305, -0.0236,
           -0.0580,  0.0699, -0.0144, -0.0230],
          [ 0.1818,  0.0287, -0.2037,  0.2784, -0.1972, -0.1245, -0.0552,
            0.1694, -0.1861,  0.0257,  0.1175,  0.2766,  0.0817, -0.0955,
           -0.0861,  0.1076,  0.0452, -0.0568],
          [ 0.1858,  0.0343, -0.2158,  0.2969, -0.2918, -0.0639, -0.0937,
            0.1674, -0.1966,  0.0027,  0.1714,  0.3586,  0.0530, -0.1475,
           -0.1228,  0.1286,  0.0940, -0.0035],
          [ 0.1314,  0.1084, -0.0795,  0.2666, -0.2950,  0.0697, -0.0708,
            0.1269, -0.1151,  0.0453,  0.2150,  0.2951,  0.1183, -0.0506,
           -0.1685,  0.1065,  0.1900, -0.0055],
          [ 0.1252,  0.0999, -0.0762,  0.2773, -0.3249,  0.1369, -0.0849,
            0.1273, -0.1031,  0.0541,  0.2703,  0.3272,  0.1012, -0.0660,
           -0.1749,  0.1244,  0.2232, -0.0129]],
 
         [[ 0.024

In [76]:
class ModelX(nn.Module):
    def __init__(self, input_size=256, hidden_size=5, output_size=18, n_layers=1):
        """
        Because word embedding is working with ascii. It has to use `input_size=128, hidden_size=256`
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # input_size 256, hidden_size 256.
        # https://python-reference.readthedocs.io/en/latest/docs/str/ASCII.html
        self.embedding = nn.Embedding(128, hidden_size) # embedding_dim MUST matches with GRU's input_size 
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input):
        """
        Do not remove `print`. Leave it be a historical footprint for I myself in the future
        """
        
        # Sung Kim run this all at once (over the whole input sequence)
        # input = B x S . size(0) = B
#         batch_size = input.size(0)
        
        # input: B x S -- (transpose) --> S x B
#         input = input.t()
#         import ipdb; ipdb.set_trace()
#         ipdb> input
#         tensor([[118,  97, 108,  99, 104, 105, 107, 111, 118, 115, 107, 105],
#             [ 99,  97, 114, 100, 111, 122, 111,   0,   0,   0,   0,   0]])
#         ipdb> input.size()
#         torch.Size([2, 12])
        input, lengths = ordered_batch(['Tiras', 'Ann', 'El'])

        # Embedding S x B -> S x B x I (embedding size)
        print(f" input size: {input.size()}")
        embedded = self.embedding(input)
        embedded = embedded.clone().detach() # Make new tensor because of `EmbeddingGrad`
        print(f" embeddding size: {embedded.size()}")
        
        
        
        # Make a hidden
        hidden = self._init_hidden(batch_size)
        
        
        pps_in = torch.nn.utils.rnn.pack_padded_sequence(embedded, batch_first=True, lengths=lengths)
        packed_output, hidden = self.gru(pps_in, hidden)
        print(f" gru hidden output: {hidden.size()}")
        
        result = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Use last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(result)
        print(f" fc output: {fc_output.size()}")
        return fc_output
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        USE_CUDA = torch.cuda.is_available()
        DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
        return hidden.clone().detach().to(DEVICE)

In [77]:
# classifier = RNNClassifier()
classifier = ModelX()
arr, _ = str2ascii_arr('adylov')
inp = torch.tensor([arr], dtype=torch.long)
out = classifier(inp)
print(f"\nin: {inp.size()}, \nout: {out.size()}")

NameError: name 'str2ascii_arr' is not defined

In [243]:
names = ['adylov', 'solan', 'hard', 'san']
# classifier = RNNClassifier()
classifier = ModelX()
inputs = make_var(names)
out = classifier(inputs)
print(f"\nbatch in: {inputs.size()}, \nbatch out: {out.size()}")

 input size: torch.Size([6, 4])
 embeddding size: torch.Size([6, 4, 5])
 gru hidden output: torch.Size([1, 4, 5])
 fc output: torch.Size([1, 4, 18])

batch in: torch.Size([4, 6]), 
batch out: torch.Size([1, 4, 18])


In [49]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size=256, hidden_size=256, output_size=18, n_layers=1):
        """
        Because word embedding is working with ascii. It has to use `input_size=128, hidden_size=256`
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # input_size 256, hidden_size 256.
        # https://python-reference.readthedocs.io/en/latest/docs/str/ASCII.html
        self.embedding = nn.Embedding(128, hidden_size) # embedding_dim MUST matches with GRU's input_size 
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input):
        """
        Do not remove `print`. Leave it be a historical footprint for I myself in the future
        """
        
        # Sung Kim run this all at once (over the whole input sequence)
        # input = B x S . size(0) = B
        batch_size = input.size(0)
        
        # input: B x S -- (transpose) --> S x B
        input = input.t()
        
        # Embedding S x B -> S x B x I (embedding size)
        print(f" input size: {input.size()}")
        embedded = self.embedding(input)
        embedded = embedded.clone().detach() # Make new tensor because of `EmbeddingGrad`
        print(f" embeddding size: {embedded.size()}")
        
        # Make a hidden
        hidden = self._init_hidden(batch_size)
        output, hidden = self.gru(embedded, hidden)
        print(f" gru hidden output: {hidden.size()}")
        
        # Use last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(hidden)
        print(f" fc output: {fc_output.size()}")
        return fc_output
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        USE_CUDA = torch.cuda.is_available()
        DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
        return hidden.clone().detach().to(DEVICE)

# Dataset

In [50]:
class NameDataSet(Dataset):
    def __init__(self, filename='names_train.csv'):
        trainset = pd.read_csv(filename, header=None)
        trainset.columns = ['name', 'country']
        countries = sorted(list(trainset.country.drop_duplicates()))

        self.trainset = trainset
        self.countries = countries
        self.len = len(trainset)        

    def __getitem__(self, index):
        country = self.trainset.iloc[index]['country']
        return self.trainset.iloc[index]['name'], self.countries.index(country)

    def __len__(self):
        return self.len
        

In [51]:
train_dataset = NameDataSet(filename='../lessons/names_train.csv')
test_dataset = NameDataSet(filename='../lessons/names_test.csv')

In [52]:
train_loader = DataLoader(dataset=train_dataset, sampler=ImbalancedDatasetSampler(train_dataset), batch_size=2, num_workers=2) # 2 * 9 * 743 
test_loader = DataLoader(dataset=test_dataset, sampler=ImbalancedDatasetSampler(test_dataset), batch_size=2, num_workers=2) # 4 * 25 * 67


# 1. Model

In [53]:
# model = RNNClassifier().to(DEVICE)

In [55]:
model = ModelX().to(DEVICE)

# 2. Criterion & Loss

In [56]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [59]:
for epoch in tqdm(range(1, 1 + 1)):
    train(model, DEVICE, train_loader, optimizer, epoch, criterion)
    test(model, DEVICE, test_loader, criterion)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

> [0;32m<ipython-input-58-0a850967a2f5>[0m(15)[0;36mtrain[0;34m()[0m
[0;32m     14 [0;31m[0;34m[0m[0m
[0m[0;32m---> 15 [0;31m        [0mdata[0m[0;34m,[0m [0mlengths[0m [0;34m=[0m [0mordered_batch[0m[0;34m([0m[0mdata[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m[0;34m[0m[0m
[0m
ipdb> list
[1;32m     10 [0m    """
[1;32m     11 [0m    [0mmodel[0m[0;34m.[0m[0mtrain[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[1;32m     12 [0m    [0;32mfor[0m [0mbatch_idx[0m[0;34m,[0m [0;34m([0m[0mdata[0m[0;34m,[0m [0mtarget[0m[0;34m)[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mtrain_loader[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[1;32m     13 [0m        [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[1;32m     14 [0m[0;34m[0m[0m
[0;32m---> 15 [0;31m        [0mdata[0m[0;34m,[0m [0mlengths[0m [0;34m=[0m [0mordered_batch[0m[0;34m([0m[0

BdbQuit: 