# Implement name classifier
1. With `GPU`
2. With `data parallel`
3. Use `pad-pack`

In [1]:
import typing
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from utils import *

torch.manual_seed(1249583)

# See the details in `Dataset` section
SEQUENCE_LENGTH = 19
COUNTRY_LENGTH = 18

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

In [42]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def train(model, device, train_loader, optimizer, epoch, criterion):
    """
    This function has one line different from the ordinary `train()` function
    It has `make_variables()` to convert tuple of names to be a tensor
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = make_var(data)        
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        import ipdb; ipdb.set_trace()
        tmp = output.view(-1, COUNTRY_LENGTH)
        loss = criterion(tmp, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 1000 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    y_test = []
    y_pred = []
    with torch.no_grad():
        for data, target in tqdm(test_loader):
            data = make_var(data)
            data, target = data.to(device), target.to(device)
            output = model(data)
            tmp = output.view(-1, COUNTRY_LENGTH)
            
            test_loss += criterion(tmp, target).item() # sum up batch loss
            pred = tmp.max(1, keepdim=True)[1] # get the index of the max log-probability

            pred_tmp = pred.view(-1)
            pred_list = pred_tmp.tolist()
            target_list = target.tolist()
            
            y_test += target_list
            y_pred += pred_list
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    # Confusion matrix
    confusion_mtx = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(confusion_mtx, classes=countries, normalize=True,
                          title='Confusion matrix')


In [6]:
tmp = make_var(['Tiras', 'Ann', 'El'])
tmp

tensor([[116, 105, 114,  97, 115],
        [ 97, 110, 110,   0,   0],
        [101, 108,   0,   0,   0]])

In [15]:
lengths = count_non_zero_length(tmp)
lengths

[5, 3, 2]

In [16]:
emb = nn.Embedding(128, 5)

In [17]:
embedding = emb(tmp)
embedding

tensor([[[ 0.6439,  0.2169, -0.8762, -1.5254,  0.3906],
         [-0.9603,  1.1842, -0.7863,  0.0424,  0.6137],
         [-0.2403,  0.8440, -1.5594, -0.3972,  0.6235],
         [-0.1685,  1.4149, -0.6990, -0.6131, -0.3523],
         [-0.2568,  0.3000,  0.1996, -0.7259,  0.1190]],

        [[-0.1685,  1.4149, -0.6990, -0.6131, -0.3523],
         [-1.1444,  0.3401,  0.2428,  0.3262, -0.0667],
         [-1.1444,  0.3401,  0.2428,  0.3262, -0.0667],
         [ 0.2763,  0.8555, -1.6694,  0.0883, -0.4541],
         [ 0.2763,  0.8555, -1.6694,  0.0883, -0.4541]],

        [[-0.4044,  0.1883,  0.6204,  1.1825, -1.1356],
         [ 1.8757, -1.0296,  0.5458, -0.4489,  0.0555],
         [ 0.2763,  0.8555, -1.6694,  0.0883, -0.4541],
         [ 0.2763,  0.8555, -1.6694,  0.0883, -0.4541],
         [ 0.2763,  0.8555, -1.6694,  0.0883, -0.4541]]],
       grad_fn=<EmbeddingBackward>)

In [18]:
embedding.size()

torch.Size([3, 5, 5])

In [19]:
from pprint import pprint

pps_in = torch.nn.utils.rnn.pack_padded_sequence(embedding, batch_first=True, lengths=lengths)
pps_in

PackedSequence(data=tensor([[ 0.6439,  0.2169, -0.8762, -1.5254,  0.3906],
        [-0.1685,  1.4149, -0.6990, -0.6131, -0.3523],
        [-0.4044,  0.1883,  0.6204,  1.1825, -1.1356],
        [-0.9603,  1.1842, -0.7863,  0.0424,  0.6137],
        [-1.1444,  0.3401,  0.2428,  0.3262, -0.0667],
        [ 1.8757, -1.0296,  0.5458, -0.4489,  0.0555],
        [-0.2403,  0.8440, -1.5594, -0.3972,  0.6235],
        [-1.1444,  0.3401,  0.2428,  0.3262, -0.0667],
        [-0.1685,  1.4149, -0.6990, -0.6131, -0.3523],
        [-0.2568,  0.3000,  0.1996, -0.7259,  0.1190]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([3, 3, 2, 1, 1]))

In [20]:
pps_out = torch.nn.utils.rnn.pad_packed_sequence(pps_in, batch_first=True)
pps_out

(tensor([[[ 0.6439,  0.2169, -0.8762, -1.5254,  0.3906],
          [-0.9603,  1.1842, -0.7863,  0.0424,  0.6137],
          [-0.2403,  0.8440, -1.5594, -0.3972,  0.6235],
          [-0.1685,  1.4149, -0.6990, -0.6131, -0.3523],
          [-0.2568,  0.3000,  0.1996, -0.7259,  0.1190]],
 
         [[-0.1685,  1.4149, -0.6990, -0.6131, -0.3523],
          [-1.1444,  0.3401,  0.2428,  0.3262, -0.0667],
          [-1.1444,  0.3401,  0.2428,  0.3262, -0.0667],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
 
         [[-0.4044,  0.1883,  0.6204,  1.1825, -1.1356],
          [ 1.8757, -1.0296,  0.5458, -0.4489,  0.0555],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
        grad_fn=<TransposeBackward0>), tensor([5, 3, 2]))

In [21]:
gru = nn.GRU(5, 18, 2)

In [22]:
pps_out_n_size, ht = gru(pps_in)

In [23]:
result = torch.nn.utils.rnn.pad_packed_sequence(pps_out_n_size, batch_first=True)

In [24]:
result

(tensor([[[-0.0398, -0.0322, -0.0324,  0.0610, -0.1467,  0.0311,  0.0448,
           -0.0966,  0.0651, -0.0526,  0.0418,  0.1969, -0.1536,  0.0622,
            0.1267,  0.0826,  0.0359, -0.0155],
          [-0.0782, -0.1359, -0.1038,  0.2108, -0.1943,  0.0844,  0.0511,
           -0.1212,  0.0822, -0.1003,  0.0061,  0.2858, -0.1862,  0.0660,
            0.1276,  0.1697,  0.0536, -0.0362],
          [-0.1033, -0.1874, -0.1538,  0.3281, -0.2124,  0.1250,  0.0410,
           -0.1497,  0.0773, -0.1291, -0.0145,  0.3420, -0.1857,  0.0656,
            0.1044,  0.2455,  0.0378, -0.0688],
          [-0.1629, -0.1956, -0.1837,  0.3789, -0.2141,  0.1141,  0.0493,
           -0.1904,  0.0883, -0.1211,  0.0082,  0.3449, -0.2034,  0.0834,
            0.0911,  0.3039,  0.0361, -0.0964],
          [-0.1970, -0.1027, -0.1533,  0.3665, -0.2631,  0.0814,  0.0782,
           -0.1954,  0.0884, -0.1026,  0.0082,  0.3306, -0.2201,  0.1205,
            0.1267,  0.3046,  0.0361, -0.0902]],
 
         [[-0.075

In [58]:
class ModelX(nn.Module):
    def __init__(self, input_size=256, hidden_size=5, output_size=18, n_layers=1):
        """
        Because word embedding is working with ascii. It has to use `input_size=128, hidden_size=256`
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # input_size 256, hidden_size 256.
        # https://python-reference.readthedocs.io/en/latest/docs/str/ASCII.html
        self.embedding = nn.Embedding(128, hidden_size) # embedding_dim MUST matches with GRU's input_size 
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
    
        # Decoder layer to tune up the `output` dimension
        # TODO
    
    def forward(self, input):
        """
        Do not remove `print`. Leave it be a historical footprint for I myself in the future
        """
        # input = B x S . size(0) = B
        batch_size = input.size(0)
        lengths = count_non_zero_length(input)
        
        # Embedding S x B -> S x B x I (embedding size)
        print(f" input size: {input.size()}")
        embedded = self.embedding(input)
        embedded = embedded.clone().detach() # Make new tensor because of `EmbeddingGrad`
        print(f" embeddding size: {embedded.size()}")
        
        
        
        # Make a hidden
        hidden = self._init_hidden(batch_size)
        
        pps_in = torch.nn.utils.rnn.pack_padded_sequence(embedded, batch_first=True, lengths=lengths)
        packed_output, hidden = self.gru(pps_in, hidden)
        print(f" gru hidden output: {hidden.size()}")
        
        result, pps_out_size = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Use last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(result)
        print(f" fc output: {fc_output.size()}")
        import ipdb; ipdb.set_trace()
        return fc_output
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        USE_CUDA = torch.cuda.is_available()
        DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
        return hidden.clone().detach().to(DEVICE)

In [60]:
# classifier = RNNClassifier()
classifier = ModelX()
arr, _ = str2ascii_arr('adylov')
inp = torch.tensor([arr], dtype=torch.long)
out = classifier(inp)
print(f"\nin: {inp.size()}, \nout: {out.size()}")

 input size: torch.Size([1, 6])
 embeddding size: torch.Size([1, 6, 5])
 gru hidden output: torch.Size([1, 1, 5])
 fc output: torch.Size([1, 6, 18])
> [0;32m<ipython-input-58-b2aa8ba28e34>[0m(46)[0;36mforward[0;34m()[0m
[0;32m     45 [0;31m        [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 46 [0;31m        [0;32mreturn[0m [0mfc_output[0m[0;34m[0m[0m
[0m[0;32m     47 [0;31m[0;34m[0m[0m
[0m
ipdb> fc_output
tensor([[[-0.2341, -0.1585,  0.3301,  0.4763, -0.4598,  0.4107,  0.3497,
           0.0203,  0.2602,  0.2620,  0.1860, -0.1730,  0.1561, -0.3810,
           0.2841, -0.1522,  0.0272,  0.1510],
         [-0.3585, -0.2016,  0.4147,  0.2924, -0.5790,  0.3416,  0.4077,
          -0.0244,  0.5812,  0.2951,  0.2578, -0.2083,  0.2215, -0.2899,
           0.4685, -0.0249,  0.1650, -0.1136],
         [-0.1940, -0.1583,  0.5266,  0.4613, -0.6034,  0.4934,  0.2155,
          -0.1

BdbQuit: 

In [57]:
my_view = out.view(-1, 18)
print(my_view.size())
print(my_view)

torch.Size([6, 18])
tensor([[ 0.2650,  0.1078,  0.4122, -0.4748, -0.2397, -0.0794,  0.4312, -0.0862,
          0.2040,  0.0309,  0.2786, -0.0256, -0.2849,  0.3879,  0.4688, -0.2300,
          0.3165,  0.3080],
        [ 0.2847,  0.1856,  0.6050, -0.5111, -0.1214, -0.1743,  0.5501, -0.1887,
          0.2534, -0.1171,  0.4144,  0.0651, -0.3758,  0.4415,  0.5154, -0.2968,
          0.1395,  0.3997],
        [ 0.2395,  0.1942,  0.5395, -0.2764, -0.3767, -0.1780,  0.2264, -0.2316,
          0.0061, -0.1115,  0.1503,  0.0484, -0.3791,  0.3653,  0.5168, -0.0732,
          0.2596,  0.3354],
        [ 0.3090,  0.1471,  0.4178, -0.2902, -0.4253, -0.2231,  0.1582, -0.2118,
          0.0295, -0.0431,  0.2183,  0.0966, -0.4682,  0.3416,  0.3960, -0.1166,
          0.4257,  0.4236],
        [ 0.2306,  0.1888,  0.4193, -0.2122, -0.5594, -0.2429, -0.0150, -0.2258,
         -0.0891, -0.0667,  0.0550,  0.0142, -0.4226,  0.2745,  0.3441, -0.0214,
          0.4250,  0.3577],
        [-0.0126,  0.3229,  0.

In [45]:
names = ['adylov', 'solan', 'hard', 'san']
# classifier = RNNClassifier()
classifier = ModelX()
inputs = make_var(names)
out = classifier(inputs)
print(f"\nbatch in: {inputs.size()}, \nbatch out: {out.size()}")

 input size: torch.Size([4, 6])
 embeddding size: torch.Size([4, 6, 5])
 gru hidden output: torch.Size([1, 4, 5])
 fc output: torch.Size([4, 6, 18])

batch in: torch.Size([4, 6]), 
batch out: torch.Size([4, 6, 18])


In [28]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size=256, hidden_size=256, output_size=18, n_layers=1):
        """
        Because word embedding is working with ascii. It has to use `input_size=128, hidden_size=256`
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # input_size 256, hidden_size 256.
        # https://python-reference.readthedocs.io/en/latest/docs/str/ASCII.html
        self.embedding = nn.Embedding(128, hidden_size) # embedding_dim MUST matches with GRU's input_size 
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input):
        """
        Do not remove `print`. Leave it be a historical footprint for I myself in the future
        """
        
        # Sung Kim run this all at once (over the whole input sequence)
        # input = B x S . size(0) = B
        batch_size = input.size(0)
        
        # input: B x S -- (transpose) --> S x B
        input = input.t()
        
        # Embedding S x B -> S x B x I (embedding size)
        print(f" input size: {input.size()}")
        embedded = self.embedding(input)
        embedded = embedded.clone().detach() # Make new tensor because of `EmbeddingGrad`
        print(f" embeddding size: {embedded.size()}")
        
        # Make a hidden
        hidden = self._init_hidden(batch_size)
        output, hidden = self.gru(embedded, hidden)
        print(f" gru hidden output: {hidden.size()}")
        
        # Use last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(hidden)
        print(f" fc output: {fc_output.size()}")
        return fc_output
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        USE_CUDA = torch.cuda.is_available()
        DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
        return hidden.clone().detach().to(DEVICE)

# Dataset

In [29]:
class NameDataSet(Dataset):
    def __init__(self, filename='names_train.csv'):
        trainset = pd.read_csv(filename, header=None)
        trainset.columns = ['name', 'country']
        countries = sorted(list(trainset.country.drop_duplicates()))

        self.trainset = trainset
        self.countries = countries
        self.len = len(trainset)        

    def __getitem__(self, index):
        country = self.trainset.iloc[index]['country']
        return self.trainset.iloc[index]['name'], self.countries.index(country)

    def __len__(self):
        return self.len
        

In [30]:
train_dataset = NameDataSet(filename='../lessons/names_train.csv')
test_dataset = NameDataSet(filename='../lessons/names_test.csv')

In [31]:
%%time
train_loader = DataLoader(dataset=train_dataset, sampler=ImbalancedDatasetSampler(train_dataset), batch_size=2, num_workers=2) # 2 * 9 * 743 
test_loader = DataLoader(dataset=test_dataset, sampler=ImbalancedDatasetSampler(test_dataset), batch_size=2, num_workers=2) # 4 * 25 * 67


CPU times: user 8.79 s, sys: 0 ns, total: 8.79 s
Wall time: 8.8 s


# 1. Model

In [53]:
# model = RNNClassifier().to(DEVICE)

In [46]:
model = ModelX().to(DEVICE)

# 2. Criterion & Loss

In [47]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [48]:
for epoch in tqdm(range(1, 1 + 1)):
    train(model, DEVICE, train_loader, optimizer, epoch, criterion)
    test(model, DEVICE, test_loader, criterion)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

 input size: torch.Size([2, 5])
 embeddding size: torch.Size([2, 5, 5])
 gru hidden output: torch.Size([1, 2, 5])
 fc output: torch.Size([2, 5, 18])
> [0;32m<ipython-input-42-74fa70e5fbdc>[0m(18)[0;36mtrain[0;34m()[0m
[0;32m     17 [0;31m        [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 18 [0;31m        [0mtmp[0m [0;34m=[0m [0moutput[0m[0;34m.[0m[0mview[0m[0;34m([0m[0;34m-[0m[0;36m1[0m[0;34m,[0m [0mCOUNTRY_LENGTH[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m        [0mloss[0m [0;34m=[0m [0mcriterion[0m[0;34m([0m[0mtmp[0m[0;34m,[0m [0mtarget[0m[0;34m)[0m[0;34m[0m[0m
[0m
ipdb> output
tensor([[[ 3.1571e-01,  4.3976e-04,  2.6814e-02,  2.3949e-01,  5.0734e-01,
          -3.2914e-01, -1.1811e-01, -5.1694e-01,  3.0503e-01, -2.9958e-01,
           2.1942e-01, -2.6824e-01,  3.7333e-01,  5.2506e-01, -2.1306e-01,
           1.3651e-02, -5.852

BdbQuit: 