# Implement name classifier
1. With `GPU`
2. With `data parallel`
3. Use `pad-pack`

In [84]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from utils import ImbalancedDatasetSampler

torch.manual_seed(1249583)

# See the details in `Dataset` section
SEQUENCE_LENGTH = 19
COUNTRY_LENGTH = 18

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

In [30]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def train(model, device, train_loader, optimizer, epoch, criterion):
    """
    This function has one line different from the ordinary `train()` function
    It has `make_variables()` to convert tuple of names to be a tensor
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        import ipdb; ipdb.set_trace()
        # Do not forget to convert the tuple of string to a tensor
        data = make_variables(data)
        
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        tmp = output.view(-1, COUNTRY_LENGTH)
        loss = criterion(tmp, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 1000 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    y_test = []
    y_pred = []
    with torch.no_grad():
        for data, target in tqdm(test_loader):
            data = make_variables(data)
            data, target = data.to(device), target.to(device)
            output = model(data)
            tmp = output.view(-1, COUNTRY_LENGTH)
            
            test_loss += criterion(tmp, target).item() # sum up batch loss
            pred = tmp.max(1, keepdim=True)[1] # get the index of the max log-probability

            pred_tmp = pred.view(-1)
            pred_list = pred_tmp.tolist()
            target_list = target.tolist()
            
            y_test += target_list
            y_pred += pred_list
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    # Confusion matrix
    confusion_mtx = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(confusion_mtx, classes=countries, normalize=True,
                          title='Confusion matrix')


In [5]:
def str2ascii_arr(name):
    """
    0-255
    """
    arr = [ord(c) for c in name]
    return arr, len(arr)

In [88]:
def make_var(names):
    ans = []
    for name in names:
        name = name.lower()
        tmp = [ord(chr) for chr in name]
        tmp = torch.tensor(tmp, dtype=torch.long)
        ans.append(tmp)
    return torch.nn.utils.rnn.pad_sequence(ans, batch_first=True)

In [186]:
def count_non_zero_length(aaa):
    """
    aaa = [[116, 105, 114,  97, 115],
         [ 97, 110, 110,   0,   0],
         [101, 108,   0,   0,   0]]
    ans -> [5, 3, 2]
    """
    bbb = []
    for item in aaa:
        counting = 0
        for element in item:
            if element != 0:
                counting += 1
        bbb.append(counting)
    return bbb

In [189]:
def ordered_batch(names: typing.List[str]) -> (torch.tensor, typing.List[int]):
    dummy = make_var(sorted(names, key=len, reverse=True))    
    lengths = count_non_zero_length(dummy)
    return dummy, lengths

In [232]:
tmp = ordered_batch(['Tiras', 'Ann', 'El'])
tmp

(tensor([[116, 105, 114,  97, 115],
         [ 97, 110, 110,   0,   0],
         [101, 108,   0,   0,   0]]), [5, 3, 2])

In [233]:
emb = nn.Embedding(128, 5)

In [234]:
embedding = emb(tmp[0])
embedding

tensor([[[ 0.0691,  0.8688, -0.8318, -0.3382,  1.5913],
         [ 0.0824, -1.3893, -0.3919,  0.0882,  2.4386],
         [ 0.6078,  1.4944, -0.0480, -1.1164, -0.9413],
         [ 0.2789,  0.3773, -0.8447, -0.6797,  0.8310],
         [-0.4380, -0.1014,  0.3501,  0.3039,  0.4968]],

        [[ 0.2789,  0.3773, -0.8447, -0.6797,  0.8310],
         [-1.1416, -2.0250,  1.6152, -0.8831, -0.3554],
         [-1.1416, -2.0250,  1.6152, -0.8831, -0.3554],
         [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631],
         [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631]],

        [[ 0.7937,  0.2086,  1.3902,  0.3604,  0.5389],
         [-0.2067,  0.2184,  0.5462, -0.4819,  1.3783],
         [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631],
         [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631],
         [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631]]],
       grad_fn=<EmbeddingBackward>)

In [235]:
embedding.size()

torch.Size([3, 5, 5])

In [236]:
from pprint import pprint

pps_in = torch.nn.utils.rnn.pack_padded_sequence(embedding, batch_first=True, lengths=[5,4,3])
pps_in

PackedSequence(data=tensor([[ 0.0691,  0.8688, -0.8318, -0.3382,  1.5913],
        [ 0.2789,  0.3773, -0.8447, -0.6797,  0.8310],
        [ 0.7937,  0.2086,  1.3902,  0.3604,  0.5389],
        [ 0.0824, -1.3893, -0.3919,  0.0882,  2.4386],
        [-1.1416, -2.0250,  1.6152, -0.8831, -0.3554],
        [-0.2067,  0.2184,  0.5462, -0.4819,  1.3783],
        [ 0.6078,  1.4944, -0.0480, -1.1164, -0.9413],
        [-1.1416, -2.0250,  1.6152, -0.8831, -0.3554],
        [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631],
        [ 0.2789,  0.3773, -0.8447, -0.6797,  0.8310],
        [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631],
        [-0.4380, -0.1014,  0.3501,  0.3039,  0.4968]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([3, 3, 3, 2, 1]))

In [237]:
pps_out = torch.nn.utils.rnn.pad_packed_sequence(pps_in, batch_first=True)
pps_out

(tensor([[[ 0.0691,  0.8688, -0.8318, -0.3382,  1.5913],
          [ 0.0824, -1.3893, -0.3919,  0.0882,  2.4386],
          [ 0.6078,  1.4944, -0.0480, -1.1164, -0.9413],
          [ 0.2789,  0.3773, -0.8447, -0.6797,  0.8310],
          [-0.4380, -0.1014,  0.3501,  0.3039,  0.4968]],
 
         [[ 0.2789,  0.3773, -0.8447, -0.6797,  0.8310],
          [-1.1416, -2.0250,  1.6152, -0.8831, -0.3554],
          [-1.1416, -2.0250,  1.6152, -0.8831, -0.3554],
          [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
 
         [[ 0.7937,  0.2086,  1.3902,  0.3604,  0.5389],
          [-0.2067,  0.2184,  0.5462, -0.4819,  1.3783],
          [-0.9129, -1.9566,  0.2719,  1.5438,  0.5631],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
        grad_fn=<TransposeBackward0>), tensor([5, 4, 3]))

In [239]:
classifier = ModelX()
out = classifier(pps_in)

AttributeError: 'PackedSequence' object has no attribute 'size'

In [240]:
class ModelX(nn.Module):
    def __init__(self, input_size=256, hidden_size=5, output_size=18, n_layers=1):
        """
        Because word embedding is working with ascii. It has to use `input_size=128, hidden_size=256`
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # input_size 256, hidden_size 256.
        # https://python-reference.readthedocs.io/en/latest/docs/str/ASCII.html
        self.embedding = nn.Embedding(128, hidden_size) # embedding_dim MUST matches with GRU's input_size 
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input):
        """
        Do not remove `print`. Leave it be a historical footprint for I myself in the future
        """
        
        # Sung Kim run this all at once (over the whole input sequence)
        # input = B x S . size(0) = B
        batch_size = input.size(0)
        
        # input: B x S -- (transpose) --> S x B
        input = input.t()
        
        # Embedding S x B -> S x B x I (embedding size)
        print(f" input size: {input.size()}")
        embedded = self.embedding(input)
        embedded = embedded.clone().detach() # Make new tensor because of `EmbeddingGrad`
        print(f" embeddding size: {embedded.size()}")
        
        # Make a hidden
        hidden = self._init_hidden(batch_size)
        
        output, hidden = self.gru(embedded, hidden)
        print(f" gru hidden output: {hidden.size()}")
        
        # Use last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(hidden)
        print(f" fc output: {fc_output.size()}")
        return fc_output
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        USE_CUDA = torch.cuda.is_available()
        DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
        return hidden.clone().detach().to(DEVICE)

## Try `pack_padded_sequence`

In [142]:
classifier = RNNClassifier()
out = classifier(pps)

AttributeError: 'PackedSequence' object has no attribute 'size'

In [242]:
# classifier = RNNClassifier()
classifier = ModelX()
arr, _ = str2ascii_arr('adylov')
inp = torch.tensor([arr], dtype=torch.long)
out = classifier(inp)
print(f"\nin: {inp.size()}, \nout: {out.size()}")

 input size: torch.Size([6, 1])
 embeddding size: torch.Size([6, 1, 5])
 gru hidden output: torch.Size([1, 1, 5])
 fc output: torch.Size([1, 1, 18])

in: torch.Size([1, 6]), 
out: torch.Size([1, 1, 18])


In [243]:
names = ['adylov', 'solan', 'hard', 'san']
# classifier = RNNClassifier()
classifier = ModelX()
inputs = make_var(names)
out = classifier(inputs)
print(f"\nbatch in: {inputs.size()}, \nbatch out: {out.size()}")

 input size: torch.Size([6, 4])
 embeddding size: torch.Size([6, 4, 5])
 gru hidden output: torch.Size([1, 4, 5])
 fc output: torch.Size([1, 4, 18])

batch in: torch.Size([4, 6]), 
batch out: torch.Size([1, 4, 18])


In [110]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size=256, hidden_size=256, output_size=18, n_layers=1):
        """
        Because word embedding is working with ascii. It has to use `input_size=128, hidden_size=256`
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # input_size 256, hidden_size 256.
        # https://python-reference.readthedocs.io/en/latest/docs/str/ASCII.html
        self.embedding = nn.Embedding(128, hidden_size) # embedding_dim MUST matches with GRU's input_size 
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input):
        """
        Do not remove `print`. Leave it be a historical footprint for I myself in the future
        """
        
        # Sung Kim run this all at once (over the whole input sequence)
        # input = B x S . size(0) = B
        batch_size = input.size(0)
        
        # input: B x S -- (transpose) --> S x B
        input = input.t()
        
        # Embedding S x B -> S x B x I (embedding size)
        print(f" input size: {input.size()}")
        embedded = self.embedding(input)
        embedded = embedded.clone().detach() # Make new tensor because of `EmbeddingGrad`
        print(f" embeddding size: {embedded.size()}")
        
        # Make a hidden
        hidden = self._init_hidden(batch_size)
        output, hidden = self.gru(embedded, hidden)
        print(f" gru hidden output: {hidden.size()}")
        
        # Use last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(hidden)
        print(f" fc output: {fc_output.size()}")
        return fc_output
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        USE_CUDA = torch.cuda.is_available()
        DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
        return hidden.clone().detach().to(DEVICE)

# Dataset

In [23]:
class NameDataSet(Dataset):
    def __init__(self, filename='names_train.csv'):
        trainset = pd.read_csv(filename, header=None)
        trainset.columns = ['name', 'country']
        countries = sorted(list(trainset.country.drop_duplicates()))

        self.trainset = trainset
        self.countries = countries
        self.len = len(trainset)        

    def __getitem__(self, index):
        country = self.trainset.iloc[index]['country']
        return self.trainset.iloc[index]['name'], self.countries.index(country)

    def __len__(self):
        return self.len
        

In [24]:
train_dataset = NameDataSet(filename='../lessons/names_train.csv')
test_dataset = NameDataSet(filename='../lessons/names_test.csv')

In [25]:
train_loader = DataLoader(dataset=train_dataset, sampler=ImbalancedDatasetSampler(train_dataset), batch_size=2, num_workers=2) # 2 * 9 * 743 
test_loader = DataLoader(dataset=test_dataset, sampler=ImbalancedDatasetSampler(test_dataset), batch_size=2, num_workers=2) # 4 * 25 * 67


# 1. Model

In [26]:
model = RNNClassifier().to(DEVICE)

# 2. Criterion & Loss

In [28]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [32]:
for epoch in tqdm(range(1, 1 + 1)):
    train(model, DEVICE, train_loader, optimizer, epoch, criterion)
    test(model, DEVICE, test_loader, criterion)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

> [0;32m<ipython-input-30-9e3556e0b15a>[0m(15)[0;36mtrain[0;34m()[0m
[0;32m     14 [0;31m        [0;31m# Do not forget to convert the tuple of string to a tensor[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 15 [0;31m        [0mdata[0m [0;34m=[0m [0mmake_variables[0m[0;34m([0m[0mdata[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m[0;34m[0m[0m
[0m
ipdb> data
('Miller', 'Cai')
ipdb> target
tensor([15,  1])
ipdb> q


BdbQuit: 