# README
Do not blindly copy and paste. The parameter is hard-fixed with the `dataset`.<br>
For example: `SEQUENCE_LENGTH`

In [53]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1249583)
SEQUENCE_LENGTH = 19 # The longest name is 19. Go down to see the details

# Data Preparation

<img src='lesson13_data.png'>

In [2]:
def str2ascii_arr(name):
    """
    0-255
    """
    arr = [ord(c) for c in name]
    return arr, len(arr)

In [81]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size=256, hidden_size=256, output_size=18, n_layers=1):
        """
        Because word embedding is working with ascii. It has to use `input_size=256, hidden_size=256`
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # input_size 256, hidden_size 256.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input):
        # Sung Kim run this all at once (over the whole input sequence)
        # input = B x S . size(0) = B
        batch_size = input.size(0)
        
        # input: B x S -- (transpose) --> S x B
        input = input.t()
        
        # Embedding S x B -> S x B x I (embedding size)
        print(f" input size: {input.size()}")
        embedded = self.embedding(input)
        embedded = embedded.clone().detach() # Make new tensor because of `EmbeddingGrad`
        print(f" embeddding size: {embedded.size()}")
        
        # Make a hidden
        hidden = self._init_hidden(batch_size)
        output, hidden = self.gru(embedded, hidden)
        print(f" gru hidden output: {hidden.size()}")
        
        # Use last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(hidden)
        print(f" fc output: {fc_output.size()}")
        return fc_output
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return hidden.clone().detach()

In [4]:
# in torch.Size([1, 6]) 'adylov'
# out torch.Size([1, 1, 18]) 18 countries

# Zero padding

<img src='zero_padding.png'>

In [44]:
def pad_sequences(vectorized_seqs, seq_lengths):
    """
    Let the `SEQUENCE_LENGTH` is 19. According to the dataset
    """
    seq_tensor = torch.zeros((len(vectorized_seqs), SEQUENCE_LENGTH), dtype=torch.long)
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.tensor(seq, dtype=torch.long)
    return seq_tensor

In [45]:
def make_variables(names):
    sequence_and_length = [str2ascii_arr(name) for name in names]
    vectorized_seqs = [sl[0] for sl in sequence_and_length]
    seq_lengths = torch.tensor([sl[1] for sl in sequence_and_length])
    return pad_sequences(vectorized_seqs, seq_lengths)

In [46]:
make_variables(['az', 'ab '])

tensor([[ 97, 122,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0],
        [ 97,  98,  32,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0]])

In [47]:
classifier = RNNClassifier()
arr, _ = str2ascii_arr('adylov')
inp = torch.tensor([arr], dtype=torch.long)
out = classifier(inp)
print(f"\nin: {inp.size()}, \nout: {out.size()}")

 input size: torch.Size([6, 1])
 embeddding size: torch.Size([6, 1, 256])
 gru hidden output: torch.Size([1, 1, 256])
 fc output: torch.Size([1, 1, 18])

in: torch.Size([1, 6]), 
out: torch.Size([1, 1, 18])




In [48]:
names = ['adylov', 'solan', 'hard', 'san']
classifier = RNNClassifier()
inputs = make_variables(names)
out = classifier(inputs)
print(f"\nbatch in: {inputs.size()}, \nbatch out: {out.size()}")

 input size: torch.Size([19, 4])
 embeddding size: torch.Size([19, 4, 256])
 gru hidden output: torch.Size([1, 4, 256])
 fc output: torch.Size([1, 4, 18])

batch in: torch.Size([4, 19]), 
batch out: torch.Size([1, 4, 18])




# Utilities

In [62]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    print(cm)
    plt.figure(figsize=(10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        import ipdb; ipdb.set_trace()
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 1000 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    y_test = []
    y_pred = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            y_test.append(int(target))
            y_pred.append(int(pred))
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    # Confusion matrix
    confusion_mtx = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(confusion_mtx, classes=[i for i in range(1, 10 + 1)], normalize=True,
                          title='Normalized confusion matrix')


# Dataset

In [49]:
trainset = pd.read_csv('names_train.csv', header=None)
testset = pd.read_csv('names_test.csv', header=None) 

In [50]:
headers = ['name', 'country']
trainset.columns = headers
testset.columns = headers

In [51]:
countries = list(trainset.country.drop_duplicates())

In [130]:
# Majority of dataset is `Russian`
trainset.country.value_counts()

Russian       6272
English       2445
Arabic        1333
Japanese       660
German         482
Italian        472
Czech          346
Spanish        198
Dutch          198
French         184
Chinese        178
Irish          154
Greek          135
Polish          92
Scottish        66
Korean          62
Portuguese      49
Vietnamese      48
Name: country, dtype: int64

In [131]:
# So as trainset
testset.country.value_counts()

Russian       3136
English       1223
Arabic         667
Japanese       331
German         242
Italian        237
Czech          173
Spanish        100
Dutch           99
French          93
Chinese         90
Irish           78
Greek           68
Polish          47
Scottish        34
Korean          32
Vietnamese      25
Portuguese      25
Name: country, dtype: int64

In [154]:
trainset.iloc[0]['country']

'Czech'

# Find the longest name in the dataset

In [15]:
result = pd.concat([trainset, testset])

In [20]:
result['name_length'] = result.name.apply(lambda x:  len(x))

## Longest name is 19 chars
19 is the `sequence_length`

In [32]:
result['name_length'].max(), result['name_length'].idxmax()

(19, 7925)

In [34]:
result.iloc[7925]

name           ShirinskyShikhmatov
country                    Russian
name_length                     19
Name: 7925, dtype: object

In [54]:
class NameDataSet(Dataset):
    def __init__(self, filename='names_train.csv'):
        trainset = pd.read_csv('names_train.csv', header=None)
        trainset.columns = ['name', 'country']
        countries = list(trainset.country.drop_duplicates())

        self.trainset = trainset
        self.countries = countries
        self.len = len(trainset)        

    def __getitem__(self, index):
        country = self.trainset.iloc[index]['country']
        return self.trainset.iloc[index]['name'], self.countries.index(country)

    def __len__(self):
        return self.len
        

In [55]:
train_dataset = NameDataSet()
test_dataset = NameDataSet('names_test.csv')

In [177]:
train_dataset.countries.index('Czech')

0

In [56]:
train_loader = DataLoader(dataset=train_dataset, batch_size=2, num_workers=2, shuffle=True) # 2 * 9 * 743 
test_loader = DataLoader(dataset=test_dataset, batch_size=2, num_workers=2) # 4 * 25 * 67

# 1. Model

In [58]:
model = RNNClassifier()

# 2. Criterion & Loss
Loss is absorbed in the `test() and train()` already

In [59]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [63]:
for epoch in range(1, 1 + 1):
    train(model, 'cpu', train_loader, optimizer, epoch)
    test(model, 'cpu', test_loader)

> [0;32m<ipython-input-62-7472f32cb077>[0m(38)[0;36mtrain[0;34m()[0m
[0;32m     37 [0;31m        [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 38 [0;31m        [0mdata[0m[0;34m,[0m [0mtarget[0m [0;34m=[0m [0mdata[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m,[0m [0mtarget[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     39 [0;31m        [0moptimizer[0m[0;34m.[0m[0mzero_grad[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m
ipdb> list
[1;32m     33 [0m[0;34m[0m[0m
[1;32m     34 [0m[0;32mdef[0m [0mtrain[0m[0;34m([0m[0mmodel[0m[0;34m,[0m [0mdevice[0m[0;34m,[0m [0mtrain_loader[0m[0;34m,[0m [0moptimizer[0m[0;34m,[0m [0mepoch[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[1;32m     35 [0m    [0mmodel[0m[0;34m.[0m[0mtrain[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[1;32m   



 input size: torch.Size([19, 2])
 embeddding size: torch.Size([19, 2, 256])
 gru hidden output: torch.Size([1, 2, 256])
 fc output: torch.Size([1, 2, 18])
tensor([[[ 9.7740e-02,  1.5666e-01,  1.7878e-01, -1.2598e-01, -1.4794e-01,
           6.2180e-01,  4.0388e-01,  3.5538e-04, -5.3939e-02, -4.1176e-01,
          -2.2481e-01, -2.5967e-01, -4.4660e-01,  1.3092e-01,  1.4512e-01,
          -2.8003e-01, -1.3420e-01,  1.1959e-01],
         [ 1.1031e-01,  1.4857e-01,  1.8466e-01, -1.2092e-01, -1.6279e-01,
           6.2509e-01,  3.8197e-01, -2.0486e-02, -7.2826e-02, -4.1754e-01,
          -2.4247e-01, -2.4385e-01, -4.4459e-01,  1.1956e-01,  1.4070e-01,
          -2.8952e-01, -1.1608e-01,  1.0364e-01]]], grad_fn=<AddBackward0>)
ipdb> q


BdbQuit: 