In [128]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Data Preparation

<img src='lesson13_data.png'>

In [62]:
def str2ascii_arr(name):
    """Encode a string as a list of character codes.

    Each character is converted with ``ord``, so the codes are Unicode code
    points; they stay inside 0-255 only when ``name`` contains nothing
    beyond Latin-1 characters.

    Args:
        name: The string to encode.

    Returns:
        A ``(codes, length)`` tuple: the list of integer codes and how many
        there are.
    """
    codes = list(map(ord, name))
    return codes, len(codes)

In [87]:
class RNNClassifier(nn.Module):
    """Character-level classifier: embedding -> GRU -> linear layer.

    The whole (padded) sequence is run through the GRU in one call and the
    final hidden state is fed to the linear layer. Padding positions are not
    masked or packed here; they are embedded and processed like any other
    character code.
    """

    def __init__(self, input_size=256, hidden_size=256, output_size=18, n_layers=1, verbose=True):
        """
        Args:
            input_size: Number of distinct character codes the embedding can
                look up. 256 covers every single-byte character code; larger
                codes would be out of range for the embedding table.
            hidden_size: Width of both the embedding vectors and the GRU
                hidden state. Any positive value works; it does not have to
                equal ``input_size``.
            output_size: Number of target classes (18 countries by default).
            n_layers: Number of stacked GRU layers.
            verbose: When True (the default), ``forward`` prints the
                intermediate tensor sizes. Pass False to silence them, e.g.
                inside a training loop.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.verbose = verbose

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        """Classify a batch of encoded names.

        Args:
            input: Long tensor of character codes, shape ``B x S``
                (batch, sequence length).

        Returns:
            Tensor of shape ``n_layers x B x output_size``: the linear layer
            applied to the final hidden state of every GRU layer.
        """
        # input = B x S, so size(0) is the batch size.
        batch_size = input.size(0)

        # nn.GRU defaults to sequence-first input: B x S -> S x B.
        input = input.t()
        if self.verbose:
            print(f" input size: {input.size()}")

        # Embedding: S x B -> S x B x hidden_size.
        # The result is deliberately NOT detached: detaching would cut the
        # autograd graph and the embedding weights would never be trained.
        embedded = self.embedding(input)
        if self.verbose:
            print(f" embeddding size: {embedded.size()}")

        hidden = self._init_hidden(batch_size)
        # Only the final hidden state is needed; per-step outputs are dropped.
        _, hidden = self.gru(embedded, hidden)
        if self.verbose:
            print(f" gru hidden output: {hidden.size()}")

        fc_output = self.fc(hidden)
        if self.verbose:
            print(f" fc output: {fc_output.size()}")
        return fc_output

    def _init_hidden(self, batch_size):
        """Return a zero initial hidden state, ``n_layers x batch_size x hidden_size``.

        The tensor is created directly with the right dtype and on the same
        device as the model parameters, instead of being copy-constructed
        from another tensor.
        """
        return torch.zeros(
            self.n_layers,
            batch_size,
            self.hidden_size,
            dtype=torch.float,
            device=self.embedding.weight.device,
        )

In [88]:
# in torch.Size([1, 6]) 'adylov'
# out torch.Size([1, 1, 18]) 18 countries

# Zero padding

<img src='zero_padding.png'>

In [89]:
def pad_sequences(vectorized_seqs, seq_lengths):
    """Right-pad integer sequences with zeros into one long tensor.

    Args:
        vectorized_seqs: List of integer lists, one per sequence.
        seq_lengths: Tensor holding the length of each sequence, in the same
            order as ``vectorized_seqs``.

    Returns:
        Long tensor of shape ``(len(vectorized_seqs), seq_lengths.max())``
        in which row ``i`` starts with sequence ``i`` and is zero afterwards.
    """
    n_rows = len(vectorized_seqs)
    n_cols = seq_lengths.max()
    padded = torch.zeros((n_rows, n_cols), dtype=torch.long)

    row = 0
    for codes, length in zip(vectorized_seqs, seq_lengths):
        # Copy the real codes into the front of the row; the tail stays 0.
        padded[row, :length] = torch.tensor(codes, dtype=torch.long)
        row += 1
    return padded

In [90]:
def make_variables(names):
    """Turn a list of names into a zero-padded long tensor of character codes.

    Every name is encoded with ``str2ascii_arr`` and the resulting code lists
    are padded to a common width by ``pad_sequences``.

    Args:
        names: Iterable of strings.

    Returns:
        The tensor produced by ``pad_sequences``: one row per name.
    """
    code_lists = []
    lengths = []
    for name in names:
        encoded = str2ascii_arr(name)
        code_lists.append(encoded[0])
        lengths.append(encoded[1])
    return pad_sequences(code_lists, torch.tensor(lengths))

In [91]:
# Quick check: two equal-length names, so no padding is needed.
make_variables(['az', 'ab'])

tensor([[ 97, 122],
        [ 97,  98]])

In [97]:
# Smoke test: push one name through an untrained classifier to check tensor shapes.
classifier = RNNClassifier()
arr, _ = str2ascii_arr('adylov')
# Wrapping `arr` in a list adds the batch dimension (batch of one name).
inp = torch.tensor([arr], dtype=torch.long)
out = classifier(inp)
print(f"\nin: {inp.size()}, \nout: {out.size()}")

 input size: torch.Size([6, 1])
 embeddding size: torch.Size([6, 1, 256])
 gru hidden output: torch.Size([1, 1, 256])
 fc output: torch.Size([1, 1, 18])

in: torch.Size([1, 6]), 
out: torch.Size([1, 1, 18])




In [100]:
# Smoke test with a batch: names of different lengths are zero-padded by
# make_variables before being fed to a fresh, untrained classifier.
names = ['adylov', 'solan', 'hard', 'san']
classifier = RNNClassifier()
inputs = make_variables(names)
out = classifier(inputs)
print(f"\nbatch in: {inputs.size()}, \nbatch out: {out.size()}")

 input size: torch.Size([6, 4])
 embeddding size: torch.Size([6, 4, 256])
 gru hidden output: torch.Size([1, 4, 256])
 fc output: torch.Size([1, 4, 18])

batch in: torch.Size([4, 6]), 
batch out: torch.Size([1, 4, 18])




# Dataset

In [169]:
# The CSV files have no header row. The trailing products are presumably the
# row-count factorisations (13374 and 6700, matching the value_counts totals
# below), i.e. which batch sizes divide each split evenly.
trainset = pd.read_csv('names_train.csv', header=None) # 13374 rows = 2 * 9 * 743
testset = pd.read_csv('names_test.csv', header=None) # 6700 rows = 4 * 25 * 67

In [170]:
# Name the two unnamed CSV columns identically in both splits.
headers = ['name', 'country']
trainset.columns = headers
testset.columns = headers

In [171]:
# Distinct countries in order of first appearance in the training data.
countries = list(trainset.country.drop_duplicates())
countries

['Czech',
 'German',
 'Arabic',
 'Japanese',
 'Chinese',
 'Vietnamese',
 'Russian',
 'French',
 'Irish',
 'English',
 'Spanish',
 'Greek',
 'Italian',
 'Portuguese',
 'Scottish',
 'Dutch',
 'Korean',
 'Polish']

In [130]:
# The classes are heavily imbalanced: `Russian` is by far the largest one.
trainset.country.value_counts()

Russian       6272
English       2445
Arabic        1333
Japanese       660
German         482
Italian        472
Czech          346
Spanish        198
Dutch          198
French         184
Chinese        178
Irish          154
Greek          135
Polish          92
Scottish        66
Korean          62
Portuguese      49
Vietnamese      48
Name: country, dtype: int64

In [131]:
# The test set shows the same class imbalance as the training set.
testset.country.value_counts()

Russian       3136
English       1223
Arabic         667
Japanese       331
German         242
Italian        237
Czech          173
Spanish        100
Dutch           99
French          93
Chinese         90
Irish           78
Greek           68
Polish          47
Scottish        34
Korean          32
Vietnamese      25
Portuguese      25
Name: country, dtype: int64

In [154]:
# Positional row access followed by column lookup, the pattern used in NameDataSet.__getitem__.
trainset.iloc[0]['country']

'Czech'

In [165]:
class NameDataSet(Dataset):
    """Dataset of ``(name, country_index)`` pairs read from a header-less CSV.

    The CSV must have exactly two columns: the name and its country.
    """

    def __init__(self, filename='names_train.csv', countries=None):
        """
        Args:
            filename: Path of the CSV file to load. This argument is now
                honoured; previously the training file was read no matter
                what was passed.
            countries: Optional ordered list of country labels defining the
                label indices. When omitted, the labels are the distinct
                countries of ``filename`` in order of first appearance. Pass
                the training dataset's ``countries`` when loading another
                split so that both splits share the same label indices.
        """
        trainset = pd.read_csv(filename, header=None)
        trainset.columns = ['name', 'country']
        if countries is None:
            countries = list(trainset.country.drop_duplicates())
        else:
            countries = list(countries)

        # Attribute names are kept for backward compatibility, even though
        # `trainset` may hold any split.
        self.trainset = trainset
        self.countries = countries
        self.len = len(trainset)

        # Precomputed label lookup so __getitem__ does not scan the list on
        # every access. setdefault keeps the first index of a repeated label,
        # matching list.index semantics.
        self._country_to_index = {}
        for position, label in enumerate(countries):
            self._country_to_index.setdefault(label, position)

    def __getitem__(self, index):
        """Return ``(name, country_index)`` for the row at position ``index``.

        Raises:
            KeyError: If the row's country is not among ``self.countries``
                (only possible when a custom ``countries`` list was given).
        """
        row = self.trainset.iloc[index]
        return row['name'], self._country_to_index[row['country']]

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return self.len
        

In [172]:
# Dataset wrappers consumed by the DataLoaders: one built with the default
# filename, one requesting 'names_test.csv'.
train_dataset = NameDataSet()
test_dataset = NameDataSet('names_test.csv')

In [177]:
# A label's class index is its position in the dataset's `countries` list.
train_dataset.countries.index('Czech')

0

In [179]:
# Training batches are reshuffled each epoch; the test loader keeps file order.
# Both loaders use two worker processes.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, num_workers=2, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=4, num_workers=2)