## Import Libraries

In [41]:
# torch
import torch
from torch import nn
from torch.optim import Adam

# visualization
import matplotlib.pyplot as plt

# data processing
import numpy as np
import string
import unicodedata
import random

# utils
import os
from tqdm import tqdm

In [162]:
# constant
BATCH_SIZE = 64
EPOCH = 10000
LEARNING_RATE = 0.001
TEST_SIZE = 0.2
DATA_PATH = './data/names/'
ALL_LETTERS = string.ascii_letters + '.,;'
DEVICE = torch.device('cuda' if torch.cuda.is_available else 'cpu')

## Data Preparation

In [43]:
# data loader
def load_data(path):

    def to_ascii(text):
        return ''.join(c for c in unicodedata.normalize('NFD', text) \
            if unicodedata.category(c) != 'Mn' and c in ALL_LETTERS)

    files = os.listdir(path)
    names, labels = [], []   # declare empty list for X and y
    all_labels = []

    for f in files:
        label = f.split('.')[0]
        all_labels.append(label)

        with open(path + f, encoding='utf-8') as r:
            lines = r.read().strip().split('\n')    # read each record
            
            names.extend([to_ascii(l) for l in lines])
            labels.extend([label for _ in range(len(lines))])    

    # convert to numpy
    names = np.array(names)
    labels = np.array(labels)

    # shuffle data
    index = np.arange(len(labels))
    np.random.shuffle(index)
    names = names[index]
    labels = labels[index]

    return names, labels, all_labels

In [44]:
X, y, all_labels = load_data(DATA_PATH)
X[:5], y[:5]

(array(['Kouma', 'Makhmudov', 'Kabyshev', 'Zimitsky', 'Spyridis'],
       dtype='<U19'),
 array(['English', 'Russian', 'Russian', 'Russian', 'Greek'], dtype='<U10'))

In [165]:
# data encoder
def onehot_encode(data, device=None):

    def _char_encode(char):
        return ALL_LETTERS.index(char)
    
    def _line_encode(line):
        return np.array([_char_encode(c) for c in line])

    def single_onehot_encode(s, device=None):
        onehot = torch.zeros((len(s), len(ALL_LETTERS)), dtype=torch.float32, device=device) \
            if device else torch.zeros((len(s), len(ALL_LETTERS)), dtype=torch.float32)
        
        for i,char in enumerate(s):
            onehot[i][_char_encode(char)] = 1
                        
        return onehot

    
    # all_onehot = []
    # for d in data:
    #     onehot = single_onehot_encode(d)        
    #     all_onehot.append(onehot)

    return single_onehot_encode(data, device)

In [166]:
enc = onehot_encode('Affandy')
enc

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

## Batch

In [112]:
def train_test_split(x, y, test_size=.2):
    npx = np.array(x)
    npy = np.array(y)
    # get test index
    test_id = np.random.choice(range(len(y)), int(test_size * len(y)))
    # test data
    x_test = npx[np.isin(np.arange(len(y)), test_id)]
    y_test = npy[np.isin(np.arange(len(y)), test_id)]
    # train data
    x_train = npx[~np.isin(np.arange(len(y)), test_id)]
    y_train = npy[~np.isin(np.arange(len(y)), test_id)]
    
    return zip(x_train, y_train), zip(x_test, y_test)

def train_test_split_index(x, y, test_size=.2):
    return np.random.choice(range(len(y)), int(test_size * len(y)))

def get_batches(arr, batch_size):

    feature, label = arr
    # feature = onehot_encode(feature)
    # label = torch.tensor([all_labels.index()])

    for i in range(0, len(feature), batch_size):
        x = feature[i:i+batch_size]
        y = label[i:i+batch_size]

        yield x, y

In [53]:
train, test = train_test_split(X, y, test_size=.2)

In [37]:
batch = get_batches(train, 64)

In [40]:
tx, ty = next(batch)
type(tx[:5]), type(ty[:5])

(list, numpy.ndarray)

## Utils

In [59]:
def output_to_label(output):
    label_id = torch.argmax(output).item()
    return all_labels[label_id]

## RNN Network

In [167]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        # from input layer to hidden layer
        self.layer_hidden = nn.Linear(input_size + hidden_size, hidden_size)
        # from input layer to output layer
        self.layer_output = nn.Linear(input_size + hidden_size, output_size)
        # softmax activation since we want multiclass classification
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor, hidden_tensor):
        # combine input and hidden
        combined = torch.cat((input_tensor, hidden_tensor), dim=1)

        # forward pass to hidden layer
        hidden = self.layer_hidden(combined)
        # forward pass to output layer
        output = self.layer_output(combined)
        # apply softmax to output
        output = self.softmax(output)

        return output, hidden

    def init_hidden(self, device=None):
        # hidden state weight initialization        
        return torch.zeros((1, self.hidden_size), device=device) if device else torch.zeros(1, self.hidden_size)

## Training

In [177]:
def training(model, train_data, val_data, device, epochs=1000, learning_rate=0.005, print_every=100):    
    # set model to train mode
    model.train()
    # define optimizer
    optimizer = Adam(model.parameters(), lr=learning_rate)
    # define loss function
    criterion = nn.NLLLoss()

    # move to device (CPU/GPU)
    model.to(device)

    # define loss and acc
    train_loss, val_loss = [], []
    train_acc, val_acc = [], []

    # validation loss minimum
    val_loss_min = np.Inf

    for e in tqdm(range(epochs), ncols=75, desc='Training'):

        ####################
        # Training Section #
        ####################

        # define local loss and acc
        tloss = 0
        tacc = []

        # assume we're using batch_size=1
        for feature, target in train_data:
            # init hidden state
            hidden = model.init_hidden(device)      

            # transform dataset
            feature = onehot_encode(feature, device)
            target = torch.tensor([all_labels.index(target)], dtype=torch.long, device=device)

            # move to device
            # feature, target = feature.to(device), target.to(device)            
            # reset optimizer
            optimizer.zero_grad()
            
            # forward pass for each character sequences
            for row in feature:
                # add dimension to feature (in order to concat with hidden tensor)
                row = row.view(1,-1)
                output, hidden = model(row, hidden)
            
            # compute loss
            loss = criterion(output, target)
            # record loss
            tloss += loss.item()

            # compute accuracy
            predicted = output_to_label(output)
            tacc.append(True if predicted == target else False)

            # backpropagation
            loss.backward()
            # update weights
            optimizer.step()

        # record to global loss and accuracy
        train_loss.append(tloss/len(train_data))
        train_acc.append(np.mean(tacc))


        ######################
        # Validation Section #
        ######################

        # define local loss and acc
        vloss = 0
        vacc = []

        # set model to eval mode
        model.eval()

        # we turned off gradient on validation mode
        with torch.no_grad():
            # assume batch_size=1
            for feature, target in val_data:
                # init hidden state
                hidden = model.init_hidden(device)      

                # transform dataset
                feature = onehot_encode(feature, device)
                target = torch.tensor([all_labels.index(target)], dtype=torch.long, device=device)

                # move to device
                feature, target = feature.to(device), target.to(device)            
                # reset optimizer
                optimizer.zero_grad()
                
                # forward pass for each character sequences
                for row in feature:
                    output, hidden = model(row, hidden)

                # compute loss
                loss = criterion(output, target)
                # record loss
                vloss += loss.item()
                
                # compute accuracy
                predicted = output_to_label(output)
                vacc.append(True if predicted == target else False)

            # record to global loss and accuracy
            val_loss.append(vloss/len(val_data))
            val_acc.append(np.mean(vacc))
        
        # reset model to train mode
        model.train()

        # print epoch
        if (e+1) % print_every == 0:
            print(f"Loss: {tloss/len(train_data)} - Val Loss: {vloss/len(val_data)} Acc: {np.mean(vacc)}")
                
        # save model if validation loss decrease
        if vloss/len(val_data) <= val_loss_min:            
            torch.save(model.state_dict(), 'rnn.pt')
        else:
            print(f'Validation loss not improving ({val_loss_min:.6f} --> {vloss/len(val_data)})')

    return train_loss, val_loss, train_acc, val_acc

In [178]:
# load dataset
X, y, all_labels = load_data(DATA_PATH)
# split into train and test
train, test = train_test_split(X, y)
# split into train and val with 8:2 ratio
train, val = train_test_split(X, y, .2)

# define network
hidden_size = 128
rnn = RNN(len(ALL_LETTERS), hidden_size, len(all_labels))

In [180]:
# train
training(rnn, train, val, DEVICE)

Training:   0%|                                   | 0/1000 [01:21<?, ?it/s]


TypeError: object of type 'zip' has no len()