# Char Level RNN Classification

In this notebook, we will use RNN to do classifications

### Data Acquisition and Preprocessing

In [8]:
from utils import download_file_from_url,extract_zip

download_file_from_url("https://download.pytorch.org/tutorial/data.zip","./name-data.zip")
extract_zip("./name-data.zip","./data/name-data",remove_zip=True)
# I also added Turkish.txt to data/name-data 
# from https://gist.github.com/yasinkuyu/c5d18a3c2221f6ac75dc0b8fd81317e9

name-data.zip: 100%|██████████| 2.75M/2.75M [00:00<00:00, 4.62MB/s]
Extracting name-data.zip: 100%|██████████| 21/21 [00:00<00:00, 982.31files/s]


'/Users/huseyin/Codes/jarvis.dl/data/name-data/data'

In [72]:
import os
names_folder_path = './data/name-data/data/names'

data = {}
for filename in os.listdir(names_folder_path):
    # Get the full path
    full_path = os.path.join(names_folder_path, filename)
    country = filename.split('.')[0]
    with open(full_path, 'r') as f:
        lines = f.readlines()
    data[country] = [name.strip() for name in lines]        


In [73]:
# We need to convert dataset to torch dataset
classes = list(data.keys())
class_idx = {classes[i]:i for i in range(len(classes))}
idx_class = {i:classes[i] for i in range(len(classes))}


import string
import unicodedata

# Let's define vocabulary we will use char-rnn
allowed_characters = string.ascii_letters + " .,;'" + "_"
n_letters = len(allowed_characters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in allowed_characters
    )

In [74]:
def letterToIndex(letter):
    # return our out-of-vocabulary character if we encounter a letter unknown to our model
    if letter not in allowed_characters:
        return allowed_characters.find("_")
    else:
        return allowed_characters.find(letter)

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [75]:
# Dataset Implementation
# Dataset must implement three functions: __init__, __len__, and __getitem__
from torch.utils.data import Dataset
import torch
class NamesDataset(Dataset):
    def __init__(self, data, label_to_idx):
        # Data is a dictionary of country names and their corresponding names
        self.data = []
        self.data_tensors = []
        self.labels = []
        self.labels_tensors = []
        
        for country, names in data.items():
            for name in names:
                self.data.append(name)
                self.labels.append(country)
                self.data_tensors.append(lineToTensor(name))
                self.labels_tensors.append(torch.tensor(label_to_idx[country]))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data_tensors[idx], self.labels_tensors[idx], self.data[idx], self.labels[idx]

In [76]:
alldata = NamesDataset(data, class_idx)
alldata[0]

(tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0.]]]),
 tensor(0),
 'Abl',
 'Czech')

In [77]:
train_set, test_set = torch.utils.data.random_split(alldata, [.85, .15], generator=torch.Generator().manual_seed(2024))
len(train_set), len(test_set)

(17438, 3077)

In [78]:
class CharRNN(torch.nn.Module):
    def __init__(self,inp_size,hidden_size,out_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = torch.nn.RNN(inp_size,hidden_size)
        self.linear = torch.nn.Linear(hidden_size,out_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)
    
    def forward(self, line_tensor):
        output, hidden = self.rnn(line_tensor)
        output = self.linear(hidden[0])
        output = self.softmax(output)
        return output, hidden

In [79]:
n_hidden = 128
rnn = CharRNN(n_letters, n_hidden, len(data.keys()))
print(rnn)

CharRNN(
  (rnn): RNN(58, 128)
  (linear): Linear(in_features=128, out_features=19, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [80]:
def label_from_output(output, output_labels):
    top_n, top_i = output.topk(1)
    label_i = top_i[0].item()
    return output_labels[label_i], label_i

def convert_label_to_tensor(label,category_length):
    tensor = torch.zeros(category_length)
    tensor[label] = 1
    return tensor
    


input = lineToTensor('Albert')
print(input.shape)
output = rnn(input) #this is equivalent to ``output = rnn.forward(input)``
print(output)
print(label_from_output(output[0], list(data.keys())))

torch.Size([6, 1, 58])
(tensor([[-2.9313, -2.9050, -3.0032, -2.7818, -2.9181, -2.9322, -3.1087, -2.9764,
         -2.8857, -2.8481, -2.9870, -3.0464, -3.0157, -2.8794, -3.1065, -2.9592,
         -2.9710, -2.9164, -2.8385]], grad_fn=<LogSoftmaxBackward0>), tensor([[[-0.0049,  0.0354,  0.0864, -0.0963,  0.0417,  0.0207,  0.0479,
          -0.1172,  0.0228, -0.1967, -0.0769,  0.0298,  0.1916,  0.0879,
          -0.0467, -0.0052,  0.1995,  0.0762, -0.0122,  0.0893, -0.0424,
          -0.0636,  0.1014, -0.2039,  0.0206, -0.1846,  0.0446, -0.1265,
          -0.1687, -0.2217, -0.0798, -0.1206,  0.1315,  0.0533, -0.1355,
          -0.0836, -0.0971,  0.0991, -0.0415,  0.0306,  0.0632, -0.0645,
          -0.0392, -0.0295,  0.1577, -0.0911,  0.1218,  0.0046, -0.0860,
          -0.0955,  0.0411,  0.0951, -0.0277, -0.0422,  0.1887, -0.0137,
           0.0690, -0.2074,  0.0610, -0.0022, -0.0654,  0.0979, -0.1816,
           0.1408,  0.0958, -0.1714,  0.0281, -0.0323, -0.0069, -0.1634,
          -0.1

In [81]:
import numpy as np
from tqdm import tqdm
def train(rnn,dataset,criterion = torch.nn.CrossEntropyLoss(),n_epoch = 10, lr = 0.01, batch_size = 64):
    optimizer = torch.optim.SGD(rnn.parameters(),lr)

    all_losses = []
    indices = np.arange(len(dataset))
    np.random.shuffle(indices)
    batches = np.array_split(indices, 101//20)
    
    rnn.train()
    for i in tqdm(range(1,n_epoch+1)):

        current_loss = 0
        rnn.zero_grad()
        for batch in batches:
            batch_loss = torch.tensor(0.0,requires_grad=True)
            for i in batch:
                (data_tensor,label_tensor,data_text,label_text) = dataset[i]
                output = rnn(data_tensor)
                loss = criterion(output[0],convert_label_to_tensor(label_tensor.item(),len(classes)).reshape(1,-1))
                batch_loss = batch_loss + loss.item()
            
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(rnn.parameters(), 3)
            optimizer.step()
            optimizer.zero_grad()
            current_loss += batch_loss
        
        all_losses.append(current_loss/(len(batches) * batch_size))
        print("Epoch: ",i,"Loss: ",current_loss/(len(batches) * batch_size))
    
    return all_losses
        


In [82]:
import time
start = time.time()
all_losses = train(rnn, train_set)
end = time.time()
print(f"training took {end-start}s")

 10%|█         | 1/10 [00:01<00:15,  1.70s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


 20%|██        | 2/10 [00:03<00:13,  1.71s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


 30%|███       | 3/10 [00:05<00:11,  1.70s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


 40%|████      | 4/10 [00:06<00:10,  1.67s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


 50%|█████     | 5/10 [00:08<00:08,  1.66s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


 60%|██████    | 6/10 [00:09<00:06,  1.64s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


 70%|███████   | 7/10 [00:11<00:04,  1.63s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


 80%|████████  | 8/10 [00:13<00:03,  1.62s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


 90%|█████████ | 9/10 [00:14<00:01,  1.62s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)


100%|██████████| 10/10 [00:16<00:00,  1.65s/it]

Epoch:  7461 Loss:  tensor(160.8722, grad_fn=<DivBackward0>)
training took 16.477602005004883s



