## [NLP FROM SCRATCH: CLASSIFYING NAMES WITH A CHARACTER-LEVEL RNN](https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#nlp-from-scratch-classifying-names-with-a-character-level-rnn)

##### We will be building and training a basic character-level RNN to classify words. This tutorial, along with the following two, shows how to preprocess data for NLP modeling “from scratch”, in particular not using many of the convenience functions of torchtext, so you can see how preprocessing for NLP modeling works at a low level.

##### A character-level RNN reads words as a series of characters - outputting a prediction and “hidden state” at each step, feeding its previous hidden state into each next step. We take the final prediction to be the output, i.e. which class the word belongs to.

#### Specifically, we’ll train on a few thousand surnames from 18 languages of origin, and predict which language a name is from based on the spelling:

In [None]:
from glob import glob

In [None]:
import string

In [None]:
from tqdm import tqdm
import urllib
from zipfile import ZipFile
import os

In [None]:
url = "https://download.pytorch.org/tutorial/data.zip"

In [None]:
# Resolve the home directory portably: os.environ['HOME'] raises KeyError
# when HOME is not set (e.g. on Windows), whereas expanduser still honours
# HOME when it is defined and falls back to the platform default otherwise.
home = os.path.expanduser("~")
# Everything downloaded by this notebook lives under <home>/torch/.
data_dir = f"{home}/torch/"
# Local path of the downloaded archive, named after the last URL component.
tar_file = data_dir + url.split('/')[-1]

In [None]:
class TqdmUpTo(tqdm):
    """Progress bar whose ``update_to`` can be used as a download report hook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar to the absolute position ``b * bsize``.

        b: number of blocks transferred so far.
        bsize: size of each block.
        tsize: total size; when not None it replaces the bar's ``total``.
        """
        if tsize is not None:
            self.total = tsize
        # ``update`` takes an increment, so convert the absolute position
        # into the delta from what the bar has already counted (self.n).
        transferred = b * bsize
        self.update(transferred - self.n)

In [None]:
# Create the data directory (and any missing parents) if it does not exist.
# exist_ok=True avoids the check-then-create race of isdir() + mkdir().
os.makedirs(data_dir, exist_ok=True)

In [None]:
# `import urllib` alone does not load the `urllib.request` submodule, so
# import it explicitly here; otherwise the attribute lookup below can fail
# with AttributeError in a fresh interpreter.
import urllib.request

# Download the archive to tar_file, driving the progress bar through the
# reporthook callback.
with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=tar_file) as t:
    urllib.request.urlretrieve(url=url, filename=tar_file, reporthook=t.update_to)

In [None]:

# Unpack the downloaded archive into data_dir. The handle is named
# `archive` rather than `zip` so the builtin zip() is not shadowed for the
# rest of the notebook session.
with ZipFile(tar_file, "r") as archive:
    archive.extractall(data_dir)

In [None]:
# List every directory below data_dir together with the sub-directories
# and files it contains.
for root, subdirs, filenames in os.walk(data_dir):
    print(root, subdirs, filenames)

In [None]:
glob(data_dir+"data/names/*.txt")

In [None]:
all_letters = string.ascii_letters +" .,;'"

In [None]:
n_letters =  len(all_letters)

In [None]:
n_letters

In [None]:
import unicodedata

In [None]:
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters listed in ``all_letters``.

    The string is NFD-normalized so accented characters split into a base
    character plus combining marks; combining marks (category 'Mn') are
    dropped, and so is any remaining character not found in ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining mark left over from decomposition
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

In [None]:
unicodeToAscii('Ślusàrski')

In [None]:
def readLines(filename):
    """Read a UTF-8 text file and return its lines converted to ASCII.

    Leading/trailing whitespace of the whole file is stripped before it is
    split on newlines; each line is then passed through ``unicodeToAscii``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

In [None]:
category_lines = {}
all_categories = []

In [None]:
def findFiles(path):
    """Return the list of file-system paths matching the glob pattern ``path``."""
    matches = glob(path)
    return matches

In [None]:
# Build the category list and the per-category name lists from the
# extracted files, one category per "<Category>.txt" file.
# - The pattern is derived from data_dir rather than a hard-coded,
#   user-specific absolute path, so it matches where the data was extracted.
# - glob returns matches in arbitrary order; sorting keeps the
#   category -> index mapping reproducible between runs.
for filename in sorted(findFiles(data_dir + 'data/names/*.txt')):
    category = os.path.splitext(os.path.basename(filename))[0]
    # Guard against duplicates when this cell is executed more than once.
    if category not in all_categories:
        all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

In [None]:
category_lines.keys()

In [None]:
n_categories = len(all_categories)

In [None]:
n_categories

_____

To represent a single letter, we use a “one-hot vector” of size <1 x n_letters>. A one-hot vector is filled with 0s except for a 1 at the index of the current letter, e.g. "b" = <0 1 0 0 0 ...>.

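To make a word we stack those one-hot vectors into a tensor of shape <line_length x 1 x n_letters>: effectively a 2D matrix with an extra middle dimension of size 1, so that each step `tensor[i]` is a <1 x n_letters> row that can be fed to the network.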

In [None]:
all_letters.find('x')

In [None]:
def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters`` (-1 if absent)."""
    position = all_letters.find(letter)
    return position

In [None]:
import torch

In [None]:
# Demonstration helper: encode one letter as a <1 x n_letters> one-hot Tensor.
def letterToTensor(letter):
    """Return a <1 x n_letters> tensor of zeros with a 1 at the letter's index."""
    one_hot = torch.zeros(1, n_letters)
    one_hot[0, letterToIndex(letter)] = 1
    return one_hot

In [None]:
letterToTensor('b')

In [None]:
def lineToTensor(line):
    """Encode ``line`` as a <len(line) x 1 x n_letters> tensor of one-hot rows."""
    encoded = torch.zeros(len(line), 1, n_letters)
    for position, char in enumerate(line):
        # Mark the column of this character in the row for its position.
        encoded[position, 0, letterToIndex(char)] = 1
    return encoded

In [None]:
lineToTensor('John')

![rnn](https://i.imgur.com/Z2xbySO.png)

In [None]:
import torch.nn as nn

In [None]:
class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    Each call concatenates the current input with the previous hidden state
    and feeds the result to two linear layers: one producing the next hidden
    state, the other producing log-probabilities over the output classes.
    """

    def __init__(self, in_dim, hidden_dim, out_dim) -> None:
        super().__init__()
        self.hidden_size = hidden_dim
        # Both layers consume the concatenation [input, hidden].
        combined_dim = in_dim + hidden_dim
        self.i2h = nn.Linear(combined_dim, hidden_dim)
        self.i2o = nn.Linear(combined_dim, out_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step; return ``(log_probabilities, next_hidden)``."""
        combined = torch.cat((input, hidden), dim=1)
        log_probs = self.softmax(self.i2o(combined))
        next_hidden = self.i2h(combined)
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape <1 x hidden_size>."""
        return torch.zeros(1, self.hidden_size)

In [None]:
n_letters

In [None]:
n_categories

In [None]:
n_hidden = 128

In [None]:
input = letterToTensor('A')

In [None]:
input.shape

In [None]:
hidden =  torch.zeros(1, n_hidden)

In [None]:
hidden.shape

In [None]:
torch.concat((input, hidden), 1).shape

In [None]:
rnn = RNN(n_letters, n_hidden, n_categories)

In [None]:
out, next_hidden = rnn(input, hidden)

In [None]:
out

In [None]:
next_hidden

In [None]:
input = lineToTensor("Duan")

In [None]:
input

In [None]:
out, nxt_hidden = rnn(input[0], hidden)

In [None]:
out

In [None]:
nxt_hidden

In [None]:
out.topk(1)[1].item()

In [None]:
def categoryFromOutput(out):
    """Map a network output to ``(category_name, category_index)``.

    The index is that of the largest entry in ``out``, looked up in
    ``all_categories``.
    """
    _, best_indices = out.topk(1)
    category_i = best_indices[0].item()
    name = all_categories[category_i]
    return name, category_i

In [None]:
categoryFromOutput(out)

In [None]:
import random

In [None]:
def randomChoice(l):
    """Return a uniformly random element of the sequence ``l``."""
    index = random.randrange(len(l))
    return l[index]

In [None]:
all_categories

In [None]:
randomChoice(all_categories)

In [None]:
randomChoice(category_lines['Dutch'])

In [None]:
def randomTrainingExample():
    """Draw a random (category, name) pair and return it with its tensors.

    Returns ``(category, line, category_tensor, line_tensor)`` where
    ``category_tensor`` holds the category's index in ``all_categories`` as a
    long tensor and ``line_tensor`` is the one-hot encoding of ``line``.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_index = all_categories.index(category)
    category_tensor = torch.tensor([category_index], dtype=torch.long)
    return category, line, category_tensor, lineToTensor(line)

In [None]:
randomTrainingExample()

In [None]:
# Negative log-likelihood loss; it pairs with the LogSoftmax output layer
# of the RNN class.
criterion = nn.NLLLoss()
# Learning rate: the step size applied in the manual parameter update
# performed by train().
lr = 0.005

In [89]:
def train(category_tensor, line_tensor):
    """Run one training step on a single example.

    category_tensor: target passed to ``criterion`` together with the final
        output.
    line_tensor: sequence tensor; ``line_tensor[i]`` is fed to the network at
        step ``i``.

    Returns ``(out, loss_value)``: the network output after the last step and
    the loss as a Python float.

    Raises ValueError if ``line_tensor`` has no steps, since there would be
    no output to compute a loss from.
    """
    if line_tensor.size(0) == 0:
        raise ValueError("line_tensor must contain at least one character")

    hidden = rnn.initHidden()
    rnn.zero_grad()

    # Thread the hidden state through the sequence: each step must receive
    # the hidden state produced by the previous one, otherwise only the last
    # character would influence the output.
    for step_input in line_tensor:
        out, hidden = rnn(step_input, hidden)

    loss = criterion(out, category_tensor)

    loss.backward()

    # Plain SGD step: p <- p - lr * grad.
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-lr)

    return out, loss.item()

In [90]:
import time
import math

In [91]:
# NOTE(review): the loop that consumes these values is not visible in this
# part of the notebook; the descriptions below are inferred from the names.
n_iters = 100000  # presumably the number of training iterations
log_every = 5000  # presumably the interval between progress log lines
plot_every = 1000  # presumably the interval between recorded loss points

In [None]:
# NOTE(review): these are initialised here but used outside the visible part
# of the notebook; presumably a running loss accumulator and the list of
# recorded losses. Confirm against the training loop.
current_loss = 0
all_losses = []