## 1.2

Mini Batch Size

Set `batch_size` to 1000, 2000, and 5000 in turn.

In [21]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path):
    """Return the list of filesystem paths matching the glob pattern `path`."""
    matches = glob.glob(path)
    return matches

import unicodedata
import string

# Characters kept in names: ASCII letters plus a few punctuation marks.
all_letters = "".join([string.ascii_letters, " .,;'"])
# Number of distinct characters, i.e. the width of a one-hot letter vector.
n_letters = len(all_letters)

def unicodeToAscii(s):
    """Reduce a Unicode string to the characters listed in `all_letters`.

    The string is NFD-normalised so accents become separate combining marks;
    combining marks (category 'Mn') and every character that is not in
    `all_letters` are then dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining accent left over from the decomposition
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

# Maps a category (language) name to the list of names read for it.
names = dict()
# Category names in the order they were loaded; positions are used as class indices.
languages = list()

def readLines(filename):
    """Read a UTF-8 text file and return its lines converted by unicodeToAscii.

    The whole content is stripped of leading/trailing whitespace and then
    split on newline characters; each resulting line is passed through
    `unicodeToAscii`.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(filename, encoding='utf-8') as handle:
        lines = handle.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Load every data file: the file name (without extension) is the category and
# its lines are the names belonging to that category.
# glob returns paths in arbitrary, filesystem-dependent order; sorting them
# makes the category -> index assignment in `languages` identical on every run
# and every machine.
for filename in sorted(findFiles('data/names/*.txt')):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

# Number of categories (languages) that were loaded.
n_categories = len(languages)

def findName(dict, name):
    """Return the first key of `dict` whose value contains `name`.

    Keys are visited in the mapping's iteration order. An empty string is
    returned when no value contains `name`.
    """
    for category, members in dict.items():
        if name in members:
            return category
    return ''

import torch

def letterToIndex(letter):
    """Return the position of `letter` in `all_letters`, or -1 if it is absent."""
    position = all_letters.find(letter)
    return position

def letterToTensor(letter):
    """Return a (1, n_letters) tensor of zeros with a 1 at the letter's index."""
    one_hot = torch.zeros(1, n_letters)
    one_hot[0, letterToIndex(letter)] = 1
    return one_hot

def nameToTensor(name):
    """Encode `name` as a (len(name), 1, n_letters) tensor of one-hot rows.

    Row `i` has a single 1 at the index `letterToIndex` returns for the
    i-th character of `name`.
    """
    encoded = torch.zeros(len(name), 1, n_letters)
    for position in range(len(name)):
        encoded[position, 0, letterToIndex(name[position])] = 1
    return encoded

def categoryFromOutput(output):
    """Return (language name, index) for the highest-scoring entry of `output`.

    `output` must have a single top-1 element, since the index is extracted
    with `.item()`.
    """
    _, best = output.topk(1)
    index = best.item()
    return languages[index], index

In [22]:
# create a list of tuples (name, language)

# Flat list of (name, language) samples, filled by createTuple().
listOfTuples = []
# Alias for the category -> names mapping that createTuple() reads.
dictNames = names
count = 0


def createTuple():
    """Append a (name, language) tuple to listOfTuples for every name in dictNames."""
    for lang in dictNames:
        listOfTuples.extend((n, lang) for n in dictNames[lang])


createTuple()


In [23]:
# helper that returns the length of the longest name in a list of (name, language) tuples

import numpy as np

def maxlength(data):
    """Return the length of the longest first element among the items of `data`.

    Parameters:
        data: sequence of indexable items (here `(name, language)` tuples);
            only `item[0]` is inspected.

    Returns:
        The largest `len(item[0])`, or 0 when `data` is empty (previously an
        empty sequence raised IndexError).
    """
    return max((len(item[0]) for item in data), default=0)


In [24]:
import numpy as np

def getBatchOfData(batch_size, data):
    """Shuffle `data` and convert it into lists of padded mini-batch tensors.

    Parameters:
        batch_size: number of samples per batch; must be a positive integer.
        data: sequence of `(name, language)` tuples.

    Returns:
        `(bx, by)` where `bx[i]` is a float tensor of shape
        `(batch_size, longest name in batch i, n_letters)` holding the one-hot
        encoded names, zero-padded after the last letter, and `by[i]` is a
        long tensor of shape `(batch_size, n_categories)` holding one-hot
        language labels.

    Only full batches are produced: when `len(data)` is not a multiple of
    `batch_size`, the leftover samples of this shuffle are dropped.

    Raises:
        ValueError: if `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    indices = np.arange(0, len(data))
    np.random.shuffle(indices)

    # Number of shuffled samples that fit into complete batches.
    usable = (len(indices) // batch_size) * batch_size

    bx = []
    by = []
    for start in range(0, usable, batch_size):
        samples = [data[i] for i in indices[start:start + batch_size]]

        BatchOfnameTensor = torch.zeros(batch_size, maxlength(samples), n_letters)
        BatchOflanguageTensor = torch.zeros(batch_size, n_categories, dtype=torch.long)
        for row, sample in enumerate(samples):
            name = sample[0]
            category = sample[1]
            if name:
                # nameToTensor gives (len(name), 1, n_letters); drop the middle
                # axis and copy all timesteps of this name in one assignment.
                BatchOfnameTensor[row, :len(name)] = nameToTensor(name)[:, 0, :]
            BatchOflanguageTensor[row, languages.index(category)] = 1

        bx.append(BatchOfnameTensor)
        by.append(BatchOflanguageTensor)

    return bx, by
    


In [25]:
import torch.nn as nn

class RNN(nn.Module):
    """Recurrent classifier: an nn.RNN followed by a linear output layer.

    Parameters:
        inputsize: number of features per timestep.
        hiddensize: number of hidden units in the recurrent layer(s).
        nlayers: number of stacked recurrent layers.
        outputsize: number of output classes.

    The linear layer is sized from `hiddensize` and `outputsize`; it
    previously hard-coded 128 inputs and read the global `n_categories`,
    so any other hidden size failed and `outputsize` was ignored.
    """

    def __init__(self, inputsize, hiddensize, nlayers, outputsize):
        super(RNN, self).__init__()

        self.rnn = nn.RNN(
            input_size = inputsize,
            hidden_size = hiddensize,         # number of hidden units
            num_layers = nlayers,           # number of layers
            # Inputs and outputs are (batch, seq_len, features). Without
            # batch_first=True the expected layout is (seq_len, batch, features).
            batch_first = True,
        )
        self.out = nn.Linear(hiddensize, outputsize)

    def forward(self, x):
        """Return class scores of shape (batch, outputsize) for input `x`.

        `x` is a (batch, seq_len, inputsize) tensor. The scores are computed
        from the recurrent output at the last timestep.
        NOTE(review): for zero-padded batches the last timestep of a short
        sequence is padding, not its final letter — confirm this is intended.
        """
        r_out, _ = self.rnn(x, None)   # None represents zero initial hidden state
        out = self.out(r_out[:, -1, :])
        return out
    


In [26]:

# Model, optimiser and loss for the language-classification task.
rnn = RNN(n_letters, 128, 1, n_categories)
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.005)
loss_func = nn.CrossEntropyLoss()

epochs = 5
batch_size = 1000 # Set to 1000, 2000, 5000

# %% training and testing
for epoch in range(1, epochs+1):

    # Reshuffle and re-batch the whole data set at the start of every epoch.
    n, l = getBatchOfData(batch_size, listOfTuples)
    if not n:
        # Without this guard the code below would fail on (or silently reuse)
        # b_x / loss left over from a previous run of the cell.
        raise ValueError("no full batch could be built: batch_size exceeds the number of samples")

    for b_x, b_y in zip(n, l):
        labels = torch.max(b_y, 1)[1]                   # one-hot rows -> class indices

        output = rnn(b_x)                               # rnn output
        loss = loss_func(output, labels)                # cross entropy loss
        optimizer.zero_grad()                           # clear gradients for this training step
        loss.backward()                                 # backpropagation, compute gradients
        optimizer.step()                                # apply gradients

    # NOTE: this accuracy is measured on the last *training* batch of the
    # epoch, not on held-out data, even though the label below says "test".
    with torch.no_grad():                               # evaluation needs no autograd graph
        out = rnn(b_x)                                  # (samples, time_step, input_size)
    # No squeeze(): it collapsed a one-sample batch to a 0-d array and broke
    # the accuracy computation.
    pred = out.argmax(dim=1).numpy()
    target = torch.max(b_y, 1)[1].numpy()
    accuracy = (pred == target).mean()

    print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.6f' % accuracy)

print(accuracy)
        


Epoch:  1 | train loss: 1.9302 | test accuracy: 0.453000
Epoch:  2 | train loss: 1.8525 | test accuracy: 0.471000
Epoch:  3 | train loss: 1.8495 | test accuracy: 0.456000
Epoch:  4 | train loss: 1.8849 | test accuracy: 0.454000
Epoch:  5 | train loss: 1.9058 | test accuracy: 0.457000
0.457
