# 1.1 Changhyun Lee DSC 275

In [81]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path):
    """Return the list of paths that match the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches

import unicodedata
import string

# Alphabet used for the one-hot encoding: lower- and upper-case ASCII letters
# followed by a handful of punctuation characters.
all_letters = string.ascii_lowercase + string.ascii_uppercase + " .,;'"
n_letters = len(all_letters)

def unicodeToAscii(s):
    """Return ``s`` reduced to the characters of ``all_letters``.

    The string is NFD-decomposed first, so accented letters split into a base
    letter plus combining marks; the marks (category ``Mn``) are dropped, as
    is every character that is not in ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from the decomposition.
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

# Category (language) names, in the order their files are loaded.
languages = []
# Maps each category to the list of names read from its file.
names = {}

def readLines(filename):
    """Read a UTF-8 text file and return its lines passed through unicodeToAscii.

    Whitespace at the very start and end of the file is stripped before the
    text is split on newline characters, so a trailing newline does not
    produce an empty last entry.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one converted string per line of the file.
    """
    # The context manager closes the handle as soon as the text is read;
    # previously the file object was never closed explicitly.
    with open(filename, encoding='utf-8') as handle:
        text = handle.read()
    return [unicodeToAscii(line) for line in text.strip().split('\n')]

# Each ``data/names/<category>.txt`` file contributes one category; the file
# name without its extension is the category label.
for filename in findFiles('data/names/*.txt'):
    base = os.path.basename(filename)
    category, _extension = os.path.splitext(base)
    languages.append(category)
    names[category] = readLines(filename)

# Number of categories that were loaded.
n_categories = len(languages)

def findName(dict, name):
    """Return the first key of ``dict`` whose value contains ``name``.

    Keys are examined in the mapping's iteration order. An empty string is
    returned when no value contains ``name``.
    """
    matching_keys = (key for key, members in dict.items() if name in members)
    return next(matching_keys, '')

import torch

def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters``.

    Follows ``str.find``: the result is -1 when ``letter`` does not occur.
    """
    position = all_letters.find(letter)
    return position

def letterToTensor(letter):
    """Return a ``1 x n_letters`` one-hot tensor for ``letter``.

    A character that is not part of ``all_letters`` yields an all-zero row.

    Args:
        letter: The character to encode.

    Returns:
        A float tensor of shape ``(1, n_letters)``.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex reports unknown characters as -1; used as an index that
    # would wrap around and mark the last alphabet entry instead.
    if index >= 0:
        tensor[0][index] = 1
    return tensor

def nameToTensor(name):
    """Return a ``len(name) x 1 x n_letters`` tensor of one-hot letter rows.

    Characters that are not part of ``all_letters`` are encoded as all-zero
    rows.

    Args:
        name: The string to encode, one time step per character.

    Returns:
        A float tensor of shape ``(len(name), 1, n_letters)``.
    """
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        index = letterToIndex(letter)
        # -1 means "not in the alphabet"; indexing with it would silently set
        # the last column, so such characters are skipped.
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor

def categoryFromOutput(output):
    """Return ``(language, index)`` for the highest-scoring entry of ``output``.

    ``output`` is expected to hold the scores of a single sample, because the
    top index is converted with ``.item()``.
    """
    _best_score, best_index = output.topk(1)
    index = best_index.item()
    return languages[index], index

In [82]:
# create a list of tuples (name, language)

listOfTuples = []
dictNames = names
count = 0


def createTuple():
    """Append one ``(name, language)`` tuple to ``listOfTuples`` per name in ``dictNames``."""
    listOfTuples.extend(
        (person, language)
        for language, roster in dictNames.items()
        for person in roster
    )


createTuple()


In [83]:
# Length of the longest name over all languages (0 when there are no names).
max_len = max([0] + [len(entry) for roster in names.values() for entry in roster])

#
# import numpy as np
# def maxlength(data):
#     index = 0
#     for i in len(data):
#         if len(data[index][0] < len(data[i][0]))
#             index = i
# return len(data[index][0])


In [84]:
import numpy as np

def data_out(batch_size, data, drop_last=True):
    """Shuffle ``data`` and pack it into one-hot encoded mini-batches.

    Args:
        batch_size: Number of samples per batch; must be positive.
        data: Indexable collection of samples where ``sample[0]`` is the name
            and ``sample[1]`` is its language. Every language must occur in
            the module-level ``languages`` list and no name may be longer
            than ``max_len``.
        drop_last: When true (the default, which keeps the previous
            behaviour) a trailing batch with fewer than ``batch_size``
            samples is discarded. When false it is returned as a smaller
            final batch.

    Returns:
        A pair ``(x, y)`` of parallel lists holding one tensor per batch.
        Each ``x`` tensor has shape ``(batch, max_len, n_letters)`` and
        contains the one-hot letters of every name, zero-padded on the right.
        Each ``y`` tensor has dtype ``torch.long`` and shape
        ``(batch, n_categories)`` with a one-hot row per sample.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # A fresh random order on every call.
    indices = np.arange(0, len(data))
    np.random.shuffle(indices)

    x = []
    y = []
    for start in range(0, len(indices), batch_size):
        batch_indices = indices[start:start + batch_size]
        if drop_last and len(batch_indices) < batch_size:
            # Only the final slice can be short.
            break

        tensor_x = torch.zeros(len(batch_indices), max_len, n_letters)
        tensor_y = torch.zeros(len(batch_indices), n_categories, dtype=torch.long)

        for row, index in enumerate(batch_indices):
            sample = data[index]

            # nameToTensor yields (len(name), 1, n_letters); copy each step
            # into the row and leave the remaining steps as zero padding.
            for step, one_hot in enumerate(nameToTensor(sample[0])):
                tensor_x[row][step] = one_hot[0]

            tensor_y[row][languages.index(sample[1])] = 1

        x.append(tensor_x)
        y.append(tensor_y)

    return x, y
    



In [85]:
import torch.nn as nn

class RNN(nn.Module):
    """Sequence classifier: an ``nn.RNN`` followed by a linear read-out layer.

    Args:
        inputsize: Number of features per time step.
        hiddensize: Number of hidden units in each recurrent layer.
        nlayers: Number of stacked recurrent layers.
        outputsize: Number of scores (classes) produced per sequence.
    """

    def __init__(self, inputsize, hiddensize, nlayers, outputsize):
        super(RNN, self).__init__()

        self.rnn = nn.RNN(
            input_size=inputsize,
            hidden_size=hiddensize,
            num_layers=nlayers,
            # Inputs and outputs are laid out as (batch, seq_len, features).
            batch_first=True,
        )
        # Size the read-out from the constructor argument. It used to be
        # taken from the module-level n_categories, which silently ignored
        # ``outputsize``.
        self.out = nn.Linear(hiddensize, outputsize)

    def forward(self, x):
        """Return scores of shape ``(batch, outputsize)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, inputsize)``.
        """
        r_out, _hidden = self.rnn(x, None)  # None -> zero initial hidden state
        # Only the output of the final time step is classified.
        # NOTE(review): for zero-padded batches the final step is a padding
        # step rather than the last real letter -- confirm this is intended.
        return self.out(r_out[:, -1, :])
    


# Batch size: 20074

In [86]:

# Model, optimiser and loss. CrossEntropyLoss takes raw scores plus class
# indices, so the one-hot label rows are converted to indices below.
rnn = RNN(n_letters, 128, 1, n_categories)
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.005)
loss_func = nn.CrossEntropyLoss()

epochs = 5
batch_size = 20074

# %% training and testing
for epoch in range(1, epochs+1):
    # data_out reshuffles on every call, so each epoch gets fresh batches.
    bx, by = data_out(batch_size, listOfTuples)
    if not bx:
        # Without this the code below would fail with a NameError on ``x``.
        raise ValueError(
            "data_out returned no batches: batch_size=%d, samples=%d"
            % (batch_size, len(listOfTuples))
        )

    for x, y in zip(bx, by):
        target = torch.max(y, 1)[1]                     # one-hot rows -> class indices

        output = rnn(x)                                 # rnn output
        loss = loss_func(output, target)                # cross entropy loss
        optimizer.zero_grad()                           # clear gradients for this training step
        loss.backward()                                 # backpropagation, compute gradients
        optimizer.step()                                # apply gradients

    # Accuracy on the last batch of the epoch, after its update. That batch
    # is training data, so the figure is reported as train accuracy.
    with torch.no_grad():
        acc_output = rnn(x)                             # (samples, n_categories)
    pred_y = torch.max(acc_output, 1)[1]
    # Counting on tensors also works for a batch of size 1, where the old
    # squeeze()-based numpy comparison produced a non-iterable scalar.
    accuracy = (pred_y == target).sum().item() / y.size(0)

    print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| train accuracy: %.6f' % accuracy)
      
        



Epoch:  1 | train loss: 2.8597 | test accuracy: 0.468666
Epoch:  2 | train loss: 2.6500 | test accuracy: 0.468666
Epoch:  3 | train loss: 2.1861 | test accuracy: 0.468666
Epoch:  4 | train loss: 2.0358 | test accuracy: 0.468666
Epoch:  5 | train loss: 1.9546 | test accuracy: 0.468666


In [87]:
# %%
import random

def randomChoice(l):
    """Return one element of the sequence ``l`` picked uniformly at random."""
    position = random.randrange(len(l))
    return l[position]

def randomTrainingExample():
    """Draw a random language, then a random name of that language.

    Returns:
        ``(category, name, category_tensor, name_tensor)`` where
        ``category_tensor`` is a one-element long tensor holding the index of
        the category in ``languages`` and ``name_tensor`` is the result of
        ``nameToTensor(name)``.
    """
    category = randomChoice(languages)
    name = randomChoice(names[category])
    label = torch.tensor([languages.index(category)], dtype=torch.long)
    return category, name, label, nameToTensor(name)

# Confusion matrix: get_accuracy adds 1 at [true category][predicted category].
confusion = torch.zeros(n_categories, n_categories)
# Number of names get_accuracy evaluates. Derived from the loaded data rather
# than hard-coded (it was the literal 20074), so the accuracy stays correct
# if the data set changes.
n_confusion = sum(len(names[language]) for language in languages)

# return an output given an input name
def evaluate(name_tensor):
    """Return the network's category scores for one encoded name.

    Args:
        name_tensor: Tensor of shape ``(seq_len, 1, n_letters)`` as produced
            by ``nameToTensor``.

    Returns:
        The last row of the network output, i.e. a 1-D tensor with one score
        per category for a single name.
    """
    # rnn is built with batch_first=True, so the name must be laid out as
    # (1, seq_len, n_letters). Passing the (seq_len, 1, n_letters) tensor
    # directly made every letter its own length-1 sequence, and only the last
    # letter decided the result.
    sequence = name_tensor.transpose(0, 1)
    length = sequence.size(1)

    # Training batches from data_out are zero-padded on the right to max_len;
    # present the name in the same form the network was trained on.
    padded = torch.zeros(
        sequence.size(0), max(max_len, length), sequence.size(2),
        dtype=sequence.dtype,
    )
    padded[:, :length] = sequence

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        return rnn(padded)[-1]

def get_accuracy(conf, n_conf, languages, names):
    """Classify every name and tally the results.

    Args:
        conf: Square matrix indexed as ``conf[true][predicted]``; it is
            incremented in place.
        n_conf: Denominator used for the returned accuracy.
        languages: List of category labels.
        names: Mapping from each label in ``languages`` to its list of names.

    Returns:
        ``(correct / n_conf, conf)``.
    """
    hits = 0

    for lang in languages:
        names_of_lang = names[lang]
        for name in names_of_lang:
            scores = evaluate(nameToTensor(name))
            guess, predicted_row = categoryFromOutput(scores)

            # A bool adds as 0 or 1.
            hits += guess == lang

            true_row = languages.index(lang)
            conf[true_row][predicted_row] += 1

    return hits/n_conf, conf

# Classify every loaded name; ``confusion`` is updated in place and also
# returned as the second element of ``acc``.
acc = get_accuracy(
    conf=confusion,
    n_conf=n_confusion,
    languages=languages,
    names=names,
)

In [88]:
# First element of the (accuracy, confusion) pair returned by get_accuracy.
acc[0]

0.46866593603666434