# 2.2 KFold LSTM implementation

# Training set

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 14 18:49:13 2019

@author: changhyunlee
"""

from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path): return glob.glob(path)

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

names = {}
languages = []

def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

n_categories = len(languages)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

import torch

def letterToIndex(letter):
    return all_letters.find(letter)

def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

def nameToTensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i.item()
    return languages[category_i], category_i

# create a list of tuples (language, name)

listOfTuples = []
dictNames = names
count = 0
def createTuple():
    for lang, listname in dictNames.items():
        for n in listname:
            tupleLangName = (n, lang)
            listOfTuples.append(tupleLangName)

            
createTuple()
#%%

#print(names)
#%%
# kfold
import numpy as np 
from sklearn.model_selection import KFold

#prepare cross-validation
kfold = KFold(5, True, 1)

kTest = []
kTrain = []
list1 = []
list2 = []
list3 = []
list4 = []
list5 = []

list10 = []
list20 = []
list30 = []
list40 = []
list50 = []

count = 0
#enumerate splits
#for train, test in kfold.split(listOfTuples):
#    print(len(train))
    
#print()

for train, test in kfold.split(listOfTuples):
    if count == 0:
        for j in test:
            list10.append(listOfTuples[j])
        kTest.append(list10)
        for i in train:
            list1.append(listOfTuples[i])
        kTrain.append(list1) 
        count += 1
    if count == 1:
        for j in test:
            list20.append(listOfTuples[j])
        kTest.append(list20)
        for i in train:
            list2.append(listOfTuples[i])
        kTrain.append(list2) 
        count += 1
    if count == 2:
        for j in test:
            list30.append(listOfTuples[j])
        kTest.append(list30)
        for i in train:
            list3.append(listOfTuples[i])
        kTrain.append(list3) 
        count += 1
    if count == 3:
        for j in test:
            list40.append(listOfTuples[j])
        kTest.append(list40)
        for i in train:
            list4.append(listOfTuples[i])
        kTrain.append(list4) 
        count += 1
    if count == 4:
        for j in test:
            list50.append(listOfTuples[j])
        kTest.append(list50)
        for i in train:
            list5.append(listOfTuples[i])
        kTrain.append(list5) 
        count += 1
#	print(listOfTuples[n])
#print(len(list6))
#print(kTrain[4])
#print(list1[0][1])


#%% method to find max length name

import numpy as np

def max(src):
    index = 0
    for i in range(len(src)):
        if len(src[index][0]) < len(src[i][0]):
            index = i
    return len(src[index][0])


def data_out(batch_size, data):
    x = []
    y = []
    num_batches = int(len(data)/batch_size)
    indices = np.arange(0, len(data))
    np.random.shuffle(indices)  
    batches = []
    batch_i = []
    sample_count = 0
    
    for index in indices:
        sample_count += 1
        batch_i.append(data[index])
       
        if sample_count % batch_size == 0:
            batches.append(batch_i)
            batch_i = []
    
    for i in range(len(batches)):
        max_len = max(batches[i])
        tensor_x = torch.zeros(batch_size, max_len, n_letters)
        tensor_y = torch.zeros(batch_size, n_categories, dtype=torch.long)
        
        for j in range(len(batches[i])):
            name_tensor = nameToTensor(batches[i][j][0])
            
            for k in range(len(name_tensor)):
                tensor_x[j][k] = name_tensor[k][0]
                
            category = batches[i][j][1]
            category_tensor = torch.zeros(n_categories, dtype=torch.long)
            category_tensor[languages.index(category)] = 1
            tensor_y[j] = category_tensor
            
        x.append(tensor_x)
        y.append(tensor_y)
        
    return x, y
    



import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, inputsize, hiddensize, nlayers, outputsize):
        super(RNN, self).__init__()

        self.rnn = nn.LSTM(         
            input_size = inputsize,
            hidden_size = hiddensize,         # number of hidden units
            num_layers = nlayers,           # number of layers
            batch_first = True       # If your input data is of shape (seq_len, batch_size, features) then you don’t need batch_first=True and your RNN will output a tensor with shape (seq_len, batch.
        )
        self.out = nn.Linear(128, n_categories)
    
    def forward(self, x):
        r_out, h = self.rnn(x, None)   # None represents zero initial hidden state
        out = self.out(r_out[:, -1, :])
        return out
    


rnn = RNN(n_letters, 128, 1, n_categories)
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.005)
loss_func = nn.CrossEntropyLoss()                       

epochs = 5
batch_size = 1000

# %% training and testing
for i in range(len(kTrain)):
    for epoch in range(1, epochs+1):
        bx, by = data_out(batch_size, kTrain[i])


        for batch in range(len(bx)):
            x = bx[batch]
            y = by[batch]

            output = rnn(x)                                 # rnn output
            loss = loss_func(output, torch.max(y, 1)[1])    # cross entropy loss
            optimizer.zero_grad()                           # clear gradients for this training step
            loss.backward()                                 # backpropagation, compute gradients
            optimizer.step()                                # apply gradients

        acc_output = rnn(x)                   # (samples, time_step, input_size)
        pred_y = torch.max(acc_output, 1)[1].data.numpy().squeeze()
        target = torch.max(y, 1)[1].data.numpy().squeeze()
        accuracy = sum(pred_y == target) / y.size(0)

        print("Fold: ", i+1, " | Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.6f' % accuracy)




# %%
import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(languages)
    name = randomChoice(names[category])
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    name_tensor = nameToTensor(name)
    return category, name, category_tensor, name_tensor

# Keep track of correct guesses in a confusion matrix
confusion = torch.zeros(n_categories, n_categories)
n_confusion = 20074

# return an output given an input name
def evaluate(name_tensor):
#    hidden = rnn.initHidden()
    output = rnn(name_tensor)[-1]
#    for i in range(name_tensor.size()[0]):
#        output, hidden = rnn(name_tensor[i], hidden)

    return output

def get_accuracy(conf, n_conf, languages, names):
    count_correct = 0
    
    for lang in languages:
        for name in names[lang]:
            name_tensor = nameToTensor(name)
            output = evaluate(name_tensor)
            guess, guess_i = categoryFromOutput(output)
            
            if guess == lang:
                count_correct += 1
                
            category_i = languages.index(lang)
            conf[category_i][guess_i] += 1
    
    return count_correct/n_conf, conf

acc = get_accuracy(confusion, n_confusion, languages, names)
print(acc[0])

Fold:  1  | Epoch:  1 | train loss: 1.9089 | test accuracy: 0.462000
Fold:  1  | Epoch:  2 | train loss: 1.7752 | test accuracy: 0.503000
Fold:  1  | Epoch:  3 | train loss: 1.8073 | test accuracy: 0.474000
Fold:  1  | Epoch:  4 | train loss: 1.9098 | test accuracy: 0.461000
Fold:  1  | Epoch:  5 | train loss: 1.8841 | test accuracy: 0.475000
Fold:  2  | Epoch:  1 | train loss: 1.8851 | test accuracy: 0.456000
Fold:  2  | Epoch:  2 | train loss: 1.9103 | test accuracy: 0.448000
Fold:  2  | Epoch:  3 | train loss: 1.8077 | test accuracy: 0.502000
Fold:  2  | Epoch:  4 | train loss: 1.7753 | test accuracy: 0.499000
Fold:  2  | Epoch:  5 | train loss: 1.8400 | test accuracy: 0.465000
Fold:  3  | Epoch:  1 | train loss: 1.8183 | test accuracy: 0.470000
Fold:  3  | Epoch:  2 | train loss: 1.8647 | test accuracy: 0.471000
Fold:  3  | Epoch:  3 | train loss: 1.8893 | test accuracy: 0.461000
Fold:  3  | Epoch:  4 | train loss: 1.8473 | test accuracy: 0.472000
Fold:  3  | Epoch:  5 | train loss

# Test set

In [4]:

# %% training and testing
for i in range(len(kTest)):
    for epoch in range(1, epochs+1):
        bx, by = data_out(batch_size, kTrain[i])


        for batch in range(len(bx)):
            x = bx[batch]
            y = by[batch]

            output = rnn(x)                                 # rnn output
            loss = loss_func(output, torch.max(y, 1)[1])    # cross entropy loss
            optimizer.zero_grad()                           # clear gradients for this training step
            loss.backward()                                 # backpropagation, compute gradients
            optimizer.step()                                # apply gradients

        acc_output = rnn(x)                   # (samples, time_step, input_size)
        pred_y = torch.max(acc_output, 1)[1].data.numpy().squeeze()
        target = torch.max(y, 1)[1].data.numpy().squeeze()
        accuracy = sum(pred_y == target) / y.size(0)

        print("Fold: ", i+1, " | Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.6f' % accuracy)




# %%
import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(languages)
    name = randomChoice(names[category])
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    name_tensor = nameToTensor(name)
    return category, name, category_tensor, name_tensor

# Keep track of correct guesses in a confusion matrix
confusion = torch.zeros(n_categories, n_categories)
n_confusion = 20074

# return an output given an input name
def evaluate(name_tensor):
#    hidden = rnn.initHidden()
    output = rnn(name_tensor)[-1]
#    for i in range(name_tensor.size()[0]):
#        output, hidden = rnn(name_tensor[i], hidden)

    return output

def get_accuracy(conf, n_conf, languages, names):
    count_correct = 0
    
    for lang in languages:
        for name in names[lang]:
            name_tensor = nameToTensor(name)
            output = evaluate(name_tensor)
            guess, guess_i = categoryFromOutput(output)
            
            if guess == lang:
                count_correct += 1
                
            category_i = languages.index(lang)
            conf[category_i][guess_i] +=  1
    
    return count_correct/n_conf, conf

acc = get_accuracy(confusion, n_confusion, languages, names)
print(acc[0])

Fold:  1  | Epoch:  1 | train loss: 1.2533 | test accuracy: 0.632000
Fold:  1  | Epoch:  2 | train loss: 1.0876 | test accuracy: 0.687000
Fold:  1  | Epoch:  3 | train loss: 1.0839 | test accuracy: 0.715000
Fold:  1  | Epoch:  4 | train loss: 0.9983 | test accuracy: 0.724000
Fold:  1  | Epoch:  5 | train loss: 0.9916 | test accuracy: 0.725000
Fold:  2  | Epoch:  1 | train loss: 1.0066 | test accuracy: 0.737000
Fold:  2  | Epoch:  2 | train loss: 0.8343 | test accuracy: 0.762000
Fold:  2  | Epoch:  3 | train loss: 0.7987 | test accuracy: 0.770000
Fold:  2  | Epoch:  4 | train loss: 0.8032 | test accuracy: 0.776000
Fold:  2  | Epoch:  5 | train loss: 0.7163 | test accuracy: 0.794000
Fold:  3  | Epoch:  1 | train loss: 0.7253 | test accuracy: 0.794000
Fold:  3  | Epoch:  2 | train loss: 0.6209 | test accuracy: 0.832000
Fold:  3  | Epoch:  3 | train loss: 0.6573 | test accuracy: 0.812000
Fold:  3  | Epoch:  4 | train loss: 0.6660 | test accuracy: 0.802000
Fold:  3  | Epoch:  5 | train loss