In [None]:
import torch
import torch.nn as nn
import os
from io import open
import glob
import unicodedata
import string
import numpy as np

In [None]:
data_path = './data/*.txt'

class LanguageDetect():
    """Word-level dataset for language identification.

    Every ``<language>.txt`` file found under ``files_path`` becomes one class
    (named after the file without its extension). Each file is reduced to a
    list of distinct lowercase ASCII words; indexing the dataset yields a
    one-hot encoded word tensor together with the index of its class.

    Attributes:
        alphabet: characters that can be encoded (``string.ascii_lowercase``).
        wordlist: all words, grouped by language file.
        labels:   class index for every entry of ``wordlist``.
        classes:  language names; position == class index.
        padding:  length of the longest word (0 when no word was loaded).
    """

    def __init__(self, files_path='./data', limit_size=None):
        """Scan the language files and build the word / label lists.

        Args:
            files_path: a directory (every ``*.txt`` inside is used) or a glob
                pattern such as ``'./data/*.txt'``.
            limit_size: maximum number of words kept per language file;
                ``None`` keeps all of them.
        """
        # Accept both a directory and a ready-made glob pattern.
        if os.path.isdir(files_path):
            pattern = os.path.join(files_path, '*.txt')
        else:
            pattern = files_path
        # glob returns files in arbitrary order; sorting keeps the mapping
        # language -> class index identical from run to run.
        lang_files = sorted(glob.glob(pattern))

        self.alphabet = string.ascii_lowercase
        self.wordlist = []
        self.labels = []
        self.classes = []
        self.padding = 0

        for idx, file in enumerate(lang_files):
            language = os.path.splitext(os.path.basename(file))[0]
            self.classes.append(language)
            words = self._transform_vocab(file, limit_size)
            self.wordlist += words
            self.labels += [idx] * len(words)

        # size of the largest string (0 for an empty dataset instead of a crash)
        self.padding = max((len(word) for word in self.wordlist), default=0)

    def _unicodeToAscii(self, s):
        """Strip diacritics from ``s`` and drop every non ASCII-letter character."""
        all_letters = string.ascii_letters
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
            and c in all_letters)

    def _transform_vocab(self, vocab, limit_size):
        """Read the text file ``vocab`` and return its distinct ASCII words.

        Words are lowercased, stripped of non-alphabetic characters, folded to
        ASCII and de-duplicated in order of first appearance, so the result
        (and the subset selected by ``limit_size``) is deterministic.
        """
        # The context manager closes the file; the explicit encoding avoids
        # depending on the platform default for accented text.
        with open(vocab, encoding='utf-8') as handle:
            text = handle.read().lower()
        # keep letters and whitespace only, then split into tokens
        text = ''.join(ch for ch in text if ch.isalpha() or ch.isspace())
        words = (self._unicodeToAscii(token).lower() for token in text.split())
        # Filter on length AFTER the ASCII conversion: folding can shrink a
        # token (e.g. a non-Latin word becomes ''), and empty or one-letter
        # words would otherwise end up as all-zero / degenerate tensors.
        words = [word for word in words if len(word) > 1]
        # unique elements, first-seen order (set() ordering changes per run)
        words = list(dict.fromkeys(words))
        if limit_size is not None:
            words = words[:limit_size]
        return words

    def _word2tensor(self, word, padding=True):
        """One-hot encode ``word`` as a ``(length, 1, len(alphabet))`` tensor.

        Args:
            word: string made only of characters from ``self.alphabet``.
            padding: when True the first dimension is ``self.padding`` (or the
                word length if the word is longer) and unused rows stay zero;
                when False it is exactly ``len(word)``.

        Raises:
            ValueError: if ``word`` contains a character outside the alphabet.
        """
        if padding:
            # never allocate fewer rows than the word needs
            num_charac = max(self.padding, len(word))
        else:
            num_charac = len(word)
        wtensor = torch.zeros(num_charac, 1, len(self.alphabet))
        for position, letter in enumerate(word):
            column = self.alphabet.find(letter)
            if column < 0:
                # find() returns -1, which would silently light up the LAST
                # alphabet column; fail loudly instead.
                raise ValueError(
                    'character %r of word %r is not in the alphabet' % (letter, word))
            wtensor[position][0][column] = 1
        return wtensor

    def __len__(self):
        """Number of words in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(word_tensor, label)`` for the word at position ``idx``."""
        word = self.wordlist[idx]
        label = self.labels[idx]

        # convert a word to a tensor (padded length x 1 x alphabet)
        word = self._word2tensor(word, padding=True)

        return word, label
        
        

In [None]:
# Build the dataset, keeping at most 200 distinct words per language file.
dataset = LanguageDetect(files_path=data_path, limit_size=200)

# Names shared with the later cells: language names and the encodable characters.
classes = dataset.classes
alphabet = dataset.alphabet

print('dataset size:', len(dataset))
print('classes:', classes)

In [None]:
def tensor2word(tensor):
    """Decode a one-hot word tensor back into a string.

    Args:
        tensor: 3-D tensor indexed as ``[position][0][letter]``; a value of 1
            at ``[i][0][j]`` stands for the j-th lowercase ASCII letter.

    Returns:
        The letters of all positions, in order; rows without a 1 (padding)
        contribute nothing.
    """
    letters = string.ascii_lowercase
    decoded = []
    for row in tensor[:, 0, :]:
        # column indices holding exactly 1, in ascending order
        active = (row == 1).nonzero().flatten().tolist()
        decoded.extend(letters[j] for j in active)
    return ''.join(decoded)
        
    
# Every dataset item is an (input, label) pair.
# Shape of the first input tensor
print(dataset[0][0].size())
# Class index of the first item
print(dataset[0][1])


# Sample example. The upper bound of randint is exclusive, so the range must
# start at 0 to be able to draw the first item (and to work on a 1-item dataset).
sample_idx = np.random.randint(0, len(dataset))
sample_word, sample_label = dataset[sample_idx]

print('random sample')
print('language:', classes[sample_label])
print('word:', tensor2word(sample_word))

In [None]:
# 80% of the words go to training, the remainder to validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_data, val_data = torch.utils.data.random_split(dataset, [train_size, val_size])

# One word per batch: the training loop only reads inputs[0] of each batch.
batch_size = 1
loader_options = dict(batch_size=batch_size, shuffle=True)

# Training and validation loaders share the same settings.
train_loader = torch.utils.data.DataLoader(dataset=train_data, **loader_options)
val_loader = torch.utils.data.DataLoader(dataset=val_data, **loader_options)

![RNN model](./model.png)

In [None]:
# simple RNN as in: 
# https://pytorch.org/tutorials/beginner/former_torchies/nnft_tutorial.html

class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    Both layers read the concatenation of the current input and the previous
    hidden state: ``i2h`` yields the next hidden state, ``i2o`` yields the
    class scores, which are turned into log-probabilities.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        joint_size = input_size + hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, last_hidden):
        """Advance one step; returns ``(new_hidden, log_probabilities)``."""
        joint = torch.cat((input, last_hidden), dim=1)
        next_hidden = self.i2h(joint)
        log_probs = self.softmax(self.i2o(joint))
        return next_hidden, log_probs
    

In [None]:
# Width of the recurrent hidden state; used both to size the RNN layers and to
# create the initial all-zero hidden vector in the training loop.
hidden_size = 128

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using gpu:", use_cuda)

In [None]:
# One input unit per alphabet character (one-hot letters), one output unit per
# language class; the model is moved to the selected device.
rnn = RNN(len(alphabet), hidden_size, len(classes)).to(device)

In [None]:
def classify_output(output):
    """Map a row of class scores to ``(class_name, class_index)``.

    Only the first row of ``output`` is considered; the name is looked up in
    the module-level ``classes`` list.
    """
    # topk(1) -> (values, indices); only the index of the best score matters
    best_index = output.topk(1)[1]
    category_i = best_index[0].item()
    return classes[category_i], category_i

In [None]:
# Step size of the manual gradient-descent update.
learning_rate = 1e-4
# Negative log-likelihood loss, applied to the log-probabilities the RNN outputs.
criterion = nn.NLLLoss()

In [None]:
import time
import math


def train(model, criterion, learning_rate=0.0001, epochs=1000, plots_per_epoch=100):
    """Train ``model`` with plain gradient descent, one word at a time.

    Reads the module-level ``train_loader``, ``device``, ``hidden_size`` and
    ``classify_output``. Only the first sample of every batch is used, so the
    loader is expected to yield batches of size 1.

    Args:
        model: recurrent cell called as ``model(letter, hidden)`` and
            returning ``(hidden, output)``.
        criterion: loss called as ``criterion(output, labels)``.
        learning_rate: step size of the parameter update.
        epochs: number of passes over ``train_loader``.
        plots_per_epoch: a progress line is printed every this many epochs.

    Returns:
        ``(model, cost)`` where ``cost`` holds one float per epoch: the sum of
        the losses of all samples seen in that epoch.
    """
    cost = []
    start_time = time.time()
    # guard against a division by zero when the loader is empty
    num_examples = max(len(train_loader.dataset), 1)

    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            hidden = torch.zeros(1, hidden_size, device=device)
            output = None
            # operate on the model that was passed in, not on a global one
            model.zero_grad()

            word = inputs[0]
            for letter in word:  # go through each letter
                if torch.sum(letter) <= 0:  # all-zero row: padding starts here
                    break
                hidden, output = model(letter, hidden)

            if output is None:
                # the word had no letters at all: nothing to learn from
                continue

            loss = criterion(output, labels)
            loss.backward()

            # Gradient-descent step: p <- p - learning_rate * grad
            with torch.no_grad():
                for p in model.parameters():
                    if p.grad is not None:
                        p -= learning_rate * p.grad

            guess, guess_i = classify_output(output)
            if guess_i == labels[0].item():
                running_corrects += 1
            # .item() stores a plain float; adding the tensor itself would keep
            # every sample's autograd graph alive for the whole epoch.
            running_loss += loss.item()

        cost.append(running_loss)
        epoch_acc = running_corrects / num_examples

        if epoch % plots_per_epoch == 0:
            print('Epoch [{}] -> Loss: {:.4f}  Acc: {:.4f}'.format(
                epoch, running_loss / num_examples, epoch_acc))

    time_elapsed = time.time() - start_time
    print()
    print('Training completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return model, cost

In [None]:
# Train for 1000 epochs; the returned model reference is discarded and the
# per-epoch losses are kept for the plot in the next cell.
_, all_losses = train(rnn, criterion, learning_rate, epochs=1000)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# float() accepts plain numbers as well as single-element loss tensors (including
# ones that track gradients or live on the GPU), which matplotlib cannot convert itself.
fig, ax = plt.subplots()
ax.plot([float(epoch_loss) for epoch_loss in all_losses])
ax.set(xlabel='epoch', ylabel='summed training loss', title='Training loss per epoch');

In [None]:
# evaluate model

            
def calc_accuracy(model, dataloader, print_output=False):
    """Return the percentage of samples in ``dataloader`` classified correctly.

    Reads the module-level ``device``, ``hidden_size``, ``classify_output``,
    ``classes`` and (only when printing) ``tensor2word``. Only the first
    sample of every batch is evaluated, so the loader is expected to yield
    batches of size 1.

    Args:
        model: recurrent cell called as ``model(letter, hidden)`` and
            returning ``(hidden, output)``.
        dataloader: loader yielding ``(inputs, labels)`` batches.
        print_output: when True, print every word with its guess and whether
            the guess was right.

    Returns:
        Accuracy in percent (0.0 for an empty dataset).
    """
    num_correct = 0
    num_examples = len(dataloader.dataset)  # size of the DATA, not number of batches
    if num_examples == 0:
        return 0.0

    # no gradients are needed while evaluating
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            word = inputs[0]
            target = labels[0].item()

            # feed the word to the model letter by letter
            hidden = torch.zeros(1, hidden_size, device=device)
            output = None
            for letter in word:
                if torch.sum(letter) <= 0:  # all-zero row: padding starts here
                    break
                hidden, output = model(letter, hidden)

            if output is None:
                # word without letters: no prediction, counted as wrong
                continue

            guess, guess_i = classify_output(output)
            if print_output:
                correct = '✓' if guess_i == target else '✗ (%s)' % classes[target]
                print('%s / %s %s' % (tensor2word(word), guess, correct))

            if guess_i == target:
                num_correct += 1

    percent_correct = num_correct / num_examples * 100
    return percent_correct