# G27
Dave Brunner

In [27]:
import glob
import os
import torch
import string

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import matplotlib.ticker as ticker

from sklearn.metrics import confusion_matrix

### Load Data

The names can be found in text files in a src directory, one file per language.

In the following you can find some utilities to load the data into pandas data frames. 

We will restrict to some common European languages. 

With the given selection, we will identify all the occurring characters and initialize an alphabet.<br>
For this alphabet, we will use a one-hot-encoding to map them into a vector space representation. 

Foresee a suitable character for the end of the word, e.g. 'END'.

In [28]:
srcdir = 'data/names'
languages = ["English", "French", "Italian", "German", "Spanish"]

In [29]:
# inspect the data directory
def findFiles(path):
    return glob.glob(path)

In [30]:
print('\n'.join(findFiles(os.path.join(srcdir, '*.txt'))))

data/names/Czech.txt
data/names/German.txt
data/names/Arabic.txt
data/names/Japanese.txt
data/names/Chinese.txt
data/names/Vietnamese.txt
data/names/Russian.txt
data/names/French.txt
data/names/Irish.txt
data/names/English.txt
data/names/Spanish.txt
data/names/Greek.txt
data/names/Italian.txt
data/names/Portuguese.txt
data/names/Scottish.txt
data/names/Dutch.txt
data/names/Korean.txt
data/names/Polish.txt


In [31]:
# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return lines


def load_data(srcdir, categories=None):
    names_list = []
    for filename in findFiles(os.path.join(srcdir, '*.txt')):
        category = os.path.splitext(os.path.basename(filename))[0]
        if not categories or category in categories:
            names = readLines(filename)
            names_list.extend([(name, category) for name in names])
    df = pd.DataFrame(names_list)
    df.columns = ["name", "lang"]
    return df

In [32]:
names = load_data(srcdir, categories=languages)
names.head()

Unnamed: 0,name,lang
0,Abbing,German
1,Abel,German
2,Abeln,German
3,Abt,German
4,Achilles,German


In [33]:
maxlen = np.max([len(name) for name in names.name])
print("Maximum name length: ", maxlen)

Maximum name length:  18


In [34]:
print(f'There are {len(names)} names in the dataset')

There are 5676 names in the dataset


In [35]:
alphabet = sorted(list(set(''.join([name for name in names.name]))))
alphabet.append('END')
len_alphabet = len(alphabet)
char_index = dict((c, i) for i, c in enumerate(alphabet))
print("Size of alphabet: ", len_alphabet)
print(alphabet)

Size of alphabet:  74
[' ', "'", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Á', 'É', 'ß', 'à', 'á', 'ä', 'ç', 'è', 'é', 'ê', 'ì', 'í', 'ñ', 'ò', 'ó', 'ö', 'ù', 'ú', 'ü', 'END']


In [36]:
names.groupby('lang')['name'].count() / len(names)

lang
English    0.646230
French     0.048802
German     0.127555
Italian    0.124912
Spanish    0.052502
Name: name, dtype: float64

### Vector Representations

Now construct the vector representation by using one-hot-vectors. 

In [37]:
language_to_index = {country: index for index, country in enumerate(names.lang.unique())}
index_to_language = {index: country for index, country in enumerate(names.lang.unique())}


def onehot(i, length):
    v = np.zeros(length);
    v[i] = 1
    return v


def name_representation(name, maxlen):
    name_trunc = str(name)[0:maxlen]
    size = len(char_index)
    vector = [onehot(char_index[j], size) for j in str(name)]
    # fill the rest with 
    for k in range(0, maxlen - len(str(name))):
        vector.append(onehot(char_index['END'], size))
    return vector


def lang_representation(language, language_to_index):
    y = np.zeros(len(language_to_index))
    y[language_to_index[language]] = 1
    return y


def lang_from_output(score):
    return index_to_language[np.argmax(score)]


def predict(name, model):
    score = model.predict(np.array([name_representation(name, maxlen)]))[0]
    return lang_from_output(score)

### Prepare train/test

Split the data into train/test

Shuffle the data

Transform the names data into a suitable vector respresentation:
* names into numpy arrays of shape (*,maxlen,len_alphabet)
* language into numpy array of shape (*,len(languages))



In [38]:
test_split = 0.2

# Shuffle and split names data
# names = names.sample(frac=1).reset_index(drop=True)
# print(names.head())

train = names[int(len(names) * test_split):]
test = names[:int(len(names) * test_split)]

In [39]:
# Map train and test data into vector space (one-hot-vectors)
X_train = np.array([name_representation(name, maxlen) for name in train.name])
Y_train = np.array([lang_representation(lang, language_to_index) for lang in train.lang])

X_test = np.array([name_representation(name, maxlen) for name in test.name])
Y_test = np.array([lang_representation(lang, language_to_index) for lang in test.lang])

In [40]:
print(X_train.shape)
print(X_test.shape)
print(X_train[0].shape)

(4541, 18, 74)
(1135, 18, 74)
(18, 74)


Possibly, pack the data into a Dataset (e.g. when working with in PyTorch)

In [41]:
# convert from numpy to torch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
Y_train = torch.tensor(Y_train, dtype=torch.float32)
train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)

X_test = torch.tensor(X_test, dtype=torch.float32)
Y_test = torch.tensor(Y_test, dtype=torch.float32)
test_dataset = torch.utils.data.TensorDataset(X_test, Y_test)

In [42]:
batch_size = 128

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

### Define and Train Model: Single Layer with SimpleRNN

Create an RNN consisting of a single layer with a SimpleRNN and a softmax.

Then train the model. Play with different number of hidden units in the layer to obtain a good accuracy.

In [43]:
from torch import nn

input_size = len_alphabet
hidden_size = 32
output_size = len(languages)


class ElmanRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, batch_first=True):
        super(ElmanRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                          batch_first=batch_first)
        self.fc = nn.Linear(hidden_size, output_size)
        self.activation = nn.ReLU()

    def forward(self, x):
        # Initialize hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        print(h0.shape)
        # Forward propagate RNN
        out, hn = self.rnn(x, h0)
        # print(out.shape)
        # Pass the output of the last time step to the classifier
        out = self.fc(out[:, -1, :])
        return self.activation(out)


model = ElmanRNN(input_size, hidden_size, output_size)
model

ElmanRNN(
  (rnn): RNN(74, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=5, bias=True)
  (activation): ReLU()
)

In [44]:
# batch_size = 128
# n_epochs = 10000
# learning_rate = 0.1
# model = ElmanRNN(input_size, hidden_size, output_size)
# 
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  
# 
# for epoch in range(n_epochs):
#     correct = 0
#     total = 0
#     for i, (names, labels) in enumerate(train_loader):
#         optimizer.zero_grad()
# 
#         outputs = model(names)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         
#         # Get predictions from the maximum value
#         _, predicted = torch.max(outputs.data, 1)
#         labels = torch.max(labels, 1)[1]
# 
#         # Total number of labels
#         total += labels.size(0)
# 
#         # Total correct predictions
#         correct += (predicted == labels).sum()
#     
#     accuracy = 100 * correct / total
#     print(
#         f"Epoch [{epoch + 1}/{n_epochs}], "
#         f"Loss: {loss.item():.4f}, "
#         f"Accuracy: {accuracy :.4f}"
#     )


In [45]:
import torch
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    self.rnn = nn.RNN(input_size, hidden_size)
    self.linear = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden):
    # print(x.shape)
    output, hidden = self.rnn(x, hidden)
    # print(output.shape)
    output = self.linear(output)
    # print(output.shape)
    return output, hidden

def transform_prediction(pred):
    # Get the index of the maximum value in the tensor
    max_index = torch.argmax(pred, dim=1)

    # Create a new tensor filled with zeros of the same shape as pred
    transformed_pred = torch.zeros_like(pred)

    # Use scatter to set the index corresponding to the maximum value to one
    transformed_pred.scatter_(1, max_index.unsqueeze(1), 1)
    # enable gradient
    
    return transformed_pred

def train(model, data_loader, criterion, optimizer, num_epochs):
  for epoch in range(num_epochs):
    for names, labels in data_loader:
      optimizer.zero_grad()
      pred, h = model(names, None)
      last_output = pred[:, -1, :]
      last_output = transform_prediction(last_output)
      
      loss = criterion(last_output, labels)
      loss.backward()
      optimizer.step()

    # Print training progress (optional)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Example usage (replace with your data loaders)
input_size = len_alphabet
hidden_size = 5
output_size = len(languages)
model = RNN(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  

train(model, train_loader, criterion, optimizer, num_epochs=100)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

#### Findings

...

### Implement Model with several SimpleRNN Layers

In [None]:
### START YOUR CODE

model = ...

### END YOUR CODE

In [None]:
### START YOUR CODE

# train...

### END YOUR CODE

#### Findings

...


### Class Imbalance Handling

Choose a method to address the class imbalance seen in the given example.
- minority resampling 
- class weights in the loss

Implement it and incorporate it in the training.
Evaluate the results and compare it with the results obtained with the unbalanced training.  

In [None]:
### START YOUR CODE

# train...

### END YOUR CODE