In [1]:
import glob
import os
import string

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import matplotlib.ticker as ticker

from sklearn.metrics import confusion_matrix

### Load Data

The names can be found in text files in a src directory, one file per language.

In the following you can find some utilities to load the data into pandas data frames. 

We will restrict to some common European languages. 

With the given selection, we will identify all the occurring characters and initialize an alphabet.<br>
For this alphabet, we will use a one-hot-encoding to map them into a vector space representation. 

Foresee a suitable character for the end of the word, e.g. 'END'.

In [2]:
srcdir = 'data/names'
languages = ["English","French","Italian","German","Spanish"]

In [3]:
# inspect the data directory
def findFiles(path): 
    return glob.glob(path)

In [4]:
print('\n'.join(findFiles(os.path.join(srcdir,'*.txt'))))

data/names/French.txt
data/names/Scottish.txt
data/names/Dutch.txt
data/names/Arabic.txt
data/names/Chinese.txt
data/names/Polish.txt
data/names/Italian.txt
data/names/Japanese.txt
data/names/Russian.txt
data/names/Spanish.txt
data/names/Irish.txt
data/names/English.txt
data/names/German.txt
data/names/Portuguese.txt
data/names/Czech.txt
data/names/Korean.txt
data/names/Vietnamese.txt
data/names/Greek.txt


In [5]:
# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return lines

def load_data(srcdir, categories=None):
    names_list = []
    for filename in findFiles(os.path.join(srcdir,'*.txt')):
        category = os.path.splitext(os.path.basename(filename))[0]
        if not categories or category in categories: 
            names = readLines(filename)
            names_list.extend([(name,category) for name in names])
    df = pd.DataFrame(names_list)
    df.columns = ["name","lang"]
    return df

In [6]:
# The names of the different languages
names = load_data(srcdir,categories=languages)
names.head()

Unnamed: 0,name,lang
0,Abel,French
1,Abraham,French
2,Adam,French
3,Albert,French
4,Allard,French


In [7]:
maxlen = np.max([len(name) for name in names.name])
print("Maximum name length: ", maxlen)

Maximum name length:  18


In [16]:
# Just the letters of the alphabet
alphabet = sorted(list(set(''.join([name for name in names.name]))))
alphabet.append('END')
len_alphabet = len(alphabet)
# Dict that maps characters to numbers
char_index = dict((c, i) for i, c in enumerate(alphabet))
print("Size of alphabet: ",len_alphabet)
print(alphabet)
print(char_index)

Size of alphabet:  74
[' ', "'", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Á', 'É', 'ß', 'à', 'á', 'ä', 'ç', 'è', 'é', 'ê', 'ì', 'í', 'ñ', 'ò', 'ó', 'ö', 'ù', 'ú', 'ü', 'END']
{' ': 0, "'": 1, 'A': 2, 'B': 3, 'C': 4, 'D': 5, 'E': 6, 'F': 7, 'G': 8, 'H': 9, 'I': 10, 'J': 11, 'K': 12, 'L': 13, 'M': 14, 'N': 15, 'O': 16, 'P': 17, 'Q': 18, 'R': 19, 'S': 20, 'T': 21, 'U': 22, 'V': 23, 'W': 24, 'X': 25, 'Y': 26, 'Z': 27, 'a': 28, 'b': 29, 'c': 30, 'd': 31, 'e': 32, 'f': 33, 'g': 34, 'h': 35, 'i': 36, 'j': 37, 'k': 38, 'l': 39, 'm': 40, 'n': 41, 'o': 42, 'p': 43, 'q': 44, 'r': 45, 's': 46, 't': 47, 'u': 48, 'v': 49, 'w': 50, 'x': 51, 'y': 52, 'z': 53, 'Á': 54, 'É': 55, 'ß': 56, 'à': 57, 'á': 58, 'ä': 59, 'ç': 60, 'è': 61, 'é': 62, 'ê': 63, 'ì': 64, 'í': 65, 'ñ': 66, 'ò': 67, 'ó

In [9]:
# Distribution of names in the data
# Percentage of names in that language
names.groupby('lang')['name'].count()/len(names)

lang
English    0.646230
French     0.048802
German     0.127555
Italian    0.124912
Spanish    0.052502
Name: name, dtype: float64

### Vector Representations

Now construct the vector representation by using one-hot-vectors. 

In [24]:
language_to_index = {country:index for index,country in enumerate(names.lang.unique())}
index_to_language = {index:country for index,country in enumerate(names.lang.unique())}

def onehot(i, length):
    v = np.zeros(length)
    v[i] = 1
    return v

def name_representation(name, maxlen):
    ### START YOUR CODE
    name_trunc = str(name)[0:maxlen]
    # Create vector of length "alphabet"
    # vector contains
    size = len(char_index)
    vector = [onehot(char_index.get(j), size) for j in str(name)]
    # fill the rest with 
    for k in range(0,maxlen - len(str(name))):
        vector.append(onehot(char_index['END'], size))
    return vector
    ### START YOUR CODE

def lang_representation(language, language_to_index):
    y = np.zeros(len(language_to_index))
    y[language_to_index.get(language)]=1
    return y

def lang_from_output(score):
    return index_to_language[np.argmax(score)]

def predict(name, model):
    score = model.predict(np.array([name_representation(name, maxlen)]))[0]
    return lang_from_output(score)

### Prepare train/test

Split the data into train/test

Shuffle the data

Transform the names data into a suitable vector respresentation:
* names into numpy arrays of shape (*,maxlen,len_alphabet)
* language into numpy array of shape (*,len(languages))



In [19]:
test_split = 0.2

### START YOUR CODE
# Shuffle and split names data
names = names.sample(frac=1)

train = names[:int(round((1-test_split)*len(names), 0))]

test = names[int(round((1-test_split)*len(names), 0)) : ]

assert len(train) + len(test) == len(names)
print(test.head(), end='\n----- \n')
print(train.head())

### END YOUR CODE

           name     lang
3105       Keys  English
2507  Gauntlett  English
2729     Hawker  English
769     Padovan  Italian
3792     Oriley  English
----- 
           name     lang
50    Chevrolet   French
2947    Jeffrey  English
4258    Sargent  English
1339     Aldren  English
1616       Bray  English


In [25]:
### START YOUR CODE

# Map train and test data into vector space (one-hot-vectors)

print(train.get('name').head)

X_train = name_representation( train.get('name'), maxlen=maxlen )
Y_train = lang_representation( train.get('name'), language_to_index=language_to_index )


X_test = name_representation( test.get('name'), maxlen=maxlen )
Y_test = lang_representation( test, language_to_index=language_to_index )

print(X_test.head())
print(Y_test.head())

### END YOUR CODE

<bound method NDFrame.head of 50       Chevrolet
2947       Jeffrey
4258       Sargent
1339        Aldren
1616          Bray
           ...    
323         Adessi
2515          Gell
4804    Wheatcroft
3756     Odriscoll
4978          Aust
Name: name, Length: 4541, dtype: object>


TypeError: unhashable type: 'Series'

Possibly, pack the data into a Dataset (e.g. when working with in PyTorch)

### Define and Train Model: Single Layer with SimpleRNN

Create an RNN consisting of a single layer with a SimpleRNN (keras) and a softmax.

Then train the model. Play with different number of hidden units in the layer to obtain a good accuracy.

In [11]:
### START YOUR CODE

from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import SimpleRNN


# SimpleRNN, single layer with tf.keras....

model = Sequential()
model.add(SimpleRNN(128, activation='softmax'))



### END YOUR CODE

In [None]:
### START YOUR CODE
batch_size=
nepochs = 


### END YOUR CODE

#### Findings

...

### Implement Model with several SimpleRNN Layers

In [None]:
### START YOUR CODE

model = ...

### END YOUR CODE

In [None]:
### START YOUR CODE

# train...

### END YOUR CODE

#### Findings

...


### Class Imbalance Handling

Choose a method to address the class imbalance seen in the given example.
- minority resampling 
- class weights in the loss

Implement it and incorporate it in the training.
Evaluate the results and compare it with the results obtained with the unbalanced training.  

In [None]:
### START YOUR CODE

# train...

### END YOUR CODE