In [1]:
import nltk
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Mapping STTS nach Universal Tagset

In [2]:
# Mapping from the STTS tagset to the Universal tagset.
# The mapping was essentially taken from:
# https://raw.githubusercontent.com/slavpetrov/universal-pos-tags/master/de-tiger.map
# and cross-checked against:
# https://pdfs.semanticscholar.org/ed2c/c779c7eb0004bd6dd50538a2cafca092c94f.pdf
#
# Based on TIGER_scheme-syntax.pdf, page 122ff., the following adjustment was made:
# NNE removed, because it is not part of the tagset.
#
# The following tags were added as a precaution, because they could show up
# in the lecturer's test set:
#
# PAV   PRON (same meaning as PROAV)
# PIDAT PRON
# SGML  X
# SPELL X
#
# NOTE(review): the original note announced "three" added tags that do not occur in
# the training set, but four are listed -- confirm which of them are really absent.

stts_to_universal = {
"$(":".",
"$,":".",
"$.":".",
"ADJA":"ADJ",
"ADJD":"ADJ",
"ADV":"ADV",
"APPO":"ADP",
"APPR":"ADP",
"APPRART":"ADP",
"APZR":"ADP",
"ART":"DET",
"CARD":"NUM",
"FM":"X",
"ITJ":"X",
"KOKOM":"CONJ",
"KON":"CONJ",
"KOUI":"CONJ",
"KOUS":"CONJ",
"NE":"NOUN",
"NN":"NOUN",
"PDAT":"PRON",
"PDS":"PRON",
"PIAT":"PRON",
"PIS":"PRON",
"PPER":"PRON",
"PPOSAT":"PRON",
"PPOSS":"PRON",
"PRELAT":"PRON",
"PRELS":"PRON",
"PRF":"PRON",
"PAV":"PRON",
"PROAV":"PRON",
"PTKA":"PRT",
"PTKANT":"PRT",
"PTKNEG":"PRT",
"PTKVZ":"PRT",
"PTKZU":"PRT",
"PWAT":"PRON",
"PWAV":"PRON",
"PWS":"PRON",
"TRUNC":"X",
"VAFIN":"VERB",
"VAIMP":"VERB",
"VAINF":"VERB",
"VAPP":"VERB",
"VMFIN":"VERB",
"VMINF":"VERB",
"VMPP":"VERB",
"VVFIN":"VERB",
"VVIMP":"VERB",
"VVINF":"VERB",
"VVIZU":"VERB",
"VVPP":"VERB",
"XY":"X",
"PIDAT":"PRON",
"SGML":"X",
"SPELL":"X"
}

### Hilfsfunktionen

In [3]:
def get_tagged_sentences(raw_lines):
    """Parse raw corpus lines into lists of (word, tag) tuples.

    Empty lines are skipped. For every other line the surrounding whitespace
    and then the final character are removed (presumably a trailing sentence
    delimiter -- confirm against the corpus format), the rest is split on
    " ; " and every token is split at its last '/'.

    Returns:
        One list per non-empty line, each holding the tuples produced by
        splitting the tokens (a token without '/' yields a 1-tuple).
    """
    def parse_line(raw_line):
        # Drop the last character of the stripped line before tokenizing.
        body = raw_line.strip()[:-1]
        return [tuple(token.strip().rsplit('/', 1)) for token in body.split(" ; ")]

    return [parse_line(raw_line) for raw_line in raw_lines if raw_line]

def create_index(sentences, special_values=None):
    """Build a value -> integer index and its inverse.

    Args:
        sentences: iterable of sequences of hashable values (words or tags).
        special_values: optional sequence of reserved values (e.g. "<PAD>");
            they receive the indices 0 .. len(special_values) - 1.

    Returns:
        (index, reverse_index): ``index`` maps every distinct value to an
        integer, ``reverse_index`` maps the integer back to the value.
        Regular values are numbered in order of first appearance, starting
        right after the reserved indices. The reserved values are inserted
        last, so they appear at the end when iterating over ``index``.
    """
    # Avoid a shared mutable default argument.
    special_values = [] if special_values is None else list(special_values)
    reserved = set(special_values)
    offset = len(special_values)

    # dict.fromkeys keeps first-appearance order and removes duplicates; no
    # frequency counting is needed. Values that are also reserved are skipped
    # here so that the resulting index range has no holes.
    distinct_values = dict.fromkeys(
        value
        for sentence in sentences
        for value in sentence
        if value not in reserved
    )

    index = {value: position + offset for position, value in enumerate(distinct_values)}

    for position, value in enumerate(special_values):
        index[value] = position

    reverse_index = {position: value for value, position in index.items()}

    return index, reverse_index

def translate(text, dictionnary, backup_value):
    """Map every element of ``text`` through ``dictionnary``.

    Elements that are not a key of ``dictionnary`` are replaced by
    ``backup_value``. The result is returned as a NumPy array.
    """
    translated = []
    for value in text:
        translated.append(dictionnary.get(value, backup_value))
    return np.array(translated)
        
def split_tuples(arrays_of_tuples):
    """Separate sequences of pairs into two parallel lists of NumPy arrays.

    For every inner sequence the first elements of its pairs go into one
    array and the second elements into another.

    Returns:
        (first_arrays, second_arrays), both with one array per inner sequence.
    """
    unzipped = []
    for pairs in arrays_of_tuples:
        # Unpacking into exactly two names requires every tuple to be a pair.
        first_items, second_items = zip(*pairs)
        unzipped.append((np.asarray(first_items), np.asarray(second_items)))

    first_arrays = [firsts for firsts, _ in unzipped]
    second_arrays = [seconds for _, seconds in unzipped]
    return first_arrays, second_arrays

def to_categorical_reverse(categorical_sents):
    """Turn per-token score vectors back into class indices.

    For every sentence, each token vector is replaced by the position of its
    largest entry (np.argmax). The per-sentence index arrays are wrapped in
    an outer NumPy array.
    """
    # Keras ships to_categorical but not the reverse operation.
    # Approach adapted from
    # https://stackoverflow.com/questions/47380663/numpy-reverse-keras-to-categorical
    return np.array([
        np.array([np.argmax(token_scores) for token_scores in sent])
        for sent in categorical_sents
    ])


def split_too_long_sentences(sentences, max_length):
    """Cut sentences longer than ``max_length`` into consecutive chunks.

    Every chunk except possibly the last one of a sentence has exactly
    ``max_length`` elements. Sentences that already fit are passed through
    unchanged. Each split is reported on stdout.

    Args:
        sentences: sequence of sliceable sentences.
        max_length: maximum allowed length of a sentence, must be >= 1.

    Returns:
        A new list with the (possibly split) sentences in original order.

    Raises:
        ValueError: if ``max_length`` is smaller than 1.
    """
    # With max_length <= 0 the remainder never gets shorter and the loop
    # below would never terminate.
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    sentences_splitted = []
    for i, sent in enumerate(sentences):
        while len(sent) > max_length:
            head, tail = sent[:max_length], sent[max_length:]
            print("Too long sentence", len(sent), "at index", i,
                  "splitting into", len(head), "and", len(tail))
            sentences_splitted.append(head)
            sent = tail

        sentences_splitted.append(sent)

    return sentences_splitted

def remove_padding(sequences_padded, sequences_target_length):
    """Trim every padded sequence to the length of its reference sequence.

    The i-th entry of ``sequences_padded`` is cut to
    ``len(sequences_target_length[i])`` elements.

    Returns:
        A list with one trimmed sequence per entry of
        ``sequences_target_length``.
    """
    return [
        sequences_padded[position][:len(reference)]
        for position, reference in enumerate(sequences_target_length)
    ]

def plot_history(h):
    """Plot training/validation loss and accuracy of a Keras fit history.

    Adapted from TensorFlow_Intro.ipynb.

    Args:
        h: object with a ``history`` dict as returned by ``model.fit``. It
            must contain 'loss' and either 'acc' or 'accuracy'. The matching
            'val_*' entries are plotted only when present (they are missing
            when training ran without validation data).

    Raises:
        KeyError: if neither 'acc' nor 'accuracy' is recorded.
    """
    history_dict = h.history

    def first_present(*candidate_keys):
        # The metric key depends on the Keras version ('acc' vs 'accuracy').
        for key in candidate_keys:
            if key in history_dict:
                return history_dict[key]
        return None

    acc = first_present('acc', 'accuracy')
    if acc is None:
        raise KeyError('acc')
    val_acc = first_present('val_acc', 'val_accuracy')
    loss = history_dict['loss']
    val_loss = history_dict.get('val_loss')

    plt.rcParams['figure.figsize'] = (12.0, 4.0)

    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, loss, 'bo', label='Training loss')            # "bo" is for "blue dot"
    if val_loss is not None:
        plt.plot(epochs, val_loss, 'b', label='Validation loss')   # b is for "solid blue line"
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    axes = plt.gca()
    axes.set_ylim([0.0, 0.2])

    plt.show()

    plt.clf()   # clear figure

    plt.plot(epochs, acc, 'bo', label='Training acc')
    if val_acc is not None:
        plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    axes = plt.gca()
    axes.set_ylim([0.92, 1])

    plt.show()

### Files öffnen

In [4]:
# Read both corpora; the context manager closes the file handles again as
# soon as the content has been read.
with open("POS_German_train.txt", "r") as train_text_file, \
     open("POS_German_minitest.txt", "r") as test_text_file:
    train_lines = train_text_file.read().split('\n')
    test_lines  = test_text_file.read().split('\n')

# create sents with (word, tag) tuples
train_tagged_sents      = get_tagged_sentences(train_lines)
test_tagged_sents       = get_tagged_sentences(test_lines)

# max sentence length based on longest train sentences (test sentences are considered unknown at first)
train_max_sent_length   = max(len(sentence) for sentence in train_tagged_sents)
print("Max Sent Length  ", train_max_sent_length)
# split too long test sentences, usually it would be done in a separate data cleansing step

number_of_test_sents_orig = len(test_tagged_sents)
test_tagged_sents       = split_too_long_sentences(test_tagged_sents, train_max_sent_length)
print("Length Test orig ", number_of_test_sents_orig, "and after split", len(test_tagged_sents))

# split (word, tag) into separate sents arrays
train_sents_words, train_sents_tags_stts = split_tuples(train_tagged_sents)
test_sents_words,  test_sents_tags_stts  = split_tuples(test_tagged_sents)

# map stts to universal tagset; unknown STTS tags become the marker "tag not found"
train_sents_tags = [translate(t, stts_to_universal, "tag not found") for t in train_sents_tags_stts]
test_sents_tags  = [translate(t, stts_to_universal, "tag not found") for t in test_sents_tags_stts]


print("Raw Line:        ", train_lines[1][:80])
print("Tagged sentence  ", train_tagged_sents[1][:5])
print("Words            ", train_sents_words[1][:5])
print("Tags STTS        ", train_sents_tags_stts[1][:5])
print("Tags Universal   ", train_sents_tags[1][:5])
print()
print("Raw Line:        ", test_lines[1][:80])
print("Tagged sentence  ", test_tagged_sents[1][:5])
print("Words            ", test_sents_words[1][:5])
print("Tags STTS        ", test_sents_tags_stts[1][:5])
print("Tags Universal   ", test_sents_tags[1][:5])

Max Sent Length   130
Too long sentence 143 at index 221 splitting into 130 and 13
Length Test orig  472 and after split 473
Raw Line:         Konzernchefs/NN ; lehnen/VVFIN ; den/ART ; Milliardaer/NN ; als/APPR ; US-Praesi
Tagged sentence   [('Konzernchefs', 'NN'), ('lehnen', 'VVFIN'), ('den', 'ART'), ('Milliardaer', 'NN'), ('als', 'APPR')]
Words             ['Konzernchefs' 'lehnen' 'den' 'Milliardaer' 'als']
Tags STTS         ['NN' 'VVFIN' 'ART' 'NN' 'APPR']
Tags Universal    ['NOUN' 'VERB' 'DET' 'NOUN' 'ADP']

Raw Line:         Qualifikation/NN ; und/KON ; Ausbildung/NN ; von/APPR ; Mitarbeitern/NN ; privat
Tagged sentence   [('Qualifikation', 'NN'), ('und', 'KON'), ('Ausbildung', 'NN'), ('von', 'APPR'), ('Mitarbeitern', 'NN')]
Words             ['Qualifikation' 'und' 'Ausbildung' 'von' 'Mitarbeitern']
Tags STTS         ['NN' 'KON' 'NN' 'APPR' 'NN']
Tags Universal    ['NOUN' 'CONJ' 'NOUN' 'ADP' 'NOUN']


#### Test Mapping STTS nach Universal Tagset

In [5]:
def test_stts_to_universal_mapping(sentences_tags_stts):
    """Print every distinct STTS tag of the given sentences with its Universal tag.

    Tags are listed in sorted order; a tag without an entry in
    ``stts_to_universal`` is shown as "error, tag not found".
    """
    distinct_stts_tags = sorted({tag for sentence in sentences_tags_stts for tag in sentence})
    mapped_tags = translate(distinct_stts_tags, stts_to_universal, "error, tag not found")

    for position, stts in enumerate(distinct_stts_tags):
        print(stts, mapped_tags[position])


test_stts_to_universal_mapping(train_sents_tags_stts)

$( .
$, .
$. .
ADJA ADJ
ADJD ADJ
ADV ADV
APPO ADP
APPR ADP
APPRART ADP
APZR ADP
ART DET
CARD NUM
FM X
ITJ X
KOKOM CONJ
KON CONJ
KOUI CONJ
KOUS CONJ
NE NOUN
NN NOUN
PAV PRON
PDAT PRON
PDS PRON
PIAT PRON
PIS PRON
PPER PRON
PPOSAT PRON
PPOSS PRON
PRELAT PRON
PRELS PRON
PRF PRON
PTKA PRT
PTKANT PRT
PTKNEG PRT
PTKVZ PRT
PTKZU PRT
PWAT PRON
PWAV PRON
PWS PRON
TRUNC X
VAFIN VERB
VAIMP VERB
VAINF VERB
VAPP VERB
VMFIN VERB
VMINF VERB
VMPP VERB
VVFIN VERB
VVIMP VERB
VVINF VERB
VVIZU VERB
VVPP VERB
XY X


### Zusätzliches Test/Dev Set erstellen
Es wird mit vier Sets gearbeitet

- Train
- Test

für möglichst hohe accuracy im POS_German_minitest.txt und 

- Train Partial (90% des Train sets)
- Dev (10% des Train sets)

während der Entwicklung zur Erarbeitung möglichst guter Hyperparameter.


### Mapping Word>Index, Tag>Index und umgekehrt anlegen

Auch hier werden zwei Mappings erstellt: eines mit dem gesamten Train Set und eines mit nur 90% der Trainingsdaten (Train Partial), das für die Auswertung auf dem Dev Set verwendet wird.


In [6]:
# Shuffle the training set and split it into "train partial" (90%) and "dev" (10%).
# A fixed random_state keeps the split -- and with it the dev index and all
# dev results -- reproducible across kernel restarts.
(train_sents_words_partial, 
 dev_sents_words,
 train_sents_tags_partial,
 dev_sents_tags) = train_test_split(
                        train_sents_words,
                        train_sents_tags,
                        test_size=0.1,
                        random_state=42)

# create index with 100% of train_sentences
word_to_index, index_to_word = create_index(train_sents_words, ["<PAD>","<UNK>"] )
tag_to_index,  index_to_tag  = create_index(train_sents_tags, ["<PAD>"])

# create index only with 90% of data (train_sents_words_partial)
word_to_index_dev, index_to_word_dev = create_index(train_sents_words_partial, ["<PAD>","<UNK>"] )
tag_to_index_dev,  index_to_tag_dev  = create_index(train_sents_tags_partial, ["<PAD>"])


print("Length Train         / Test sents:  ", len(train_sents_words), len(test_sents_words))
print("Length Train Partial / Dev  sents:  ", len(train_sents_words_partial), len(dev_sents_words))

print("Length          Word / Tag Index:   ", len(word_to_index), len(tag_to_index))
print("Length      Dev Word / Tag Index:   ", len(word_to_index_dev), len(tag_to_index_dev))

print("Word to index:     ", list(word_to_index.items())[-4:])
print("Index to word:     ", list(index_to_word.items())[-4:])
print("Word to index Dev: ", list(word_to_index_dev.items())[-4:])
print("Index to word Dev: ", list(index_to_word_dev.items())[-4:])

for (a, b) in zip(tag_to_index.items(), index_to_tag.items()):
    print(a,b)

Length Train         / Test sents:   40000 473
Length Train Partial / Dev  sents:   36000 4000
Length          Word / Tag Index:    76658 13
Length      Dev Word / Tag Index:    71886 13
Word to index:      [('Schiesser', 76656), ('Waeschefirma', 76657), ('<PAD>', 0), ('<UNK>', 1)]
Index to word:      [(76656, 'Schiesser'), (76657, 'Waeschefirma'), (0, '<PAD>'), (1, '<UNK>')]
Word to index Dev:  [('KMK-Praesident', 71884), ('Hochschulausgaben', 71885), ('<PAD>', 0), ('<UNK>', 1)]
Index to word Dev:  [(71884, 'KMK-Praesident'), (71885, 'Hochschulausgaben'), (0, '<PAD>'), (1, '<UNK>')]
('.', 1) (1, '.')
('NOUN', 2) (2, 'NOUN')
('VERB', 3) (3, 'VERB')
('ADV', 4) (4, 'ADV')
('DET', 5) (5, 'DET')
('ADJ', 6) (6, 'ADJ')
('ADP', 7) (7, 'ADP')
('PRT', 8) (8, 'PRT')
('PRON', 9) (9, 'PRON')
('X', 10) (10, 'X')
('CONJ', 11) (11, 'CONJ')
('NUM', 12) (12, 'NUM')
('<PAD>', 0) (0, '<PAD>')


### Sätze zu Integer übersetzen

In [7]:
def _encode(sentences, index, fallback_key):
    """Translate every sentence to integer ids; unknown entries get index[fallback_key]."""
    return [translate(sentence, index, index[fallback_key]) for sentence in sentences]


def _show_sample(words, words_int, tags, tags_int, int_to_word, int_to_tag, sample=15, width=7):
    """Print the first `width` words/tags of one sentence: raw, round-tripped and as ids."""
    print(words[sample][:width])
    print(translate(words_int[sample], int_to_word, "<UNK>")[:width])
    print(words_int[sample][:width])

    print(tags[sample][:width])
    print(translate(tags_int[sample][:width], int_to_tag, "X"))
    print(tags_int[sample][:width])


# Full train set and test set use the index built from 100% of the training data.
# The tag fallback "X" only takes effect for a tag that is missing from the index.
train_sents_words_int         = _encode(train_sents_words, word_to_index, "<UNK>")
train_sents_tags_int          = _encode(train_sents_tags,  tag_to_index,  "X")

test_sents_words_int          = _encode(test_sents_words,  word_to_index, "<UNK>")
test_sents_tags_int           = _encode(test_sents_tags,   tag_to_index,  "X")

# Train partial and dev set use the index built from 90% of the training data.
train_sents_words_partial_int = _encode(train_sents_words_partial, word_to_index_dev, "<UNK>")
train_sents_tags_partial_int  = _encode(train_sents_tags_partial,  tag_to_index_dev,  "X")

dev_sents_words_int           = _encode(dev_sents_words, word_to_index_dev, "<UNK>")
dev_sents_tags_int            = _encode(dev_sents_tags,  tag_to_index_dev,  "X")

print("Train Set")
_show_sample(train_sents_words, train_sents_words_int,
             train_sents_tags, train_sents_tags_int,
             index_to_word, index_to_tag)

print()
print("Test Set. Might contain <UNK> words, because translate-index was created only with train set.")
_show_sample(test_sents_words, test_sents_words_int,
             test_sents_tags, test_sents_tags_int,
             index_to_word, index_to_tag)

print()
print("Train Set Partial")
_show_sample(train_sents_words_partial, train_sents_words_partial_int,
             train_sents_tags_partial, train_sents_tags_partial_int,
             index_to_word_dev, index_to_tag_dev)

print()
print("Dev Set. Might contain <UNK> words, because translate-index was created only with train set partial.")
_show_sample(dev_sents_words, dev_sents_words_int,
             dev_sents_tags, dev_sents_tags_int,
             index_to_word_dev, index_to_tag_dev)

Train Set
['Er' 'waere' 'vielleicht' 'ein' 'praechtiger' 'Diktator' '-']
['Er' 'waere' 'vielleicht' 'ein' 'praechtiger' 'Diktator' '-']
[184   5   6   7   8   9 105]
['PRON' 'VERB' 'ADV' 'DET' 'ADJ' 'NOUN' '.']
['PRON' 'VERB' 'ADV' 'DET' 'ADJ' 'NOUN' '.']
[9 3 4 5 6 2 1]

Test Set. Might contain <UNK> words, because translate-index was created only with train set.
['Mit' 'den' 'Einnahmen' 'will' 'Ecclestone' 'vor' 'allem']
['Mit' 'den' 'Einnahmen' 'will' '<UNK>' 'vor' 'allem']
[  257    13 10094   378     1   322   323]
['ADP' 'DET' 'NOUN' 'VERB' 'NOUN' 'ADP' 'PRON']
['ADP' 'DET' 'NOUN' 'VERB' 'NOUN' 'ADP' 'PRON']
[7 5 2 3 2 7 9]

Train Set Partial
['Er' 'schloss' 'eine' 'Explosion' 'und' 'Sabotage' 'als']
['Er' 'schloss' 'eine' 'Explosion' 'und' 'Sabotage' 'als']
[191 192  49 193  40 194 186]
['PRON' 'VERB' 'DET' 'NOUN' 'CONJ' 'NOUN' 'ADP']
['PRON' 'VERB' 'DET' 'NOUN' 'CONJ' 'NOUN' 'ADP']
[ 5  6  1  7 10  7  4]

Dev Set. Might contain <UNK> words, because translate-index was created o

### Sätze padden

In [8]:
def _pad_to_max_length(sequences, pad_value, maxlen=train_max_sent_length):
    """Post-pad every sequence with `pad_value` up to `maxlen` entries."""
    return keras.preprocessing.sequence.pad_sequences(
        sequences, value=pad_value, padding='post', maxlen=maxlen)


# Every data set is padded with the <PAD> id of the index it was encoded with.
# The original code used word_to_index["<PAD>"] everywhere, which was only
# correct because <PAD> is index 0 in all four indices.
train_sents_words_int = _pad_to_max_length(train_sents_words_int, word_to_index["<PAD>"])
train_sents_tags_int  = _pad_to_max_length(train_sents_tags_int,  tag_to_index["<PAD>"])
test_sents_words_int  = _pad_to_max_length(test_sents_words_int,  word_to_index["<PAD>"])
test_sents_tags_int   = _pad_to_max_length(test_sents_tags_int,   tag_to_index["<PAD>"])

train_sents_words_partial_int = _pad_to_max_length(train_sents_words_partial_int, word_to_index_dev["<PAD>"])
train_sents_tags_partial_int  = _pad_to_max_length(train_sents_tags_partial_int,  tag_to_index_dev["<PAD>"])
dev_sents_words_int           = _pad_to_max_length(dev_sents_words_int,           word_to_index_dev["<PAD>"])
dev_sents_tags_int            = _pad_to_max_length(dev_sents_tags_int,            tag_to_index_dev["<PAD>"])


print(train_sents_words_int[15])
print(translate(train_sents_words_int[15],index_to_word,index_to_word[1]))
print(train_sents_tags_int[15])
print(translate(train_sents_tags_int[15],index_to_tag,index_to_tag[1]))

print(test_sents_words_int[15])
print(translate(test_sents_words_int[15],index_to_word,index_to_word[1]))
print(test_sents_tags_int[15])
print(translate(test_sents_tags_int[15],index_to_tag,index_to_tag[1]))

print(train_sents_words_partial_int[15])
print(translate(train_sents_words_partial_int[15],index_to_word_dev,index_to_word_dev[1]))
print(train_sents_tags_partial_int[15])
print(translate(train_sents_tags_partial_int[15],index_to_tag_dev,index_to_tag_dev[1]))

print(dev_sents_words_int[15])
print(translate(dev_sents_words_int[15],index_to_word_dev,index_to_word_dev[1]))
print(dev_sents_tags_int[15])
print(translate(dev_sents_tags_int[15],index_to_tag_dev,index_to_tag_dev[1]))

[184   5   6   7   8   9 105  74  30 185  92 186 187  36  10   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]
['Er' 'waere' 'vielleicht' 'ein' 'praechtiger' 'Diktator' '-' 'aber' 'das'
 'ist' 'nicht' 'unser' 'System' '.' "''" '<PAD>' '<PAD>' '<PAD>' '<PAD>'
 '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
 '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
 '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
 '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
 '<PAD>' '<PAD>' '<PAD>' '<PAD>

### Model erstellen

In [9]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, GRU, Bidirectional, TimeDistributed, Embedding
from keras.optimizers import Adam


def create_model(n_words_longest_sentence, n_distinct_words, n_distinct_tags):
    """Build and compile the sequence tagging network.

    Layer stack: Embedding -> Bidirectional(LSTM) -> TimeDistributed(Dense)
    -> Dense(softmax), compiled with categorical cross-entropy and Adam.

    Args:
        n_words_longest_sentence: length of the (padded) input sequences.
        n_distinct_words: vocabulary size; the embedding gets one extra row.
        n_distinct_tags: width of the per-token output.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # References:
    #   https://keras.io/getting-started/sequential-model-guide/
    #   https://keras.io/layers/embeddings/
    #   https://keras.io/layers/recurrent/
    #   https://nlpforhackers.io/lstm-pos-tagger-keras/
    #   https://machinelearningmastery.com/timedistributed-layer-for-long-short-term-memory-networks-in-python/
    #   https://keras.io/layers/core/
    #   https://www.dlology.com/blog/how-to-choose-last-layer-activation-and-loss-function/
    layers = [
        # input_dim should equal size of vocabulary + 1; mask_zero=True ignores padding
        Embedding(n_distinct_words + 1, 128, mask_zero=True, input_shape=(n_words_longest_sentence, )),
        # return_sequences=True is needed for TimeDistributed; 128 units gave the best results,
        # go_backwards=True led to bad performance.
        # Observed accuracy:  SimpleRNN 97.1 - 97.2 | GRU 97.2 - 97.7 | LSTM 97.4 - 97.8
        # (LSTM best with small batch size (2))
        Bidirectional(LSTM(128, return_sequences=True)),
        TimeDistributed(Dense(n_distinct_tags)),
        # multi-class, single-label classification: softmax + categorical_crossentropy
        Dense(n_distinct_tags, activation='softmax'),
    ]
    model = Sequential(layers)

    # https://keras.io/losses/ , https://keras.io/optimizers/ , https://keras.io/metrics/
    # https://www.dlology.com/blog/quick-notes-on-how-to-choose-optimizer-in-keras/
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(0.001),  # Adam performed better than 'rmsprop'
        metrics=['accuracy'],
    )

    return model


def fit_model(m, sents_words_int, sents_tags_int, validation_split, epochs=2, batch_size=2):
    """Train the model on integer-encoded sentences and tags.

    Args:
        m: compiled Keras model whose last output dimension is the number
            of tag classes.
        sents_words_int: padded integer word sequences (model input).
        sents_tags_int: padded integer tag sequences; converted to one-hot
            targets here.
        validation_split: fraction of the data Keras holds out for validation.
        epochs: number of training epochs (default 2, see table below).
        batch_size: mini-batch size (default 2, see table below).

    Returns:
        The history object returned by ``m.fit``.
    """
    # Take the one-hot width from the model instead of letting to_categorical
    # infer it from the largest tag id in the data: if the highest tag id is
    # absent from this data set, the inferred width would not match the
    # model output.
    n_classes = m.output_shape[-1]
    targets = keras.utils.to_categorical(sents_tags_int, num_classes=n_classes)

    # Because the default batch_size is very small, the best results are reached
    # after 2 epochs already, but each epoch takes longer (more steps).
    # https://stats.stackexchange.com/questions/153531/what-is-batch-size-in-neural-network
    # https://stackoverflow.com/questions/35050753/how-big-should-batch-size-and-number-of-epochs-be-when-fitting-a-model-in-keras
    #                 best val_acc (best epoch)      ~training_time per epoch (4 core 2.4GHz)
    # batch_size=256  0.9690       (5 epochs)        171s
    # batch_size=128  0.9682       (5 epochs)        213s
    # batch_size=64   0.9701       (4 epochs)        269s
    # batch_size=16   0.9735       (3 epochs)        672s
    # batch_size=4    0.9758       (3 epochs)        2000s
    # batch_size=2    0.9771       (2 epochs)        3400s  Best
    # batch_size=1    0.9741       (2 epochs)        8355s
    hist = m.fit(sents_words_int,
                 targets,
                 epochs=epochs,
                 batch_size=batch_size,  # small batch_size leads to smaller loss, params are adjusted after each batch
                 validation_split=validation_split,
                 verbose=1)
    return hist

Using TensorFlow backend.


### Hilfsfunktionen

In [10]:
def predict_tags(trained_model, sentences_to_predict, sentences_to_predict_int, true_tags, dictionnary, log_sent=None):
    """Predict tags for padded sentences and return them without padding.

    The model output is reduced to class indices, each prediction is trimmed
    to the length of the matching entry in ``true_tags`` and the indices are
    mapped through ``dictionnary`` (entries without a mapping become "error").

    Args:
        trained_model: model providing ``predict``.
        sentences_to_predict: original (unpadded) word sequences, used only
            for logging.
        sentences_to_predict_int: padded integer sequences fed to the model.
        true_tags: reference tag sequences; their lengths define how much of
            every prediction is kept.
        dictionnary: mapping from class index to tag.
        log_sent: optional index of one sentence whose intermediate results
            are printed; ``None`` disables logging.

    Returns:
        List with one array of predicted tags per sentence.
    """
    predictions_categorical     = trained_model.predict(sentences_to_predict_int)
    predictions_with_padding    = to_categorical_reverse(predictions_categorical)
    predictions_without_padding = remove_padding(predictions_with_padding, true_tags)
    predictions_tags            = [translate(tag, dictionnary,"error") for tag in predictions_without_padding]

    # Compare against None explicitly: a plain truthiness check would treat
    # sentence index 0 as "no logging requested".
    if log_sent is not None:
        print("Predictions Categorical ", predictions_categorical[log_sent])
        print("Predictions with Padding", predictions_with_padding[log_sent])
        print("Predictions no   Padding", predictions_without_padding[log_sent])
        print("Orig words: ",     len(sentences_to_predict[log_sent]),
              "True tags: ",      len(true_tags[log_sent]),
              "Predicted tags: ", len(predictions_tags[log_sent]))
        for (a, b, c) in zip(true_tags[log_sent], predictions_tags[log_sent], sentences_to_predict[log_sent]):
            print(a,b,c)

    return predictions_tags

def calculate_custom_accuracy_without_padding(predicted_tags, true_tags):
    """Token-level accuracy over all sentences.

    Both arguments are sequences of per-sentence tag sequences. They are
    flattened into one long list each and compared with ``accuracy_score``,
    so padding positions never enter the score as long as the inputs are
    unpadded.
    """
    flat_true, flat_predicted = [], []
    for sentence in true_tags:
        flat_true.extend(sentence)
    for sentence in predicted_tags:
        flat_predicted.extend(sentence)

    return accuracy_score(flat_true, flat_predicted)
    
    
def run_model(_train_max_sent_length,
              _index_to_word,
              _index_to_tag,
              _train_sents_words_int,
              _train_sents_tags_int,
              _test_sents_words,
              _test_sents_words_int,
              _test_sents_tags,
              _test_sents_tags_int,
              _validation_split):
    """Create, train and evaluate a model; results are printed, nothing is returned.

    Two accuracies are reported for the given test data: the one computed by
    ``model.evaluate`` on the padded sequences and a custom one computed on
    the predictions with padding removed.
    """
    vocabulary_size = len(_index_to_word)
    tagset_size = len(_index_to_tag)

    model = create_model(_train_max_sent_length, vocabulary_size, tagset_size)
    model.summary()
    # Only two epochs are trained in the end, so the history is not plotted
    # (plot_history would be the place to do it).
    fit_model(model, _train_sents_words_int, _train_sents_tags_int, _validation_split)

    # Sentence 15 is logged in detail as a spot check.
    sents_predicted_tags = predict_tags(model,
                                        _test_sents_words,
                                        _test_sents_words_int,
                                        _test_sents_tags,
                                        _index_to_tag,
                                        15)

    test_targets = keras.utils.to_categorical(_test_sents_tags_int, tagset_size)
    acc_keras = model.evaluate(_test_sents_words_int, test_targets)
    print("Accuracy Keras: ", acc_keras[1] * 100)

    acc_custom = calculate_custom_accuracy_without_padding(sents_predicted_tags, _test_sents_tags)
    print("Accuracy Custom (manually ignoring padding): ", acc_custom * 100)


### Modell trainieren mit 90% der Trainingsdaten (davon 10% zur Validierung) und mit 10% der Trainingsdaten testen

In [12]:
# Development run: train on the 90% partial set, evaluate on the dev set.
# Training takes roughly 2 hours on CPU.
run_model(
    _train_max_sent_length=train_max_sent_length,
    _index_to_word=index_to_word_dev,
    _index_to_tag=index_to_tag_dev,
    _train_sents_words_int=train_sents_words_partial_int,
    _train_sents_tags_int=train_sents_tags_partial_int,
    _test_sents_words=dev_sents_words,
    _test_sents_words_int=dev_sents_words_int,
    _test_sents_tags=dev_sents_tags,
    _test_sents_tags_int=dev_sents_tags_int,
    _validation_split=0.1,
)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 130, 128)          9201536   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 130, 256)          263168    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 130, 13)           3341      
_________________________________________________________________
dense_4 (Dense)              (None, 130, 13)           182       
Total params: 9,468,227
Trainable params: 9,468,227
Non-trainable params: 0
_________________________________________________________________
Train on 32400 samples, validate on 3600 samples
Epoch 1/2
Epoch 2/2
Predictions Categorical  [[6.7208314e-09 4.6107753e-09 1.1907963e-03 ... 9.9837840e-01
  7.2746104e-08 1.0743126e-06]
 [7.6444134e-14 3.3112693e-07 9.9939835e-01 ... 2.2150701e-04
  4.5508210e-07

### Modell trainieren mit 100% der Trainingsdaten und mit POS_German_minitest.txt testen

In [13]:
# Final run: train on the complete training set (no validation split) and
# evaluate on the sentences from POS_German_minitest.txt.
run_model(
    _train_max_sent_length=train_max_sent_length,
    _index_to_word=index_to_word,
    _index_to_tag=index_to_tag,
    _train_sents_words_int=train_sents_words_int,
    _train_sents_tags_int=train_sents_tags_int,
    _test_sents_words=test_sents_words,
    _test_sents_words_int=test_sents_words_int,
    _test_sents_tags=test_sents_tags,
    _test_sents_tags_int=test_sents_tags_int,
    _validation_split=0.0,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 130, 128)          9812352   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 130, 256)          263168    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 130, 13)           3341      
_________________________________________________________________
dense_6 (Dense)              (None, 130, 13)           182       
Total params: 10,079,043
Trainable params: 10,079,043
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Predictions Categorical  [[9.0430768e-10 2.5329872e-10 1.7422338e-06 ... 2.0075470e-06
  3.6228448e-05 8.4130125e-10]
 [7.0927370e-10 1.9370254e-09 1.1134790e-07 ... 1.2060173e-07
  6.7904354e-10 2.1664703e-06]
 [1.9098657e-17 1.3546241e-23 9