# POS Tagger  Gianluca Notaro

## Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
import sklearn as sk
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
from keras import backend as K

Using TensorFlow backend.


## Hilfsfunktionen

In [2]:
def convert_to_stts(text):
    text = ''.join(text)
    splits = text.split(';')
    sentences = []
    tags = []
    for i in splits:
        i = i.split('/')
        if len(i)==2: #Discard last "Word" with no tag
            if i[0][:1] == '[':
                i[0] = i[0][2:]# remove [' from first word
            sentences.append(i[0].replace(' ', ''))
            tags.append(i[1].replace(' ',''))
    return [sentences,tags]

In [3]:
def remove_padding(tagslist):
    cleaned_tags = []
    for t in range(len(tagslist)):
        if tagslist[t] == '-PAD-':
            pass
        else:
            cleaned_tags.append(tagslist[t])
    return cleaned_tags

In [4]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
        token_sequences.append(token_sequence)
    return token_sequences

In [5]:
def token_to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [6]:
def list_flattener(taglist):
    flat_list = []
    for sublist in taglist:
        for tag in sublist:
            flat_list.append(tag)
    return flat_list

In [7]:
def print_accuracy(pred, actual):
    pred = list_flattener(pred)
    actual = list_flattener(actual)
    count = len(actual)
    correct = 0
    for i in range(len(actual)):
        if pred[i] == actual[i]:
            correct = correct +  1
    print('Accuracy: ', (correct/count))

In [8]:
def stts_to_uPOS(tag):
    switcher = {
    'ADJA': 'ADJ',
    'ADJD': 'ADJ',
    'ADV': 'ADV',
    'APPR': 'ADP',
    'APPRART': 'ADP',
    'APPO': 'ADP',
    'APZR': 'ADP',
    'ART': 'DET',
    'CARD': 'NUM',
    'FM': 'X',
    'ITJ': 'INTJ',
    'KOUI': 'SCONJ',
    'KOUS': 'SCONJ',
    'KON': 'CCONJ',
    'KOKOM': 'CCONJ',
    'NN': 'NOUN',
    'NE': 'PROPN',
    'PDS': 'PRON',
    'PDAT': 'DET',
    'PIS': 'PRON',
    'PIAT': 'DET',
    'PIDAT': 'DET',
    'PPER': 'PRON',
    'PPOSS': 'PRON',
    'PPOSAT': 'DET',
    'PRELS': 'PRON',
    'PRELAT': 'DET',
    'PRF': 'PRON',
    'PWS': 'PRON',
    'PWAT': 'DET',
    'PWAV': 'DET',
    'PAV': 'ADV',
    'PTKZU': 'PART',
    'PTKNEG': 'PART',
    'PTKVZ': 'ADP',
    'PTKANT': 'PART',
    'PTKA': 'PART',
    'VVFIN': 'VERB',
    'VVIMP': 'VERB',
    'VVINF': 'VERB',
    'VVIZU': 'VERB',
    'VVPP': 'VERB',
    'VAFIN': 'AUX',
    'VAIMP': 'AUX',
    'VAINF': 'AUX',
    'VAPP': 'AUX',
    'VMFIN': 'VERB',
    'VMINF': 'VERB',
    'VMPP': 'VERB',
    'XY': 'X',
    'SGML': 'X',
    'SPELL': 'X',
    'TRUNC': 'X',
    '$,': 'PUNCT',
    '$.': 'PUNCT',
    '$(': 'PUNCT'
    }
    return switcher.get(tag,'')

## Daten einlesen und vorbereiten

In [9]:
pos_train = pd.read_csv('POS_German_train.txt', delimiter='\t', header=None)
pos_train = pos_train.values.tolist()

pos_test = pd.read_csv('POS_German_minitest.txt', delimiter='\t', header=None)
pos_test = pos_test.values.tolist()

Splitten in tags  und Wörter

In [10]:
sentences = []
tags = []
lister = []

N = len(pos_train)
for i in range(N):
    lister.append(convert_to_stts(pos_train[i]))

for i in range(N):
    sentences.append(lister[i][0])
    tags.append(lister[i][1])

Tags umwandeln in POS-Tags

In [11]:
uPOS_tags  = []

for i in  tags:
    uPOS_tags.append(list(map(stts_to_uPOS,i)))

(words_train, words_test, tags_train, tags_test) = train_test_split(sentences, uPOS_tags, test_size=0.0)

words,tags = set([]), set([])

for s in words_train:
    for w in s:
        words.add(w.lower())
        
for ts in tags_train:
 for t  in ts:
        tags.add(t)

Tensorflow kann nicht mit Strings arbeiten, darum muss jedes Wort/Tag umgewandelt werden in einen integer Wert. Um mit Keras arbeiten zu können braucht es Sequenzen mit gleicher Länge, darum werden die Listen unten mit -PAD- auf gleihcer Länge gebracht. -OOV- steht für Out of word und wird beim testen gebraucht, falls ein Wort vorkommt der nicht im indexer ist. 

In [12]:
wordIndexer = {w: i + 2 for i, w in enumerate(list(words))}
wordIndexer['-PAD-'] = 0  # The special value used for padding
wordIndexer['-OOV-'] = 1  # The special value used for OOVs

tagIndexer = {t: i + 1 for i, t in enumerate(list(tags))}
tagIndexer['-PAD-'] = 0  # The special value used to padding
        
words_train_x, tags_train_y, words_minitest_x, tags_minitest_y = [],[],[],[]

In [13]:
for s in words_train:
    s_int = []
    for w in s:
        try:
            s_int.append(wordIndexer[w.lower()])
        except:
            s_int.append(wordIndexer['-OOV-'])
    words_train_x.append(s_int)
    
for s in tags_train:
    tags_train_y.append([tagIndexer[t] for t in s])
    
    
    
lister_test = []
sentences_minitest = []
tags_minitest = []

M = len(pos_test)
for i in range(M):
    lister_test.append(convert_to_stts(pos_test[i]))

for i in range(M):
    sentences_minitest.append(lister_test[i][0])
    tags_minitest.append(lister_test[i][1])
    
words_minitest_x = []
for s in sentences_minitest:
    s_int = []
    for w in s:
        try:
            s_int.append(wordIndexer[w.lower()])
        except:
            s_int.append(wordIndexer['-OOV-'])
    words_minitest_x.append(s_int)

tags_minitest_pos = []

for i in tags_minitest:
    tags_minitest_pos.append(list(map(stts_to_uPOS,i)))
tags_minitest_pos_y = []

for i in tags_minitest_pos:
      tags_minitest_pos_y.append([tagIndexer[t] for t in i])

In [14]:
MAX_LEN = len(max(words_train))
words_train_x = pad_sequences(words_train_x, maxlen=MAX_LEN, padding='post')
tags_train_y = pad_sequences(tags_train_y,maxlen=MAX_LEN, padding='post')
words_minitest_x = pad_sequences(words_minitest_x, maxlen=MAX_LEN, padding='post')
tags_minitest_pos_y = pad_sequences(tags_minitest_pos_y, maxlen=MAX_LEN, padding='post')

## Model vorbereiten und trainieren

In [15]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LEN, )))
model.add(Embedding(len(wordIndexer),128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tagIndexer))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
             optimizer=Adam(0.001),
             metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 18, 128)           9443328   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 18, 512)           788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 18, 17)            8721      
_________________________________________________________________
activation_1 (Activation)    (None, 18, 17)            0         
Total params: 10,240,529
Trainable params: 10,240,529
Non-trainable params: 0
_________________________________________________________________


Der Input hier um den Model zu trainieren ist in One-Hot Encodings.

In [19]:
cat_tags_train_y = token_to_categorical(tags_train_y,len(tagIndexer))
model.fit(words_train_x, token_to_categorical(tags_train_y,len(tagIndexer)), batch_size=500, epochs=6, validation_split=0.4)

Train on 22284 samples, validate on 14856 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f702fa85a90>

 Wie man sieht, ist es met dem eigenen Validation set recht genau
 
 
 Sample Size wurde zufällig gewählt, Epochs wurde 6 genommen, weil nach 6 keine grosse Unterschiede mehr zu sehen waren, validation_split wurde der Vorschlag einer Googlesuche genommen.  

# Testing

Der Output vom model.predict ist eine List von Wahrscheinlichkeiten. Es ist recht schwierig, diesen Daten zu vergleichen. Darum werden sie in One-Hot Listen umgewandelt, danach in Tokens und das Padding entfernt. So können die Daten mit tags_minitest_pos_y verglichen werden und die Genauigkeit berechnet werden.

In [20]:
scores = model.predict(words_minitest_x)
result = logits_to_tokens(scores, {i: t for t, i in tagIndexer.items()})
cleaned_scores = []
results_y = []
for s in result:
    cleaned_scores.append([tagIndexer[t] for t in s])
    
for i in cleaned_scores:
    results_y.append(remove_padding(i))

## Test mit Testfile

In [21]:
print_accuracy(cleaned_scores, tags_minitest_pos_y)

Accuracy:  0.9495056497175142
