In [70]:
import sys
import os
import numpy as np 
from keras.utils import to_categorical
from validation import compute_f1
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import keras.backend as K
from keras.callbacks import ModelCheckpoint
from keras_contrib.layers import CRF
from numpy import newaxis
import models
import sklearn
import subprocess
import fastText
import utils

In [71]:
def createMatrices(sentences, label2Idx, case2Idx, char2Idx):
    """Convert tokenised sentences into per-sentence feature lists.

    Args:
        sentences: iterable of sentences; every sentence is an iterable of
            ``(word, chars, label)`` triples, where ``chars`` is an iterable
            of the word's characters.
        label2Idx: mapping from label string to integer id.
        case2Idx: casing mapping handed to ``utils.getCasing``, e.g.
            ``{'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3,
            'other': 4, 'mainly_numeric': 5, 'contains_digit': 6,
            'PADDING_TOKEN': 7}``.
        char2Idx: mapping from character to integer id. Characters missing
            from it are mapped to ``char2Idx['UNKNOWN']``.

    Returns:
        A list with one ``[words, casing_ids, char_ids, label_ids]`` entry per
        sentence. ``words`` holds the raw word strings, not indices.

    Raises:
        KeyError: if a label is missing from ``label2Idx``, or an unmapped
            character is met and ``char2Idx`` has no ``'UNKNOWN'`` entry.
    """
    dataset = []

    for sentence in sentences:
        wordIndices = []
        caseIndices = []
        charIndices = []
        labelIndices = []

        for word, chars, label in sentence:
            # Use the mapping passed by the caller (previously the argument was
            # ignored and the global models.char2Idx was read instead).
            charIdx = [
                char2Idx[c] if c in char2Idx else char2Idx['UNKNOWN']
                for c in chars
            ]
            wordIndices.append(word)
            caseIndices.append(utils.getCasing(word, case2Idx))
            charIndices.append(charIdx)
            # Get the label and map it to its integer id.
            labelIndices.append(label2Idx[label])

        dataset.append([wordIndices, caseIndices, charIndices, labelIndices])

    return dataset

def padding(Sentences, maxlen=52):
    """Pad the character-index lists of every sentence to a fixed width.

    Args:
        Sentences: list of ``[words, casing_ids, char_ids, label_ids]``
            entries (the layout built by ``createMatrices``). Element ``2`` of
            each entry is replaced in place.
        maxlen: number of character positions per word. Defaults to 52, the
            width this function has always padded to.

    Returns:
        The same ``Sentences`` object, with element ``2`` of every entry
        replaced by the result of ``pad_sequences(..., padding='post')``.
    """
    for sentence in Sentences:
        sentence[2] = pad_sequences(sentence[2], maxlen, padding='post')
    return Sentences



In [72]:
def tag_dataset(dataset):
    """Predict labels for every sentence in ``dataset``, one sentence at a time.

    Reads the notebook-level ``model``.

    Args:
        dataset: sized sequence of ``(tokens, casing, char, labels)`` entries.

    Returns:
        ``(predLabels, correctLabels)``: the arg-max class ids predicted for
        each sentence, and the ``labels`` element of each entry unchanged.
    """
    # Progbar is not imported in the notebook's import cell, so bring it into
    # scope here; otherwise the call below raises NameError.
    from keras.utils import Progbar

    correctLabels = []
    predLabels = []
    progress = Progbar(len(dataset))
    for i, (tokens, casing, char, labels) in enumerate(dataset):
        # Wrap each feature in a list to add the leading batch axis (batch of one).
        tokens = np.asarray([tokens])
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing, char], verbose=False)[0]
        pred = pred.argmax(axis=-1)  # Predict the classes
        correctLabels.append(labels)
        predLabels.append(pred)
        # Report completed items (i + 1) so the bar reaches its target.
        progress.update(i + 1)
    return predLabels, correctLabels

In [73]:
# Load the GermEval train / dev / test splits from their TSV files.
trainSentences = utils.get_sentences_germeval('../data/germeval/NER-de-train.tsv')
devSentences = utils.get_sentences_germeval('../data/germeval/NER-de-dev.tsv')
testSentences = utils.get_sentences_germeval('../data/germeval/NER-de-test.tsv')

# Disabled: additionally extend each split with the German CoNLL data.
# trainSentences.extend(utils.get_sentences_conll('../data/CONLL/deu/deu_utf.train.bio'))
# devSentences.extend(utils.get_sentences_conll('../data/CONLL/deu/deu_utf.testa.bio'))
# testSentences.extend(utils.get_sentences_conll('../data/CONLL/deu/deu_utf.testb.bio'))

# Report the number of sentences in each split.
print(len(trainSentences))
print(len(devSentences))
print(len(testSentences))

24000
2200
5100


In [74]:
# Set the maximum sequence length on the shared `models` module.
# NOTE(review): how `models` uses this value is not visible in this notebook — confirm.
models.max_sequence_length = 56
# TODO replace with non-hardcoded version

In [75]:
# Show the first test sentence to inspect the data layout (a list of [word, label] pairs).
print(testSentences[0])

[['1951', 'O'], ['bis', 'O'], ['1953', 'O'], ['wurde', 'O'], ['der', 'O'], ['nördliche', 'O'], ['Teil', 'O'], ['als', 'O'], ['Jugendburg', 'O'], ['des', 'O'], ['Kolpingwerkes', 'B-OTH'], ['gebaut', 'O'], ['.', 'O']]


In [76]:
# Load label mapping
import json

# The index file stores four mappings, in this order:
# idx2Label, label2Idx, char2Idx, case2Idx.
# A context manager closes the file handle as soon as parsing is done.
with open("model_lstm_germeval_v2.0.h5.indexes", "r") as index_file:
    indexMappings = json.load(index_file)

# JSON object keys are always strings, so turn the label ids back into ints.
models.idx2Label = {int(k): v for k, v in indexMappings[0].items()}
models.label2Idx = indexMappings[1]
models.char2Idx = indexMappings[2]
models.case2Idx = indexMappings[3]

In [77]:
# Print the four mappings loaded from the index file for a visual sanity check.
print(models.idx2Label)
print(models.label2Idx)
print(models.char2Idx)
print(models.case2Idx)

{0: 'PADDING_TOKEN', 1: 'I-LOC', 2: 'B-ORGderiv', 3: 'I-ORGderiv', 4: 'I-OTH', 5: 'I-ORGpart', 6: 'B-OTHderiv', 7: 'I-OTHpart', 8: 'I-PERderiv', 9: 'O', 10: 'B-PER', 11: 'B-PERderiv', 12: 'B-LOC', 13: 'I-PERpart', 14: 'B-ORGpart', 15: 'I-ORG', 16: 'B-LOCpart', 17: 'I-LOCpart', 18: 'B-ORG', 19: 'I-PER', 20: 'I-OTHderiv', 21: 'B-LOCderiv', 22: 'I-LOCderiv', 23: 'B-PERpart', 24: 'B-OTHpart', 25: 'B-OTH'}
{'I-LOC': 1, 'B-ORGderiv': 2, 'I-ORG': 15, 'B-LOCpart': 16, 'B-ORGpart': 14, 'I-ORGderiv': 3, 'B-LOCderiv': 21, 'I-LOCpart': 17, 'B-ORG': 18, 'I-ORGpart': 5, 'B-PERpart': 23, 'B-OTHderiv': 6, 'I-OTHpart': 7, 'I-PERderiv': 8, 'O': 9, 'I-LOCderiv': 22, 'I-PER': 19, 'PADDING_TOKEN': 0, 'B-PER': 10, 'I-OTHderiv': 20, 'B-PERderiv': 11, 'I-OTH': 4, 'B-OTH': 25, 'B-LOC': 12, 'I-PERpart': 13, 'B-OTHpart': 24}
{'Å': 5, '[': 121, 'Ü': 6, 'к': 7, 'e': 8, 'ю': 248, 'П': 234, 'L': 9, '±': 10, 'µ': 11, 'ø': 12, 'г': 19, 'l': 17, '-': 18, 'n': 20, '8': 279, '³': 22, 'ő': 200, 'q': 23, 'Þ': 25, 'ρ': 175,

In [78]:
# One-hot casing vectors: a float32 identity matrix with one row per entry in case2Idx.
caseEmbeddings = np.identity(len(models.case2Idx), dtype='float32')

In [97]:
# Load the fastText model once and bind it to both names in use:
# the notebook-level `ft` (read below and inside `generator`) and `models.ft`.
# Previously only `models.ft` was assigned, so `ft` raised NameError on a fresh kernel.
ft = fastText.load_model("../embeddings/wiki.de.bin")
models.ft = ft

# Dimensionality of the fastText word vectors; get_model() uses it as the width of the word input.
nb_embedding_dims = ft.get_dimension()

In [90]:
def createBatches(dataset):
    """Group sentences of equal length into batches.

    Prints the number of distinct sentence lengths, then returns one batch per
    distinct length. Within a batch, sentences keep their order from
    ``dataset``.

    Args:
        dataset: re-iterable collection of sized items (sentences).

    Returns:
        A list of batches; each batch is a list of the sentences from
        ``dataset`` that share one length. Batches follow the iteration order
        of the set of distinct lengths.
    """
    lengths = [len(sentence) for sentence in dataset]
    distinct_lengths = set(lengths)
    print(len(distinct_lengths))

    # Single pass over the data instead of rescanning it once per length.
    groups = {length: [] for length in distinct_lengths}
    for sentence, length in zip(dataset, lengths):
        groups[length].append(sentence)

    return [groups[length] for length in distinct_lengths]

In [91]:
# Group the test sentences into equal-length batches; createBatches prints the number of distinct lengths.
test_batches = createBatches(testSentences)

47


In [92]:
def generator(batches: 'list of training/dev sentences- batches already created'):
    """Endlessly yield model inputs and one-hot labels, one batch per step.

    Cycles over ``batches`` forever, so callers must stop consuming after the
    number of steps they need. Reads ``models.ft``, ``models.case2Idx``,
    ``models.char2Idx``, ``models.label2Idx`` and ``models.idx2Label``.

    Args:
        batches: list of batches; each batch is a list of sentences and each
            sentence is a list of ``(word, label)`` pairs. Sentences in one
            batch are presumably of equal length (as produced by
            ``createBatches``) so the stacked arrays are rectangular — confirm
            for other callers.

    Yields:
        ``([word_vectors, casing_ids, char_ids], one_hot_labels)``, each
        converted with ``np.array``.
    """
    # Number of character positions per word fed to the model.
    max_word_chars = 52

    while True:
        for batch in batches:
            word_embeddings = []
            case_embeddings = []
            char_embeddings = []
            output_labels = []

            for sentence in batch:
                temp_casing = []
                temp_char = []
                temp_word = []
                temp_output = []

                for word, label in sentence:
                    temp_casing.append(utils.getCasing(word, models.case2Idx))

                    char2idx = models.char2Idx
                    # Characters outside the vocabulary fall back to 'UNKNOWN'.
                    temp_char.append(np.array([
                        char2idx[char] if char in char2idx else char2idx['UNKNOWN']
                        for char in word
                    ]))

                    # Look the word up as written (no lower-casing). The model
                    # is read from models.ft, the name the notebook assigns,
                    # instead of a bare global `ft`.
                    temp_word.append(models.ft.get_word_vector(word))
                    temp_output.append(models.label2Idx[label])

                # NOTE(review): pad_sequences is called without padding='post',
                # unlike padding() elsewhere in this notebook; left unchanged
                # because the saved weights were presumably trained this way — confirm.
                temp_char = pad_sequences(temp_char, max_word_chars)
                word_embeddings.append(temp_word)
                case_embeddings.append(temp_casing)
                char_embeddings.append(temp_char)
                # NOTE(review): the one-hot width is len(idx2Label) + 1, one more
                # than the len(label2Idx) output units built in get_model — confirm
                # this is intended before using the labels for training.
                temp_output = to_categorical(temp_output, len(models.idx2Label) + 1)
                output_labels.append(temp_output)

            yield ([np.array(word_embeddings), np.array(case_embeddings), np.array(char_embeddings)], np.array(output_labels))

def get_label_from_categorical(a):
    """Return the arg-max index of every row of ``a``.

    Args:
        a: iterable of numpy arrays (for example the rows of a 2-D array of
            per-class scores or one-hot vectors).

    Returns:
        A list holding, for each row, the index of its largest value.
    """
    return [np.argmax(np.ndarray.tolist(row)) for row in a]

def predict_batches(batch):
    """Run the model over every batch once and collect label ids.

    Reads the notebook-level ``model`` and uses ``generator`` to build the
    inputs.

    Args:
        batch: list of batches, as accepted by ``generator``.

    Returns:
        ``(true_labels, pred_labels)``: two lists with one entry per sentence,
        each entry being the list of label ids returned by
        ``get_label_from_categorical``. Both are empty when ``batch`` is empty.
    """
    from itertools import islice

    true_labels = []
    pred_labels = []
    # generator() never terminates, so take exactly one pass over the batches.
    # With an empty `batch`, islice never advances the generator, which would
    # otherwise loop forever without yielding.
    for input_data, output_data in islice(generator(batch), len(batch)):
        pred_labels_batch = model.predict(input_data)
        pred_labels.extend(get_label_from_categorical(s) for s in pred_labels_batch)
        true_labels.extend(get_label_from_categorical(s) for s in output_data)
    return (true_labels, pred_labels)

# Path of the weights file; also the target path of the checkpoint callback below.
tmp_model_filename = 'model_lstm_germeval_v2.0.h5'
# Checkpoint callback that saves only the best model, monitoring 'val_acc'.
# NOTE(review): this callback is not passed to fit_generator in the visible cells — confirm it is still needed.
checkpoint = ModelCheckpoint(tmp_model_filename, verbose=1, save_best_only = True, monitor = 'val_acc')

In [93]:
# Enumerate CUDA devices in PCI bus order and expose only device index 1 to this kernel.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [94]:
def get_model():
    """Build, compile and summarise the tagging model, then return it.

    The model takes three inputs (word vectors, casing ids, character ids),
    concatenates the word vectors, a frozen casing embedding and a
    per-word character BiLSTM encoding, and feeds them through a token-level
    BiLSTM, a dense layer and a CRF output layer.

    Reads the notebook-level names ``nb_embedding_dims`` and
    ``nb_char_embeddings`` as well as ``models.case2Idx``,
    ``models.char2Idx`` and ``models.label2Idx``; all must be set before the
    call. Prints the model summary as a side effect.

    Returns:
        The compiled Keras ``Model``.
    """
    # Identity matrix used as fixed (non-trainable) one-hot casing embedding weights.
    caseEmbeddings = np.identity(len(models.case2Idx), dtype='float32')
    # Word input: one pre-computed vector of nb_embedding_dims floats per token.
    words_input = Input(shape=(None, nb_embedding_dims), dtype='float32', name='words_input')
    # Casing input: one integer casing id per token.
    casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
    casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False, name = 'case_embed')(casing_input)
    # Character input: nb_char_embeddings character ids per token.
    character_input=Input(shape=(None,nb_char_embeddings,),name='char_input')
    # 32-dimensional character embedding applied to every token.
    embed_char_out=TimeDistributed(Embedding(len(models.char2Idx),32,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
    # Bidirectional LSTM over the characters of each token.
    char_lstm = TimeDistributed(Bidirectional(LSTM(50, name = 'char_lstm')))(embed_char_out)
    # Combine the three per-token representations.
    output = concatenate([words_input, casing, char_lstm])
    # Token-level bidirectional LSTM that returns one output per token.
    output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.5, name = 'token_lstm'))(output)
    # Per-token projection to len(models.label2Idx) units, followed by the CRF layer.
    output = TimeDistributed(Dense(len(models.label2Idx), name = 'token_dense'))(output)
    crf = CRF(len(models.label2Idx), name = 'crf')
    output = crf(output)
    model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])
    # The loss and the accuracy metric are both supplied by the CRF layer.
    model.compile(loss=crf.loss_function, optimizer='nadam', metrics=[crf.accuracy])
    model.summary()
    return(model)

In [95]:
# Number of character positions per token; get_model() reads this notebook-level name.
nb_char_embeddings = 52
model = get_model()
# Restore previously saved weights from tmp_model_filename.
model.load_weights(tmp_model_filename)
# Callback built from the weights path and the dev sentences; passed to fit_generator in the training cell.
history = utils.F1History(tmp_model_filename, devSet = devSentences)



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 32) 10688       char_input[0][0]                 
__________________________________________________________________________________________________
words_input (InputLayer)        (None, None, 300)    0                                            
__________________________________________________________________________________________________
case_embed

In [98]:
# Train for 10 epochs on utils.NerSequence batches of 256 (training data shuffled),
# validating on the dev sentences; `history` is the only callback.
model.fit_generator(
    utils.NerSequence(trainSentences, shuffle_data=True, batch_size=256), 
    validation_data = utils.NerSequence(devSentences, batch_size=256), 
    epochs = 10, callbacks = [history]
)

Epoch 1/10

KeyboardInterrupt: 

In [88]:
# Predict labels for all test batches.
true_labels, pred_labels = predict_batches(test_batches)
# Flatten the batches so sentences line up by index with the prediction lists.
sentences = [sent for batch in test_batches for sent in batch]
# Print every [word, gold label] pair next to the predicted label name, with a blank line after each sentence.
for i, (true_sentence, pred_sentence) in enumerate(zip(true_labels, pred_labels)):
    for j, (true_word, pred_word) in enumerate(zip(true_sentence, pred_sentence)):
        print((sentences[i][j], models.idx2Label[pred_word]))
        
    print()
# Print the result of compute_f1 over the gold and predicted label ids.
print(compute_f1(true_labels, pred_labels, models.idx2Label))

(['Folge', 'O'], 'O')
(['ausgestrahlt', 'O'], 'O')
(['wird', 'O'], 'O')
(['.', 'O'], 'O')

(['Ergativkonstruktionen', 'O'], 'O')
(['sind', 'O'], 'O')
(['häufig', 'O'], 'O')
(['.', 'O'], 'O')

(['Auch', 'O'], 'O')
(['hier', 'O'], 'O')
(['Widerspruch', 'O'], 'O')
(['.', 'O'], 'O')

(['Und', 'O'], 'O')
(['Tischer', 'B-PER'], 'O')
(['versichert', 'O'], 'O')
(['.', 'O'], 'O')

(['Das', 'O'], 'O')
(['ist', 'O'], 'O')
(['nachprüfbar', 'O'], 'O')
(['.', 'O'], 'O')

(['Nicht', 'O'], 'O')
(['weggeschmissene', 'O'], 'O')
(['Briefe', 'O'], 'O')
(['.', 'O'], 'O')

(['Oder', 'O'], 'O')
(['vier', 'O'], 'O')
(['alle', 'O'], 'O')
(['?', 'O'], 'O')

(['Wir', 'O'], 'O')
(['werden', 'O'], 'O')
(['evakuiert', 'O'], 'O')
(['.', 'O'], 'O')

(['Münchner', 'B-ORG'], 'B-LOCderiv')
(['Neueste', 'I-ORG'], 'O')
(['Nachrichten', 'I-ORG'], 'O')
(['.', 'O'], 'O')

(['Sogar', 'O'], 'O')
(['mit', 'O'], 'O')
(['eigenem', 'O'], 'O')
(['Skilift', 'O'], 'O')
(['.', 'O'], 'O')

(['Das', 'O'], 'O')
(['Ergebnis', 'O'], 'O')
(