In [23]:
import gensim
import numpy as np

from keras.models import Model
from keras.preprocessing import sequence
from keras.layers.wrappers import Bidirectional
from keras.layers import Input, LSTM, Dropout, Embedding

In [24]:
paramopama = "/Users/dsbatista/NER-datasets/Portuguese/Paramopama/corpus_paramopama+second_harem.txt"

In [25]:
max_seq_len = 0
vocabulary = set()
all_tags = set()

with open(paramopama, 'rt') as f_input:
    sentences_tokens = []
    sentences_tags = []
    tokens = []
    tags = []
    for line in f_input:
        if line=='\n':
            if len(tokens) > max_seq_len:
                max_seq_len = len(tokens)            
            sentences_tokens.append(tokens)
            sentences_tags.append(tags)
            tokens = []
            tags = []            
        else:
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag.strip())
            vocabulary.add(token)
            all_tags.add(tag.strip())

## generate statistics on the corpus

In [26]:
# TODO

In [27]:
# create an index of tokens, adding two special tokens: 'PADDED' and 'UNKNOWN'
PADDED = 0
UNKNOWN = 1

word2idx = {word: i + 2 for i, word in enumerate(vocabulary, 0)}
word2idx["PADDED"] = PADDED
word2idx["UNKNOWN"] = UNKNOWN
idx2word = {value: key for key, value in word2idx.items()} 

tag2idx={}
tag2idx["PADDED"] = PADDED
tag2idx.update({tag: i + 1 for i, tag in enumerate(all_tags, 0)})
idx2tag = {value: key for key, value in tag2idx.items()}

In [28]:
max_seq_len

183

In [29]:
len(vocabulary)

38530

In [35]:
index=10
for token, tag in zip(sentences_tokens[index], sentences_tags[index]):
    print(token,'\t', tag)

O 	 O
município 	 O
é 	 O
limitado 	 O
a 	 O
norte 	 O
pelo 	 O
município 	 O
de 	 O
Sever 	 LOCAL
do 	 LOCAL
Vouga 	 LOCAL
, 	 O
a 	 O
nordeste 	 O
por 	 O
Oliveira 	 LOCAL
de 	 LOCAL
Frades 	 LOCAL
e 	 O
por 	 O
Vouzela 	 LOCAL
, 	 O
a 	 O
leste 	 O
por 	 O
Tondela 	 LOCAL
, 	 O
a 	 O
sul 	 O
por 	 O
Mortágua 	 LOCAL
e 	 O
por 	 O
Anadia 	 LOCAL
, 	 O
a 	 O
sudoeste 	 O
por 	 O
Oliveira 	 LOCAL
do 	 LOCAL
Bairro 	 LOCAL
, 	 O
a 	 O
oeste 	 O
por 	 O
Aveiro 	 LOCAL
e 	 O
a 	 O
noroeste 	 O
por 	 O
Albergaria-a-Velha 	 LOCAL
. 	 O


## Load Embeddings

In [36]:
fname = "/Users/dsbatista/PycharmProjects/other/embeddings-http-endpoint/publico_vectors_non-breaking-spaces.bin"
embeddings = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)

In [37]:
embeddings.vector_size

200

In [38]:
len(embeddings.index2word)

219475

In [39]:
# build a word embeddings matrix, out of vocabulary words will be initialized randomly
embedding_matrix = np.random.random((len(embeddings.index2word), embeddings.vector_size))
not_found = 0

for word, i in word2idx.items():
    if word in embeddings.vocab:
        embedding_vector = embeddings[word]
        embedding_matrix[i] = embedding_vector
    else:
        not_found += 1

print('{} of {} tokens randomly initialized'.format(not_found, len(embeddings.index2word)))

15196 of 219475 tokens randomly initialized


In [40]:
len(sentences_tags)

16275

In [41]:
vectors_tokens = []
for sentence in sentences_tokens:
    vector = []
    for word in sentence:
        vector.append(word2idx[word])
    vectors_tokens.append(vector)

In [42]:
len(vectors_tokens)

16275

In [43]:
vectors_tags = []
for sentence in sentences_tags:
    vector = []
    for tag in sentence:
        vector.append(tag2idx[tag])
    vectors_tags.append(vector)

In [44]:
len(vectors_tags)

16275

In [45]:
MAX_LENGTH = max_seq_len  # max([len(sentence) for sentence in sent_text])

In [46]:
text_padded = sequence.pad_sequences(vectors_tokens, padding='post', maxlen=MAX_LENGTH, truncating='post')
tags_padded = sequence.pad_sequences(vectors_tags, padding='post', maxlen=MAX_LENGTH, truncating='post')

## check that indexes and padding are correct

In [47]:
index = 1
for token, tag in zip(text_padded[index], tags_padded[index]):
    print(idx2word[token], '\t', idx2tag[tag])

Jana 	 O
Gana 	 O
Mana 	 O
é 	 O
o 	 O
hino 	 O
nacional 	 O
da 	 O
Índia 	 LOCAL
. 	 O
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED
PADDED 	 PADDED


# LSTM+CRF model

In [48]:
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
# from keras_tqdm import TQDMNotebookCallback

# sentence and embeddings
MAX_VOCAB = len(embeddings.index2word)
EMBEDDING_SIZE = 200

# network hyper-parameters
HIDDEN_SIZE = 200  # LSTM Nodes/Features/Dimension
DROPOUTRATE = 0.2
    
# text layers : dense embedding > dropout > bi-LSTM
txt_input = Input(shape=(MAX_LENGTH,), name='txt_input')

txt_embed = Embedding(MAX_VOCAB, EMBEDDING_SIZE, input_length=MAX_LENGTH, weights=[embedding_matrix], 
                      name='txt_embedding', trainable=True, mask_zero=True)(txt_input)

txt_drpot = Dropout(DROPOUTRATE, name='txt_dropout')(txt_embed)

txt_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True), name='txt_bidirectional')(txt_drpot)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True), name='mrg_bidirectional_1')(txt_lstml)

# extra LSTM layer, if wanted
# mrg_drpot = Dropout(DROPOUTRATE, name='mrg_dropout')(mrg_lstml)
# mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
#                           name='mrg_bidirectional_2')(mrg_lstml)
# mrg_outpt = Activation('sigmoid', name='mrg_activation')(mrg_lstml)
# final linear chain CRF layer

dim_output_space = len(idx2tag.keys())

crf = CRF(dim_output_space, sparse_target=True)
mrg_chain = crf(mrg_lstml)
model = Model(inputs=[txt_input], outputs=mrg_chain)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
txt_input (InputLayer)       (None, 183)               0         
_________________________________________________________________
txt_embedding (Embedding)    (None, 183, 200)          43895000  
_________________________________________________________________
txt_dropout (Dropout)        (None, 183, 200)          0         
_________________________________________________________________
txt_bidirectional (Bidirecti (None, 183, 400)          641600    
_________________________________________________________________
mrg_bidirectional_1 (Bidirec (None, 183, 400)          961600    
_________________________________________________________________
crf_2 (CRF)                  (None, 183, 6)            2454      
Total params: 45,500,654
Trainable params: 45,500,654
Non-trainable params: 0
________________________________________________________________

In [49]:
y_train_ner = tags_padded[:, :, np.newaxis]  # reshape data for CRF

In [63]:
y_train_ner[0]

array([[3],
       [3],
       [3],
       [3],
       [3],
       [4],
       [4],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [1],
       [1],
       [1],
       [3],
       [3],
       [3],
       [5],
       [5],
       [3],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [50]:
len(text_padded)

16275

In [51]:
len(y_train_ner)

16275

In [53]:
SPLIT = 1000
BATCH_SIZE = 8
MAX_EPOCHS = 5  # max iterations, early stop condition below

history = model.fit(text_padded[:SPLIT], y_train_ner[:SPLIT], 
                    validation_split=0.2, batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, 
                    verbose=1)
hist_dict = history.history

Train on 800 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [54]:
test_text_padded = text_padded[SPLIT:]
test_y_train_ner = y_train_ner[SPLIT:]

In [55]:
predictions = model.predict(test_text_padded)

In [56]:
all_tags = set()
all_predictions = []
sentence_predictions = []

for sentence in predictions:
    for tag in sentence:
        sentence_predictions.append(idx2tag[np.argmax(tag)])
        all_tags.add(idx2tag[np.argmax(tag)])
    all_predictions.append(sentence_predictions)
    sentence_predictions = []

In [57]:
len(all_predictions)

15275

In [58]:
all_true = []
for sentence_tags in test_y_train_ner:
    all_true.append([idx2tag[tag[0]] for tag in sentence_tags])

In [59]:
len(all_true)

15275

In [60]:
from sklearn_crfsuite.metrics import flat_classification_report
print(flat_classification_report(all_true, all_predictions))

              precision    recall  f1-score   support

       LOCAL       0.56      0.69      0.62     17208
           O       0.95      0.97      0.96    310258
 ORGANIZACAO       0.45      0.10      0.16      8418
      PADDED       1.00      1.00      1.00   2435567
      PESSOA       0.46      0.42      0.44     10573
       TEMPO       0.86      0.63      0.73     13301

   micro avg       0.99      0.99      0.99   2795325
   macro avg       0.71      0.64      0.65   2795325
weighted avg       0.99      0.99      0.99   2795325

