In [1]:
import gensim
import numpy as np

from keras.models import Model
from keras.preprocessing import sequence
from keras.layers.wrappers import Bidirectional
from keras.layers import Input, LSTM, Dropout, Embedding

Using TensorFlow backend.


In [2]:
paramopama = "corpus_paramopama+second_harem.txt"

In [3]:
max_seq_len = 0
vocabulary = set()
all_tags = set()

with open(paramopama, 'rt') as f_input:
    sentences_tokens = []
    sentences_tags = []
    tokens = []
    tags = []
    for line in f_input:
        if line=='\n':
            if len(tokens) > max_seq_len:
                max_seq_len = len(tokens)            
            sentences_tokens.append(tokens)
            sentences_tags.append(tags)
            tokens = []
            tags = []            
        else:
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag.strip())
            vocabulary.add(token)
            all_tags.add(tag.strip())

In [4]:
characters = set(char for token in vocabulary for char in token)

In [5]:
# create an index of tokens, adding two special tokens: 'PADDED' and 'UNKNOWN'
PADDED = 0
UNKNOWN = 1

char2idx = {char: idx for idx, char in enumerate(characters)}
idx2char = {idx: char for char, idx in char2idx.items()}

word2idx = {word: i + 2 for i, word in enumerate(vocabulary, 0)}
word2idx["PADDED"] = PADDED
word2idx["UNKNOWN"] = UNKNOWN
idx2word = {value: key for key, value in word2idx.items()} 

tag2idx={}
tag2idx["PADDED"] = PADDED
tag2idx.update({tag: i + 1 for i, tag in enumerate(all_tags, 0)})
idx2tag = {value: key for key, value in tag2idx.items()}

## Load Embeddings

In [6]:
fname = "publico_vectors_non-breaking-spaces.bin"
embeddings = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
embeddings.vector_size

200

In [7]:
# build a word embeddings matrix, out of vocabulary words will be initialized randomly
embedding_matrix = np.random.random((len(embeddings.index2word), embeddings.vector_size))
not_found = 0

for word, i in word2idx.items():
    if word in embeddings.vocab:
        embedding_vector = embeddings[word]
        embedding_matrix[i] = embedding_vector
    else:
        not_found += 1

print('{} of {} tokens randomly initialized'.format(not_found, len(embeddings.index2word)))

15196 of 219475 tokens randomly initialized


In [8]:
vectors_tokens = []
for sentence in sentences_tokens:
    vector = []
    for word in sentence:
        vector.append(word2idx[word])
    vectors_tokens.append(vector)

In [9]:
len(vectors_tokens)

16275

In [10]:
vectors_tags = []
for sentence in sentences_tags:
    vector = []
    for tag in sentence:
        vector.append(tag2idx[tag])
    vectors_tags.append(vector)

In [11]:
len(vectors_tags)

16275

In [12]:
def token2char(token):
    word_char_idx = []
    for char in token:
        word_char_idx.append(char2idx[char])
    return np.array(word_char_idx)

In [13]:
def pad_nested_sequences(sequences, dtype='int32'):
    """Pads nested sequences to the same length.
    This function transforms a list of list sequences
    into a 3D Numpy array of shape `(num_samples, max_sent_len, max_word_len)`.
    Args:
        sequences: List of lists of lists.
        dtype: Type of the output sequences.
    # Returns
        x: Numpy array.
    """
    max_sent_len = 0
    max_word_len = 0
    for sent in sequences:
        max_sent_len = max(len(sent), max_sent_len)
        for word in sent:
            max_word_len = max(len(word), max_word_len)

    x = np.zeros((len(sequences), max_sent_len, max_word_len)).astype(dtype)
    for i, sent in enumerate(sequences):
        for j, word in enumerate(sent):
            x[i, j, :len(word)] = word

    return x


In [14]:
MAX_LENGTH = max_seq_len  # max([len(sentence) for sentence in sent_text])

In [15]:
text_padded = sequence.pad_sequences(vectors_tokens, padding='post', maxlen=MAX_LENGTH, truncating='post')
tags_padded = sequence.pad_sequences(vectors_tags, padding='post', maxlen=MAX_LENGTH, truncating='post')

In [16]:
char_ids = [[token2char(idx2word[token_idx]) for token_idx in doc] for doc in vectors_tokens]
char_vectors_padded = pad_nested_sequences(char_ids)    

In [17]:
features = [np.array(text_padded), char_ids]

In [18]:
use_char = True
use_crf = True

char_vocab_size = 10
char_embedding_dim = 25
char_lstm_size = 25
dropout = 0.3

word_lstm_size = 50

fc_dim = 100 # output fully-connected layer size
num_labels = len(tag2idx.keys())

In [19]:
from keras.utils import to_categorical

In [20]:
len(tag2idx.keys())
y = to_categorical(tags_padded, len(tag2idx.keys())).astype(int)
y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)

In [21]:
idx2tag

{0: 'PADDED', 1: 'PESSOA', 2: 'O', 3: 'TEMPO', 4: 'LOCAL', 5: 'ORGANIZACAO'}

In [22]:
tags_padded[3]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 5, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [23]:
word_vocab_size = len(idx2word.keys())
word_embedding_dim = 100

In [24]:
from keras import metrics
from keras.layers import TimeDistributed, Dense, Input
from keras.layers import concatenate

In [25]:
n_words = len(idx2word.keys())
n_chars = len(idx2char.keys())
max_len = 183
max_len_char = 35
n_tags = len(idx2tag.keys())

## 80% train 20% test

In [117]:
split=0.33
train_test_split = int(len(text_padded)*split)

train_text_padded = np.array(text_padded)[0:train_test_split]
train_char_padded = char_vectors_padded[0:train_test_split]

test_text_padded = np.array(text_padded)[train_test_split:]
test_char_padded = char_vectors_padded[train_test_split:]

y_train = y[0:train_test_split]
y_test = y[train_test_split:]

train_features = [np.array(train_text_padded), train_char_padded]
test_features = [np.array(test_text_padded), test_char_padded]

In [118]:
tag2idx

{'LOCAL': 4, 'O': 2, 'ORGANIZACAO': 5, 'PADDED': 0, 'PESSOA': 1, 'TEMPO': 3}

In [119]:
len(y)

16275

In [120]:
len(tags_padded)

16275

In [121]:
y_train = y[0:train_test_split]

In [122]:
y_train.shape

(5370, 183, 6)

In [123]:
from keras_contrib.layers import CRF

crf = CRF(6, learn_mode='join', sparse_target=False)
loss = crf.loss_function
metric=[crf.accuracy]

x = Input(shape=(max_len,))
embeddings = Embedding(input_dim=n_words + 1, output_dim=200, 
                       input_length=max_len, 
                       mask_zero=True, 
                       trainable=True)(x)

lstm_out = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.1), 
                         merge_mode='concat')(embeddings)
pred = Dense(6, activation="softmax")(lstm_out)
out = crf(pred)
model = Model(x, out)

model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        (None, 183)               0         
_________________________________________________________________
embedding_18 (Embedding)     (None, 183, 200)          7706600   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 183, 256)          336896    
_________________________________________________________________
dense_17 (Dense)             (None, 183, 6)            1542      
_________________________________________________________________
crf_19 (CRF)                 (None, 183, 6)            90        
Total params: 8,045,128
Trainable params: 8,045,128
Non-trainable params: 0
_________________________________________________________________


In [124]:
history = model.fit(train_features[0], y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [125]:
predictions = model.predict(train_features[0])

In [127]:
all_predictions = []
sentence_predictions = []

for sentence in predictions:
    for tag in sentence:
        sentence_predictions.append(idx2tag[np.argmax(tag)])
        all_tags.add(idx2tag[np.argmax(tag)])
    all_predictions.append(sentence_predictions)
    sentence_predictions = []

In [128]:
len(all_predictions)

5370

# evaluate agains training data: (as a test, even with low training should produce positive results)

In [129]:
all_true = []
for sentence in y_train:    
    sent_true = [idx2tag[np.argmax(tag)] for tag in sentence]
    all_true.append(sent_true)

In [130]:
len(all_true)

5370

In [106]:
from sklearn_crfsuite.metrics import flat_classification_report
print(flat_classification_report(all_true, all_predictions))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       LOCAL       0.00      0.00      0.00     10403
           O       0.85      1.00      0.92    116298
 ORGANIZACAO       0.00      0.00      0.00      2357
      PADDED       1.00      1.00      1.00    845960
      PESSOA       0.00      0.00      0.00      2458
       TEMPO       0.88      0.08      0.15      5234

   micro avg       0.98      0.98      0.98    982710
   macro avg       0.46      0.35      0.35    982710
weighted avg       0.97      0.98      0.97    982710



In [88]:
from sklearn_crfsuite.metrics import flat_classification_report
print(flat_classification_report(all_true, all_predictions))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       LOCAL       0.00      0.00      0.00      5751
           O       0.86      1.00      0.93     71129
 ORGANIZACAO       0.00      0.00      0.00      1499
      PADDED       1.00      1.00      1.00    512954
      PESSOA       0.00      0.00      0.00      1541
       TEMPO       0.86      0.05      0.09      2791

   micro avg       0.98      0.98      0.98    595665
   macro avg       0.45      0.34      0.34    595665
weighted avg       0.97      0.98      0.97    595665



In [80]:
from sklearn_crfsuite.metrics import flat_classification_report
print(flat_classification_report(all_true, all_predictions))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       LOCAL       0.00      0.00      0.00      5751
           O       0.86      1.00      0.93     71129
 ORGANIZACAO       0.00      0.00      0.00      1499
      PADDED       1.00      1.00      1.00    512954
      PESSOA       0.00      0.00      0.00      1541
       TEMPO       0.91      0.03      0.06      2791

   micro avg       0.98      0.98      0.98    595665
   macro avg       0.46      0.34      0.33    595665
weighted avg       0.97      0.98      0.97    595665

