In [None]:
import gensim
import numpy as np

from keras.models import Model
from keras.preprocessing import sequence
from keras.layers.wrappers import Bidirectional
from keras.layers import Input, LSTM, Dropout, Embedding

Using TensorFlow backend.


In [2]:
paramopama = "corpus_paramopama+second_harem.txt"

In [3]:
max_seq_len = 0
vocabulary = set()
all_tags = set()

with open(paramopama, 'rt') as f_input:
    sentences_tokens = []
    sentences_tags = []
    tokens = []
    tags = []
    for line in f_input:
        if line=='\n':
            if len(tokens) > max_seq_len:
                max_seq_len = len(tokens)            
            sentences_tokens.append(tokens)
            sentences_tags.append(tags)
            tokens = []
            tags = []            
        else:
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag.strip())
            vocabulary.add(token)
            all_tags.add(tag.strip())

In [4]:
characters = set(char for token in vocabulary for char in token)

In [5]:
# create an index of tokens, adding two special tokens: 'PADDED' and 'UNKNOWN'
PADDED = 0
UNKNOWN = 1

char2idx = {char: idx for idx, char in enumerate(characters)}
idx2char = {idx: char for char, idx in char2idx.items()}

word2idx = {word: i + 2 for i, word in enumerate(vocabulary, 0)}
word2idx["PADDED"] = PADDED
word2idx["UNKNOWN"] = UNKNOWN
idx2word = {value: key for key, value in word2idx.items()} 

tag2idx={}
tag2idx["PADDED"] = PADDED
tag2idx.update({tag: i + 1 for i, tag in enumerate(all_tags, 0)})
idx2tag = {value: key for key, value in tag2idx.items()}

## Load Embeddings

In [6]:
fname = "publico_vectors_non-breaking-spaces.bin"
embeddings = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
embeddings.vector_size

200

In [7]:
# build a word embeddings matrix, out of vocabulary words will be initialized randomly
embedding_matrix = np.random.random((len(embeddings.index2word), embeddings.vector_size))
not_found = 0

for word, i in word2idx.items():
    if word in embeddings.vocab:
        embedding_vector = embeddings[word]
        embedding_matrix[i] = embedding_vector
    else:
        not_found += 1

print('{} of {} tokens randomly initialized'.format(not_found, len(embeddings.index2word)))

15196 of 219475 tokens randomly initialized


In [8]:
vectors_tokens = []
for sentence in sentences_tokens:
    vector = []
    for word in sentence:
        vector.append(word2idx[word])
    vectors_tokens.append(vector)

In [9]:
len(vectors_tokens)

16275

In [10]:
vectors_tags = []
for sentence in sentences_tags:
    vector = []
    for tag in sentence:
        vector.append(tag2idx[tag])
    vectors_tags.append(vector)

In [11]:
len(vectors_tags)

16275

In [12]:
def token2char(token):
    word_char_idx = []
    for char in token:
        word_char_idx.append(char2idx[char])
    return np.array(word_char_idx)

In [13]:
def pad_nested_sequences(sequences, dtype='int32'):
    """Pads nested sequences to the same length.
    This function transforms a list of list sequences
    into a 3D Numpy array of shape `(num_samples, max_sent_len, max_word_len)`.
    Args:
        sequences: List of lists of lists.
        dtype: Type of the output sequences.
    # Returns
        x: Numpy array.
    """
    max_sent_len = 0
    max_word_len = 0
    for sent in sequences:
        max_sent_len = max(len(sent), max_sent_len)
        for word in sent:
            max_word_len = max(len(word), max_word_len)

    x = np.zeros((len(sequences), max_sent_len, max_word_len)).astype(dtype)
    for i, sent in enumerate(sequences):
        for j, word in enumerate(sent):
            x[i, j, :len(word)] = word

    return x


In [14]:
MAX_LENGTH = max_seq_len  # max([len(sentence) for sentence in sent_text])

In [15]:
text_padded = sequence.pad_sequences(vectors_tokens, padding='post', maxlen=MAX_LENGTH, truncating='post')
tags_padded = sequence.pad_sequences(vectors_tags, padding='post', maxlen=MAX_LENGTH, truncating='post')

In [16]:
char_ids = [[token2char(idx2word[token_idx]) for token_idx in doc] for doc in vectors_tokens]
char_vectors_padded = pad_nested_sequences(char_ids)    

In [17]:
features = [np.array(text_padded), char_ids]

In [18]:
use_char = True
use_crf = True

char_vocab_size = 10
char_embedding_dim = 25
char_lstm_size = 25
dropout = 0.3

word_lstm_size = 50

fc_dim = 100 # output fully-connected layer size
num_labels = len(tag2idx.keys())

In [19]:
from keras.utils import to_categorical

In [20]:
len(tag2idx.keys())
y = to_categorical(tags_padded, len(tag2idx.keys())).astype(int)
y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)

In [43]:
idx2tag

{0: 'PADDED', 1: 'ORGANIZACAO', 2: 'TEMPO', 3: 'LOCAL', 4: 'PESSOA', 5: 'O'}

In [44]:
tags_padded[3]

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 5, 5, 1, 5, 5,
       3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [21]:
word_vocab_size = len(idx2word.keys())
word_embedding_dim = 100

In [22]:
from keras import metrics
from keras.layers import TimeDistributed, Dense, Input
from keras.layers import concatenate

In [23]:
n_words = len(idx2word.keys())
n_chars = len(idx2char.keys())
max_len = 183
max_len_char = 35
n_tags = len(idx2tag.keys())

## 80% train 20% test

In [24]:
split=0.8
train_test_split = int(len(text_padded)*split)

train_text_padded = np.array(text_padded)[0:train_test_split]
train_char_padded = char_vectors_padded[0:train_test_split]

test_text_padded = np.array(text_padded)[train_test_split:]
test_char_padded = char_vectors_padded[train_test_split:]

y_train = y[0:train_test_split]
y_test = y[train_test_split:]

train_features = [np.array(train_text_padded), train_char_padded]
test_features = [np.array(test_text_padded), test_char_padded]

In [28]:
tag2idx

{'LOCAL': 3, 'O': 5, 'ORGANIZACAO': 1, 'PADDED': 0, 'PESSOA': 4, 'TEMPO': 2}

In [45]:
len(y)

16275

In [46]:
len(tags_padded)

16275

In [96]:
y_train = y[0:train_test_split]

In [97]:
y_train.shape

(13020, 183, 6)

In [105]:
x = Input(shape=(max_len,))
embeddings = Embedding(input_dim=n_words + 1, output_dim=20, 
                       input_length=max_len, mask_zero=True)(x)

lstm_out = LSTM(units=25, return_sequences=True, recurrent_dropout=0.1)(embeddings)  # variational biLSTM
pred = Dense(6, activation="softmax")(lstm_out)  # a dense layer as suggested by neuralNer
model = Model(x, pred)

model.compile(optimizer="rmsprop", loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 183)               0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 183, 20)           770660    
_________________________________________________________________
lstm_8 (LSTM)                (None, 183, 25)           4600      
_________________________________________________________________
dense_8 (Dense)              (None, 183, 6)            156       
Total params: 775,416
Trainable params: 775,416
Non-trainable params: 0
_________________________________________________________________


In [106]:
history = model.fit(train_features[0], y_train)

Epoch 1/1


In [107]:
predictions = model.predict(test_features[0])

In [108]:
predictions.shape

(3255, 183, 6)

In [110]:
all_predictions = []
sentence_predictions = []

for sentence in predictions:
    for tag in sentence:
        sentence_predictions.append(idx2tag[np.argmax(tag)])
        all_tags.add(idx2tag[np.argmax(tag)])
    all_predictions.append(sentence_predictions)
    sentence_predictions = []

In [111]:
len(all_predictions)

3255

In [112]:
all_true = []
for sentence in y_test:    
    sent_true = [idx2tag[np.argmax(tag)] for tag in sentence]
    all_true.append(sent_true)

In [113]:
len(all_true)

3255

In [114]:
from sklearn_crfsuite.metrics import flat_classification_report
print(flat_classification_report(all_true, all_predictions))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       LOCAL       0.26      0.40      0.32      1485
           O       0.09      0.99      0.17     54591
 ORGANIZACAO       0.00      0.00      0.00      1213
      PADDED       0.00      0.00      0.00    532567
      PESSOA       1.00      0.00      0.00      3164
       TEMPO       0.42      0.01      0.01      2645

   micro avg       0.09      0.09      0.09    595665
   macro avg       0.30      0.23      0.08    595665
weighted avg       0.02      0.09      0.02    595665

