In [61]:
import gensim
import numpy as np

from keras.models import Model
from keras.preprocessing import sequence
from keras.layers.wrappers import Bidirectional
from keras.layers import Input, LSTM, Dropout, Embedding

In [62]:
paramopama = "corpus_paramopama+second_harem.txt"

In [63]:
max_seq_len = 0
vocabulary = set()
all_tags = set()

with open(paramopama, 'rt') as f_input:
    sentences_tokens = []
    sentences_tags = []
    tokens = []
    tags = []
    for line in f_input:
        if line=='\n':
            if len(tokens) > max_seq_len:
                max_seq_len = len(tokens)            
            sentences_tokens.append(tokens)
            sentences_tags.append(tags)
            tokens = []
            tags = []            
        else:
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag.strip())
            vocabulary.add(token)
            all_tags.add(tag.strip())

In [64]:
characters = set(char for token in vocabulary for char in token)

In [65]:
# create an index of tokens, adding two special tokens: 'PADDED' and 'UNKNOWN'
PADDED = 0
UNKNOWN = 1

char2idx = {char: idx for idx, char in enumerate(characters)}
idx2char = {idx: char for char, idx in char2idx.items()}

word2idx = {word: i + 2 for i, word in enumerate(vocabulary, 0)}
word2idx["PADDED"] = PADDED
word2idx["UNKNOWN"] = UNKNOWN
idx2word = {value: key for key, value in word2idx.items()} 

tag2idx={}
tag2idx["PADDED"] = PADDED
tag2idx.update({tag: i + 1 for i, tag in enumerate(all_tags, 0)})
idx2tag = {value: key for key, value in tag2idx.items()}

## Load Embeddings

In [66]:
fname = "publico_vectors_non-breaking-spaces.bin"
embeddings = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
embeddings.vector_size

200

In [67]:
# build a word embeddings matrix, out of vocabulary words will be initialized randomly
embedding_matrix = np.random.random((len(embeddings.index2word), embeddings.vector_size))
not_found = 0

for word, i in word2idx.items():
    if word in embeddings.vocab:
        embedding_vector = embeddings[word]
        embedding_matrix[i] = embedding_vector
    else:
        not_found += 1

print('{} of {} tokens randomly initialized'.format(not_found, len(embeddings.index2word)))

15196 of 219475 tokens randomly initialized


In [68]:
vectors_tokens = []
for sentence in sentences_tokens:
    vector = []
    for word in sentence:
        vector.append(word2idx[word])
    vectors_tokens.append(vector)

In [69]:
len(vectors_tokens)

16275

In [70]:
vectors_tags = []
for sentence in sentences_tags:
    vector = []
    for tag in sentence:
        vector.append(tag2idx[tag])
    vectors_tags.append(vector)

In [71]:
len(vectors_tags)

16275

In [72]:
def token2char(token):
    word_char_idx = []
    for char in token:
        word_char_idx.append(char2idx[char])
    return np.array(word_char_idx)

In [73]:
def pad_nested_sequences(sequences, dtype='int32'):
    """Pads nested sequences to the same length.
    This function transforms a list of list sequences
    into a 3D Numpy array of shape `(num_samples, max_sent_len, max_word_len)`.
    Args:
        sequences: List of lists of lists.
        dtype: Type of the output sequences.
    # Returns
        x: Numpy array.
    """
    max_sent_len = 0
    max_word_len = 0
    for sent in sequences:
        max_sent_len = max(len(sent), max_sent_len)
        for word in sent:
            max_word_len = max(len(word), max_word_len)

    x = np.zeros((len(sequences), max_sent_len, max_word_len)).astype(dtype)
    for i, sent in enumerate(sequences):
        for j, word in enumerate(sent):
            x[i, j, :len(word)] = word

    return x


In [74]:
MAX_LENGTH = max_seq_len  # max([len(sentence) for sentence in sent_text])

In [75]:
text_padded = sequence.pad_sequences(vectors_tokens, padding='post', maxlen=MAX_LENGTH, truncating='post')
tags_padded = sequence.pad_sequences(vectors_tags, padding='post', maxlen=MAX_LENGTH, truncating='post')

In [76]:
char_ids = [[token2char(idx2word[token_idx]) for token_idx in doc] for doc in vectors_tokens]
char_vectors_padded = pad_nested_sequences(char_ids)    

In [77]:
features = [np.array(text_padded), char_ids]

# LSTM char embdding + word embeddings + CRF model

In [78]:
use_char = True
use_crf = True

char_vocab_size = 10
char_embedding_dim = 25
char_lstm_size = 25
dropout = 0.3

word_lstm_size = 50

fc_dim = 100 # output fully-connected layer size
num_labels = len(tag2idx.keys())

In [79]:
from keras.utils import to_categorical

In [80]:
len(tag2idx.keys())
y = to_categorical(tags_padded, len(tag2idx.keys())).astype(int)
y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)

In [81]:
word_vocab_size = len(idx2word.keys())
word_embedding_dim = 100

In [82]:
from keras import metrics
from keras.layers import TimeDistributed, Dense, Input
from keras.layers import concatenate

In [83]:
n_words = len(idx2word.keys())
n_chars = len(idx2char.keys())
max_len = 183
max_len_char = 35
n_tags = len(idx2tag.keys())

## 80% train 20% test

In [84]:
split=0.8
train_test_split = int(len(text_padded)*split)

train_text_padded = np.array(text_padded)[0:train_test_split]
train_char_padded = char_vectors_padded[0:train_test_split]

test_text_padded = np.array(text_padded)[train_test_split:]
test_char_padded = char_vectors_padded[train_test_split:]

y_train = y[0:train_test_split]
y_test = y[train_test_split:]

train_features = [np.array(train_text_padded), train_char_padded]
test_features = [np.array(test_text_padded), test_char_padded]

# Neural Architectures for Named Entity Recognition (Lample et. al 2016)

In [97]:
use_crf = True
use_char = False

In [98]:
from keras.layers import Concatenate
from keras_contrib.layers import CRF

# Word Embeddings
word_ids = Input(batch_shape=(None, None), dtype='int32', name='word_input')
word_embeddings = Embedding(input_dim=n_words, output_dim=10,
                            mask_zero=True, name='word_embedding')(word_ids)

if use_char:
    # Character Embeddings
    char_ids = Input(batch_shape=(None, None, None), dtype='int32', name='char_input')
    char_embeddings = Embedding(input_dim=n_chars, output_dim=5,
                                mask_zero=True, name='char_embedding')(char_ids)

    # Apply a bi-LSTM to each char_embedding
    char_embeddings = TimeDistributed(Bidirectional(LSTM(units=10, return_sequences=False)))(char_embeddings)

    ## Concatenate all the vectors
    final_embeddings = Concatenate()([word_embeddings, char_embeddings])
else:
    final_embeddings = word_embeddings

# Pass it through a final LSTM
embeddings = Dropout(dropout)(final_embeddings)
z = Bidirectional(LSTM(units=word_lstm_size, return_sequences=True))(embeddings)
z = Dense(fc_dim, activation='tanh')(z)

if use_crf:
    crf = CRF(num_labels, learn_mode='join', sparse_target=True)
    loss = crf.loss_function
    pred = crf(z)
    metric=[crf.accuracy]
else:
    loss = 'categorical_crossentropy'
    pred = Dense(num_labels, activation='softmax')(z)
    metric = [metrics.categorical_accuracy]

model = Model([word_ids, char_ids], outputs=pred)
model.compile(optimizer='adam', loss=loss, metrics=metric)

In [100]:
#model.fit(train_features,y_train)

In [25]:
predictions = model.predict(test_features)

NameError: name 'model' is not defined

In [None]:
all_predictions = []
sentence_predictions = []

for sentence in predictions:
    for tag in sentence:
        sentence_predictions.append(idx2tag[np.argmax(tag)])
        all_tags.add(idx2tag[np.argmax(tag)])
    all_predictions.append(sentence_predictions)
    sentence_predictions = []

In [None]:
len(all_predictions)

In [None]:
all_true = []
for sentence in y_test:    
    sent_true = [idx2tag[np.argmax(tag)] for tag in sentence]
    all_true.append(sent_true)

In [None]:
len(all_true)

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report
print(flat_classification_report(all_true, all_predictions))