# Load Paramopama + HAREM II datasets

In [1]:
import numpy as np
from collections import defaultdict

sentences = []
tags = []
max_len_sentence = 0

tags_freq = defaultdict(int)
tokens_freq = defaultdict(int)

current_sentence_tokens = []
current_sentence_tags = []

with open("/Users/dsbatista/NER-datasets/Portuguese/Paramopama/corpus_paramopama+second_harem.txt", 'rt') as f_in:
    for line in f_in:
        if line != '\n':
            token, tag = line.split('\t')
            tags_freq[tag.strip()] += 1
            tokens_freq[token.strip()] += 1
            current_sentence_tokens.append(token)
            current_sentence_tags.append(tag.strip())
            if len(current_sentence_tokens)> max_len_sentence:
                max_len_sentence = len(current_sentence_tokens)
        else:
            sentences.append(current_sentence_tokens)
            tags.append(current_sentence_tags)
            current_sentence_tokens = []
            current_sentence_tags = []

In [2]:
len(sentences)

16275

In [3]:
tags_freq

defaultdict(int,
            {'LOCAL': 19326,
             'O': 333374,
             'ORGANIZACAO': 8747,
             'PESSOA': 11274,
             'TEMPO': 14079})

## _TODO: convert the tag schema to BIO_

In [4]:
def convert_schema(tags, schema='BIO'):
    print(tags)
    # TODO

## build a char-level embeddings

In [5]:
len(tokens_freq)

38530

In [6]:
char_matrix = defaultdict(np.array)
char_matrix['PADDING'] = np.random.uniform(-0.5,0.5,25)
char_matrix['UNKNOWN'] = np.random.uniform(-0.5,0.5,25)

In [7]:
for token in tokens_freq:
    for char in token:
        char_matrix[char] = np.random.uniform(-0.5,0.5,25)

In [8]:
token2idx = {token: idx for idx, token in enumerate(list(tokens_freq.keys()))}
idx2token = {idx: token for token, idx in token2idx.items()}

In [9]:
from keras.preprocessing.sequence import pad_sequences

max_token_lenght = max([len(token) for token in token2idx.keys()])

all_tokens = []
for token in list(token2idx.keys()):
    tmp = []
    for char in token:
        tmp.append(char_matrix[char])
    all_tokens.append(tmp)

Using TensorFlow backend.


In [10]:
len(all_tokens)

38530

In [11]:
len(tokens_freq)

38530

In [12]:
all_tokens_padded = pad_sequences(all_tokens, maxlen=max_token_lenght, 
                                  dtype='float32', padding='post', truncating='post', value=0.0)

In [13]:
token2idx['disse']

5362

In [14]:
from collections import defaultdict
from functools import partial

word_embeddings = defaultdict(partial(np.ndarray, 0))

## Load an word-embedding layer (_TODO: portuguese embeddings_)

In [15]:
import os

def load_fasttext_embeddings():
    glove_dir = '/Users/dsbatista/resources/glove.6B'
    embeddings_index = {}
    f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


def create_embeddings_matrix(embeddings_index, vocabulary, embedding_dim=100):
    embeddings_matrix = np.random.rand(len(vocabulary)+1, embedding_dim)
    for i, word in enumerate(vocabulary):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector
    print('Matrix shape: {}'.format(embeddings_matrix.shape))
    return embeddings_matrix


def get_embeddings_layer(embeddings_matrix, name, max_len, trainable=False):
    embedding_layer = Embedding(
        input_dim=embeddings_matrix.shape[0],
        output_dim=embeddings_matrix.shape[1],
        input_length=max_len,
        weights=[embeddings_matrix],
        trainable=trainable,
        name=name)
    return embedding_layer


In [16]:
from keras.layers import LSTM, concatenate
from keras.layers import Input, Dense, Dropout, TimeDistributed
from keras.models import Model

In [17]:
embeddings_idx = load_fasttext_embeddings()

Loaded 400000 word vectors.


In [18]:
embeddings_matrix = create_embeddings_matrix(embeddings_idx, vocabulary=token2idx.keys())

Matrix shape: (38531, 100)


In [19]:
word_embeddings_layer = get_embeddings_layer(embeddings_matrix, 'word_embeddings', max_len_sentence)

NameError: name 'Embedding' is not defined

In [None]:
word_vocab_size = len(token2idx.keys())
word_embedding_dim=100
embeddings = embeddings_matrix

use_char = True
char_vocab_size=len(char_matrix.keys())
char_embedding_dim=25
char_lstm_size=25

dropout=0.5
word_lstm_size=50

use_crf = True

fc_dim=100
num_labels = len(set([tag for sent_tags in tags for tag in sent_tags]))

# biLSTM-CRF model

In [None]:
word_ids = Input(batch_shape=(None, None), dtype='int32', name='word_input')
inputs = [word_ids]

# word embeddings
if embeddings is None:
    word_embeddings = Embedding(input_dim=word_vocab_size,
                                output_dim=word_embedding_dim,
                                mask_zero=True,
                                name='word_embedding')(word_ids)
else:
    word_embeddings = Embedding(input_dim=embeddings.shape[0],
                                output_dim=embeddings.shape[1],
                                mask_zero=True,
                                weights=[embeddings],
                                name='word_embedding')(word_ids)

# build character based word embedding
if use_char:
    char_ids = Input(batch_shape=(None, None, None), dtype='int32', name='char_input')
    inputs.append(char_ids)
    char_embeddings = Embedding(input_dim=char_vocab_size,
                                output_dim=char_embedding_dim,
                                mask_zero=True,
                                name='char_embedding')(char_ids)
    char_embeddings = TimeDistributed(Bidirectional(LSTM(char_lstm_size)))(char_embeddings)
    
    print(inputs)
    print(word_embeddings.shape)
    print(char_embeddings.shape)
    
    word_embeddings = Concatenate()([word_embeddings, char_embeddings])

word_embeddings = Dropout(dropout)(word_embeddings)
z = Bidirectional(LSTM(units=word_lstm_size, return_sequences=True))(word_embeddings)
z = Dense(fc_dim, activation='tanh')(z)

if use_crf:
    crf = CRF(num_labels, sparse_target=False)
    loss = crf.loss_function
    pred = crf(z)
else:
    loss = 'categorical_crossentropy'
    pred = Dense(num_labels, activation='softmax')(z)

model = Model(inputs=inputs, outputs=pred)

## encode tags

In [None]:
tag2idx = {}
for idx, tag in enumerate(tags_freq.keys()):
    tag2idx[tag] = idx
idx2tag = {idx : tag for tag, idx in tag2idx.items()}

## build a word-level embeddings and pad sentences

In [None]:
sentence_tokens_embeddings = []
tags_encoded = []

for sentence, sent_tags in zip(sentences, tags):
    sent_embds = []
    sent_tags_enc = []
    for word, tag in zip(sentence, sent_tags):
        if word in embeddings_idx:
            embedding = embeddings_idx[word]
        else:
            embedding = np.random.rand(100)
        sent_embds.append(embedding)
        sent_tags_enc.append(tag2idx[tag])
    sentence_tokens_embeddings.append(sent_embds)
    tags_encoded.append(sent_tags_enc)

In [None]:
len(sentence_tokens_embeddings)

In [None]:
len(tags_encoded)

In [None]:
padded_sentences_encoded = pad_sequences(sentence_tokens_embeddings)

In [None]:
len(padded_sentences_encoded)

In [None]:
model.fit(padded_sentences_encoded)