# A Sequential Network for Named Entity Recognition

Author: Pierre Nugues


In the lab on named entity recognition, we used the words to predict the named entities. This is a simplified version of it.

## The modules

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM
from keras import Input
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

# Reading the Corpus

In [None]:
# Root folder of the CoNLL 2003 corpus; set `vilde` to True to use the
# /home/pierre location instead of the /Users/pierre one.
vilde = False
BASE_DIR = ('/home/pierre/Cours/EDAN20/corpus/CoNLL2003/' if vilde
            else '/Users/pierre/Projets/Corpora/CoNLL2003/')


def load_conll2003_en():
    """
    Read the English CoNLL 2003 training, validation and test files.

    The files are looked up under ``BASE_DIR + 'NER-data/'``.

    :return: a tuple ``(train, dev, test, column_names)`` where the first
        three items are the whitespace-stripped text of the corresponding
        files and ``column_names`` is the list of column names
    """
    column_names = ['form', 'ppos', 'pchunk', 'ner']

    def read_stripped(file_name):
        # The context manager closes the file handle as soon as the
        # content has been read, instead of leaving it open.
        with open(BASE_DIR + 'NER-data/' + file_name) as corpus_file:
            return corpus_file.read().strip()

    train_sentences = read_stripped('eng.train')
    dev_sentences = read_stripped('eng.valid')
    test_sentences = read_stripped('eng.test')
    return train_sentences, dev_sentences, test_sentences, column_names

### The dictorizer that transforms the CoNLL files into dictionaries

In [None]:
import regex as re

class Token(dict):
    """Dictionary holding the column values of one corpus row."""


class CoNLLDictorizer:
    """
    Convert a column-formatted corpus into lists of Token dictionaries.

    ``sent_sep`` and ``col_sep`` are regular expressions used to split
    the corpus into sentences and each row into columns.
    """

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        """
        :param column_names: names given, in order, to the columns of a row
        :param sent_sep: regular expression separating two sentences
        :param col_sep: regular expression separating two columns
        """
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self):
        """Do nothing: there is nothing to learn from the data."""
        return None

    def transform(self, corpus):
        """
        Split the corpus text into sentences and the sentences into tokens.

        :param corpus: the whole corpus as one string
        :return: a list of sentences, each a list of Token objects
        """
        blocks = re.split(self.sent_sep, corpus.strip())
        return [self._split_in_words(block) for block in blocks]

    def fit_transform(self, corpus):
        """Return the same result as ``transform(corpus)``."""
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        """
        Turn the rows of one sentence into Token objects.

        :param sentence: the text of one sentence, one row per line
        :return: a list with one Token per row
        """
        tokens = []
        for row in re.split('\n', sentence):
            fields = re.split(self.col_sep, row)
            # zip stops at the shorter of the two sequences, so extra
            # fields or missing columns are silently dropped
            tokens.append(Token(zip(self.column_names, fields)))
        return tokens

In [None]:
# Load the three splits as raw text, together with the column names
train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()

# Parse the training split into a list of sentences, each a list of Token dicts
conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
train_dict = conll_dict.transform(train_sentences)
# Show the first two parsed sentences
print(train_dict[0])
print(train_dict[1])

## Building the sequences

### The function to build the sequences

In [None]:
def build_sequences(corpus_dict, key_x='form', key_y='pos', tolower=True):
    """
    Extract parallel input and output sequences from a parsed corpus.

    :param corpus_dict: iterable of sentences, each a sequence of
        dictionary-like tokens
    :param key_x: token key whose values form the input sequences
    :param key_y: token key whose values form the output sequences
    :param tolower: if true, the input values are lowercased
    :return: a pair ``(X, Y)`` of lists holding one list per sentence
    """
    X, Y = [], []
    for sentence in corpus_dict:
        inputs = [token[key_x] for token in sentence]
        outputs = [token[key_y] for token in sentence]
        # Only the inputs are lowercased; the outputs are kept verbatim
        X.append([str.lower(value) for value in inputs] if tolower else inputs)
        Y.append(outputs)
    return X, Y

### We build the word and NER tag sequences

In [None]:
# Word forms are the input and NER tags the output; the words are
# lowercased because build_sequences defaults to tolower=True
X_words, Y_ner = build_sequences(train_dict, key_x='form', key_y='ner')
# NOTE(review): the label says 'First sentence' but index 1 is printed —
# presumably index 0 holds a document-start marker; confirm
print('First sentence, words', X_words[1])
print('First sentence, NER', Y_ner[1])

### We now extract the lists of unique words and NER tags

In [None]:
# Sorted lists of the distinct words and of the distinct NER tags
word_set = sorted({word for sentence in X_words for word in sentence})
ner_set = sorted({tag for sentence in Y_ner for tag in sentence})
print(len(word_set))
print(len(ner_set))
ner_set

## Building the indices

In [None]:
# Index -> symbol tables. Numbering starts at 2, which leaves 0 and 1
# unassigned to any word or tag.
idx_word = {index: word for index, word in enumerate(word_set, start=2)}
idx_ner = {index: tag for index, tag in enumerate(ner_set, start=2)}
# Symbol -> index tables, obtained by inverting the tables above
word_idx = dict(zip(idx_word.values(), idx_word.keys()))
ner_idx = dict(zip(idx_ner.values(), idx_ner.keys()))

## Converting the matrices
We convert the matrices into numbers

Before: We have the symbols

In [None]:
# Sentence at index 1 in symbolic form: its words, then its NER tags
print(X_words[1])
Y_ner[1]

In [None]:
# Replace every symbol by its index; a symbol missing from the table is
# mapped to 1
X_words_idx = [[word_idx.get(word, 1) for word in sentence]
               for sentence in X_words]
Y_ner_idx = [[ner_idx.get(tag, 1) for tag in sentence]
             for sentence in Y_ner]

After: We have the indices

In [None]:
# Same sentence as before, now as word indices and NER tag indices
print(X_words_idx[1])
Y_ner_idx[1]

### We pad the sequences

In [None]:
# Pad the index sequences with padding='post'; the inputs and the outputs
# are padded by two separate calls
# NOTE(review): presumably this appends zeros up to the longest sequence of
# each call — confirm against the Keras pad_sequences documentation
X_words_idx = pad_sequences(X_words_idx, padding='post')
Y_ner_idx = pad_sequences(Y_ner_idx, padding='post')

In [None]:
# Sentence at index 1 after padding: word indices, then NER tag indices
print(X_words_idx[1])
Y_ner_idx[1]

### We create one-hot encodings for the output

In [None]:
# One-hot encode the padded tag indices. The number of classes is passed
# explicitly so that the width of the encoding always equals the size of
# the network's output layer (len(ner_set) + 2), instead of being inferred
# from the largest index that happens to be present in the data.
Y_ner_idx_cat = to_categorical(Y_ner_idx, num_classes=len(ner_set) + 2)
# Encoding of the sentence at index 1, then of its first and last positions
print(Y_ner_idx_cat[1])
print(Y_ner_idx_cat[1][0])
Y_ner_idx_cat[1][-1]

## The sequential network

### The word input

In [None]:
# Word indices start at 2, so indices 0 and 1 need two extra entries
text_vocabulary_size = len(word_set) + 2

# Embedding -> LSTM -> softmax; return_sequences=True makes the LSTM emit
# one output per position, so the Dense layer tags every word
model = Sequential()
model.add(Embedding(text_vocabulary_size, 64, mask_zero=True))
model.add(LSTM(32, return_sequences=True))
model.add(Dense(len(ner_set) + 2, activation='softmax'))

### Compiling it 

In [None]:
# Categorical cross-entropy matches the one-hot targets produced by
# to_categorical; accuracy is reported during training
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

### We fit the model

In [None]:
# Train on the padded word indices against the one-hot NER targets
model.fit(X_words_idx, Y_ner_idx_cat,
                  epochs=3, batch_size=128)