Dataset Loader

In [1]:
def read_data(fname):
    data = [[]]
    ners = [[]]
    with open(fname) as f:
        for line in f:
            if line == '\n':
                data.append([])
                ners.append([])
            else:
                word, _, _, label = line.strip().split()
                if word == '-DOCSTART-':
                    data.pop()
                    ners.pop()
                    continue
                else:
                    data[-1].append(word)
                    ners[-1].append(label)
        #print(word, label)
    data.pop()
    ners.pop()
    return data, ners

Load datasets

In [2]:
import pandas as pd
data, ner = read_data('data/eng.train')
train = pd.DataFrame(data={'sentences':data, 'labels':ner})
data, ner = read_data('data/eng.testa')
dev = pd.DataFrame(data={'sentences':data, 'labels':ner})
data, ner = read_data('data/eng.testb')
test = pd.DataFrame(data={'sentences':data, 'labels':ner})

Lowercase words

In [3]:
train['low'] = train['sentences'].map(lambda x: [word.lower() for word in x])
dev['low'] = dev['sentences'].map(lambda x: [word.lower() for word in x])
test['low'] = test['sentences'].map(lambda x: [word.lower() for word in x])

In [4]:
from itertools import chain
from gensim.models import Word2Vec

w2v = Word2Vec.load_word2vec_format('data/aquaint+wiki.txt.gz.ndim=50.bin', binary=True)
dictionary = {'PAD':0, 'UNK':1}

toks = (set(chain.from_iterable(train['low'])) | set(chain.from_iterable(test['low'])) | \
       set(chain.from_iterable(dev['low'])))

i = 2
for _, tok in enumerate(toks):
    if tok in w2v:
        dictionary[tok] = i
        i+=1
len(dictionary)

23836

In [5]:
def word2id(sent):
    return map(lambda x: dictionary.get(x, 1), sent)

train['wids'] = train['low'].map(word2id)
dev['wids'] = dev['low'].map(word2id)
test['wids'] = test['low'].map(word2id)

Case Feats

In [6]:
def case(word):
    if word.isupper():
        return 2
    elif word.istitle():
        return 3
    elif any(char.isupper() for char in word):
        return 4
    else:
        return 1
    
train['cap'] = train['sentences'].map(lambda x: [case(word) for word in x])
dev['cap'] = dev['sentences'].map(lambda x: [case(word) for word in x])
test['cap'] = test['sentences'].map(lambda x: [case(word) for word in x])

Is Number

In [7]:
def numb(word):
    if all(char.isdigit() for char in word):
        return 2
    elif any(char.isdigit() for char in word):
        return 3
    else:
        return 1

train['num'] = train['sentences'].map(lambda x: [numb(word) for word in x])
dev['num'] = dev['sentences'].map(lambda x: [numb(word) for word in x])
test['num'] = test['sentences'].map(lambda x: [numb(word) for word in x])

Prepare labels

In [8]:
classes = set(chain.from_iterable(pd.concat([train['labels'],dev['labels'],test['labels']])))
labels_dict = {'PAD':0}
for idx, lab in enumerate(classes, 1):
    labels_dict[lab] = idx
print(labels_dict)
maxlen = max(len(sent) for sent in pd.concat([train['sentences'],dev['sentences'],test['sentences']]))
print(maxlen)

    
train['lids'] = train['labels'].map(lambda sent: map(lambda w: labels_dict[w], sent))
dev['lids'] = dev['labels'].map(lambda sent: map(lambda w: labels_dict[w], sent))
test['lids'] = test['labels'].map(lambda sent: map(lambda w: labels_dict[w], sent))

{'I-LOC': 1, 'B-ORG': 2, 'I-PER': 4, 'PAD': 0, 'O': 3, 'I-MISC': 5, 'B-MISC': 6, 'I-ORG': 7, 'B-LOC': 8}
124


Pad everything:

In [9]:
from keras.preprocessing.sequence import pad_sequences

train['y_g'] = train['lids'].apply(lambda s: pad_sequences([s], maxlen)[0])
train['X'] = train['wids'].apply(lambda s: pad_sequences([s], maxlen)[0])
train['X_num'] = train['num'].apply(lambda s: pad_sequences([s], maxlen)[0])
train['X_cap'] = train['cap'].apply(lambda s: pad_sequences([s], maxlen)[0])

dev['y_g'] = dev['lids'].apply(lambda s: pad_sequences([s], maxlen)[0])
dev['X'] = dev['wids'].apply(lambda s: pad_sequences([s], maxlen)[0])
dev['X_num'] = dev['num'].apply(lambda s: pad_sequences([s], maxlen)[0])
dev['X_cap'] = dev['cap'].apply(lambda s: pad_sequences([s], maxlen)[0])

test['y_g'] = test['lids'].apply(lambda s: pad_sequences([s], maxlen)[0])
test['X'] = test['wids'].apply(lambda s: pad_sequences([s], maxlen)[0])
test['X_num'] = test['num'].apply(lambda s: pad_sequences([s], maxlen)[0])
test['X_cap'] = test['cap'].apply(lambda s: pad_sequences([s], maxlen)[0])


Using TensorFlow backend.


In [10]:
train[:5]

Unnamed: 0,labels,sentences,low,wids,cap,num,lids,y_g,X,X_num,X_cap
0,"[I-ORG, O, I-MISC, O, O, O, I-MISC, O, O]","[EU, rejects, German, call, to, boycott, Briti...","[eu, rejects, german, call, to, boycott, briti...","[15655, 14924, 4010, 17820, 8821, 16288, 3898,...","[2, 1, 3, 1, 1, 1, 3, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1]","[7, 3, 5, 3, 3, 3, 5, 3, 3]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[I-PER, I-PER]","[Peter, Blackburn]","[peter, blackburn]","[5650, 13774]","[3, 3]","[1, 1]","[4, 4]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[I-LOC, O]","[BRUSSELS, 1996-08-22]","[brussels, 1996-08-22]","[6081, 1]","[2, 1]","[1, 3]","[1, 3]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[O, I-ORG, I-ORG, O, O, O, O, O, O, I-MISC, O,...","[The, European, Commission, said, on, Thursday...","[the, european, commission, said, on, thursday...","[15201, 546, 14547, 18124, 22194, 10190, 15841...","[3, 3, 3, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 7, 7, 3, 3, 3, 3, 3, 3, 5, 3, 3, 3, 3, 3, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[I-LOC, O, O, O, O, I-ORG, I-ORG, O, O, O, I-P...","[Germany, 's, representative, to, the, Europea...","[germany, 's, representative, to, the, europea...","[3242, 712, 1931, 8821, 15201, 546, 494, 712, ...","[3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 3, 3, 3, 3, 7, 7, 3, 3, 3, 4, 4, 3, 3, 3, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
from keras.utils.np_utils import to_categorical
import numpy as np

def labels_to_prob(labs):
    return np.vstack([to_categorical(lab, len(classes)+1) for lab in labs])
    
train['y'] = train['y_g'].map(labels_to_prob)
dev['y'] = dev['y_g'].map(labels_to_prob)
test['y'] = test['y_g'].map(labels_to_prob)

In [12]:
def evalu(y_all, y_gold, tokens):

    tokens = dict((i, lab) for lab, i in tokens.items())
    tagged_y = []
    tagged_gold = []
    with open("out.txt", 'w') as f:
        for seq, gold in zip(y_all, y_gold):
            seq =  seq[-len(gold):]
            tmp_y = []
            tmp_g = []
            for y_word, g_word in zip(seq, gold):
                tmp_y.append(tokens[y_word] if y_word !=0 else u'0')
                tmp_g.append(tokens[g_word])
            tagged_y.append(tmp_y)
            tagged_gold.append(tmp_g)
        for y_seq, g_seq in zip(tagged_y, tagged_gold):
            for y_word, g_word in zip(y_seq,g_seq):
                f.write(" ".join(["WORD","POS", g_word, y_word])+'\n')
            f.write('\n')
    
    
evalu([[0, 2, 3]], [[0,0,0]], labels_dict)

In [13]:
!env LANG=C perl conlleval.pl < out.txt

processed 3 tokens with 0 phrases; found: 1 phrases; correct: 0.
accuracy:   0.00%; precision:   0.00%; recall:   0.00%; FB1:   0.00
              ORG: precision:   0.00%; recall:   0.00%; FB1:   0.00  1


In [14]:
def emb_matrix(dictionary, model):
    embedding_matrix = np.zeros((len(dictionary), 50))
    for word in dictionary:
        if word in model:
            embedding_matrix[dictionary[word]] = model[word]
    return embedding_matrix

In [15]:
import numpy as np
np.random.seed(42)

from keras.models import Model, Sequential
from keras.layers import (Input,
                          Embedding,
                          Convolution1D,
                          TimeDistributed,
                          Dense)

from keras.optimizers import Adam

model = Sequential()
model.add(Embedding(len(dictionary), 50 ,input_length=maxlen, weights=[emb_matrix(dictionary, w2v)], trainable=True, mask_zero=True))
model.add(TimeDistributed(Dense(100, activation='relu')))
model.add(TimeDistributed(Dense(len(classes)+1, activation='softmax')))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 124, 50)           1191800   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 124, 100)          5100      
_________________________________________________________________
time_distributed_2 (TimeDist (None, 124, 9)            909       
Total params: 1,197,809
Trainable params: 1,197,809
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.01),
              metrics=['accuracy'])

In [17]:
model.fit(np.vstack(train['X'].tolist()),
          np.array(train['y'].tolist()),
          batch_size=100,
          epochs=5,
          validation_data=(np.vstack(dev['X'].tolist()), np.array(dev['y'].tolist())),
          shuffle=True)

Train on 14041 samples, validate on 3251 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1976cff50>

In [18]:
preds = model.predict_classes(np.vstack(train['X'].tolist()))
gold = np.array(train['y_g'].tolist())
evalu(preds, gold, labels_dict)



In [19]:
!env LANG=C perl conlleval.pl < out.txt

processed 1741084 tokens with 23499 phrases; found: 24312 phrases; correct: 19168.
accuracy:  11.30%; precision:  78.84%; recall:  81.57%; FB1:  80.18
              LOC: precision:  79.46%; recall:  87.59%; FB1:  83.33  7871
             MISC: precision:  77.23%; recall:  72.51%; FB1:  74.80  3228
              ORG: precision:  68.39%; recall:  69.45%; FB1:  68.92  6419
              PER: precision:  88.77%; recall:  91.38%; FB1:  90.06  6794


In [20]:
preds = model.predict_classes(np.vstack(dev['X'].tolist()))
gold = np.array(dev['y_g'].tolist())
evalu(preds, gold, labels_dict)



In [21]:
!env LANG=C perl conlleval.pl < out.txt

processed 403124 tokens with 5942 phrases; found: 6107 phrases; correct: 4468.
accuracy:  12.14%; precision:  73.16%; recall:  75.19%; FB1:  74.16
              LOC: precision:  78.16%; recall:  82.58%; FB1:  80.31  1941
             MISC: precision:  69.20%; recall:  62.15%; FB1:  65.49  828
              ORG: precision:  57.40%; recall:  60.70%; FB1:  59.01  1418
              PER: precision:  81.46%; recall:  84.91%; FB1:  83.15  1920


In [39]:
import numpy as np
np.random.seed(42)

from keras.models import Model, Sequential
from keras.layers import (Input,
                          Embedding,
                          Convolution1D,
                          TimeDistributed,
                          Dense,
                          concatenate)

from keras.optimizers import Adam

word = Input(shape=(maxlen,))
cap = Input(shape=(maxlen,))

cap_emb = Embedding(5, 5,input_length=maxlen)(cap)
word_emb = Embedding(len(dictionary), 50 , weights=[emb_matrix(dictionary, w2v)],input_length=maxlen, trainable=True)(word)

emb = concatenate([word_emb, cap_emb])

model_s = Sequential()
model_s.add(TimeDistributed(Dense(100, activation='relu'), input_shape=(maxlen, 55)))
model_s.add(TimeDistributed(Dense(len(classes)+1, activation='softmax')))

out = model_s(emb)

model = Model(inputs=[word, cap], outputs=[out])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 124)           0                                            
____________________________________________________________________________________________________
input_6 (InputLayer)             (None, 124)           0                                            
____________________________________________________________________________________________________
embedding_9 (Embedding)          (None, 124, 50)       1191800     input_5[0][0]                    
____________________________________________________________________________________________________
embedding_8 (Embedding)          (None, 124, 5)        25          input_6[0][0]                    
___________________________________________________________________________________________

In [40]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.01),
              metrics=['accuracy'])

In [41]:
model.fit([np.vstack(train['X'].tolist()),np.vstack(train['X_cap'].tolist())],
          np.array(train['y'].tolist()),
          batch_size=100,
          epochs=10,
          validation_data=([np.vstack(dev['X'].tolist()),np.vstack(dev['X_cap'].tolist())], np.array(dev['y'].tolist())),
          shuffle=True)

Train on 14041 samples, validate on 3251 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b408be90>

In [42]:
preds = [np.argmax(x, axis=1) for x in model.predict([np.vstack(dev['X'].tolist()), np.vstack(dev['X_cap'].tolist())])]
gold = np.array(dev['y_g'].tolist())
evalu(preds, gold, labels_dict)

In [43]:
!env LANG=C perl conlleval.pl < out.txt

processed 403124 tokens with 5942 phrases; found: 6568 phrases; correct: 4801.
accuracy:  12.33%; precision:  73.10%; recall:  80.80%; FB1:  76.75
                 : precision:   0.00%; recall:   0.00%; FB1:   0.00  1
              LOC: precision:  80.68%; recall:  83.89%; FB1:  82.25  1910
             MISC: precision:  68.29%; recall:  73.10%; FB1:  70.61  987
              ORG: precision:  54.74%; recall:  68.46%; FB1:  60.83  1677
              PER: precision:  83.69%; recall:  90.55%; FB1:  86.99  1993


In [59]:
import numpy as np
np.random.seed(42)

from keras.models import Model, Sequential
from keras.layers import (Input,
                          Embedding,
                          Convolution1D,
                          TimeDistributed,
                          Dense)

from keras.optimizers import Adam

model = Sequential()
model.add(Embedding(len(dictionary), 50 ,input_length=maxlen, weights=[emb_matrix(dictionary, w2v)], trainable=True, mask_zero=True))
model.add(LSTM(100, activation='tanh', return_sequences=True))
model.add(TimeDistributed(Dense(len(classes)+1, activation='softmax')))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 124, 50)           1191800   
_________________________________________________________________
lstm_5 (LSTM)                (None, 124, 100)          60400     
_________________________________________________________________
time_distributed_13 (TimeDis (None, 124, 9)            909       
Total params: 1,253,109
Trainable params: 1,253,109
Non-trainable params: 0
_________________________________________________________________


In [60]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.01),
              metrics=['accuracy'])

In [61]:
model.fit(np.vstack(train['X'].tolist()),
          np.array(train['y'].tolist()),
          batch_size=100,
          epochs=2,
          validation_data=(np.vstack(dev['X'].tolist()), np.array(dev['y'].tolist())),
          shuffle=True)

Train on 14041 samples, validate on 3251 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1d1c86b50>

In [62]:
preds = model.predict_classes(np.vstack(dev['X'].tolist()))
gold = np.array(dev['y_g'].tolist())
evalu(preds, gold, labels_dict)



In [63]:
!env LANG=C perl conlleval.pl < out.txt

processed 403124 tokens with 5942 phrases; found: 6039 phrases; correct: 4797.
accuracy:  12.28%; precision:  79.43%; recall:  80.73%; FB1:  80.08
              LOC: precision:  87.03%; recall:  84.76%; FB1:  85.88  1789
             MISC: precision:  69.13%; recall:  71.15%; FB1:  70.12  949
              ORG: precision:  63.10%; recall:  71.29%; FB1:  66.95  1515
              PER: precision:  91.15%; recall:  88.38%; FB1:  89.75  1786


CRF output is not supported in Keras yet. However you can find an implementation here


https://github.com/phipleg/keras

A fully working implementation of state of the art networks for NER can be found there:

https://github.com/glample/tagger

That implements the state of art architecture https://arxiv.org/abs/1603.01360