In [27]:
# This is dataset is available at https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus 
# You will also need spacy and the 'en_core_web_lg' or 'en_core_web_md' model isntalled.
# the embedding_matrix function uses 'en_core_web_lg' by default, if you have 'en_core_web_md' instead change
# nlp = spacy.load('en_core_web_lg') to nlp = spacy.load('en_core_web_md') in word2vec_functions.py

In [1]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

In [2]:
from SentenceGetter import SentenceGetter
from word2vec_functions import embedding_matrix

In [3]:
from keras.callbacks import ReduceLROnPlateau,ModelCheckpoint,EarlyStopping
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [5]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.callbacks import ReduceLROnPlateau,ModelCheckpoint,EarlyStopping

In [6]:
file = os.path.join(os.getcwd(),"ner_dataset.csv")
df= pd.read_csv(file, encoding="latin1")
df= df.fillna(method='ffill')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [7]:
words = list(set(df["Word"].values))
words.append("ENDPAD")

tags = df['Tag'].unique().tolist()

n_words = len(words)
n_tags = len(tags)

In [8]:
getter = SentenceGetter(df)
sentences = getter.sentences

max_len = 50
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [9]:
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)

y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [10]:
y = [to_categorical(i, num_classes=n_tags) for i in y]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
model_filename = os.path.join(os.getcwd(),'models','ner_dataset','word2vec_model2.h5')

plateau_callback =ReduceLROnPlateau(monitor='val_loss',factor=0.1,patience=2)
modelcheckpoint_callback = ModelCheckpoint(filepath=model_filename,monitor='val_accuracy',save_best_only=True)
earlystop_callback = EarlyStopping(monitor='val_accuracy',patience=3)

In [12]:
word2vec_embeddings = embedding_matrix(n_words,word2idx)

In [13]:
input = Input(shape=(max_len,))
network_1 = Embedding(input_dim=n_words + 1, 
                      output_dim=300,
                      weights=[word2vec_embeddings],
                      input_length=max_len,
                      trainable=False)(input)
network_1 = Dropout(0.2)(network_1)
network_1 = Bidirectional(LSTM(units=300, return_sequences=True, recurrent_dropout=0.2))(network_1)
#model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(network_1)  # softmax output layer

model1 = Model(input, out)
model1.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

Instructions for updating:
Colocations handled automatically by placer.


In [14]:
history1 = model1.fit(X_train, np.array(y_train), 
                      batch_size=32, epochs=30, 
                      validation_split=0.3, verbose=1,
                      callbacks=[plateau_callback,modelcheckpoint_callback,earlystop_callback])

Instructions for updating:
Use tf.cast instead.
Train on 26856 samples, validate on 11511 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


In [15]:
input = Input(shape=(max_len,))
network_2 = Embedding(input_dim=n_words + 1, 
                      output_dim=300,
                      weights=[word2vec_embeddings],
                      input_length=max_len,
                      trainable=False)(input)
network_2 = Dropout(0.2)(network_2)
network_2 = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.2))(network_2)
#model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(network_2)  # softmax output layer

model2 = Model(input, out)
model2.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [17]:
history2 = model2.fit(X_train, np.array(y_train), 
                      batch_size=32, epochs=30, 
                      validation_split=0.1, verbose=1,
                      callbacks=[plateau_callback,modelcheckpoint_callback,earlystop_callback])

Train on 34530 samples, validate on 3837 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


In [18]:
preds1 = model1.predict(np.array(X_test), verbose=1)
preds2 = model2.predict(np.array(X_test), verbose=1)



In [21]:
def convert2label(y_matrix,idx2tag):
    result = []
    for y_vector1 in y_matrix:
        output = []
        for y_vector2 in y_vector1:
            max_val = np.argmax(y_vector2)
            output.append(idx2tag[max_val].replace("ENDPAD", "O"))
        result.append(output)
    return result

In [23]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [24]:
idx2tag = {i: w for w, i in tag2idx.items()}

pred_labels = convert2label(preds1,idx2tag)
test_labels = convert2label(preds1,idx2tag)
print(classification_report(test_labels, pred_labels))

           precision    recall  f1-score   support

      org       1.00      1.00      1.00      3973
      eve       1.00      1.00      1.00        48
      tim       1.00      1.00      1.00      4018
      art       1.00      1.00      1.00        21
      gpe       1.00      1.00      1.00      3053
      per       1.00      1.00      1.00      3439
      nat       1.00      1.00      1.00        24
      geo       1.00      1.00      1.00      7991

micro avg       1.00      1.00      1.00     22567
macro avg       1.00      1.00      1.00     22567



In [25]:
pred_labels = convert2label(preds2,idx2tag)
test_labels = convert2label(preds2,idx2tag)
print(classification_report(test_labels, pred_labels))

           precision    recall  f1-score   support

      org       1.00      1.00      1.00      3930
      eve       1.00      1.00      1.00        41
      tim       1.00      1.00      1.00      4024
      art       1.00      1.00      1.00        14
      gpe       1.00      1.00      1.00      3057
      per       1.00      1.00      1.00      3432
      nat       1.00      1.00      1.00        15
      geo       1.00      1.00      1.00      8077

micro avg       1.00      1.00      1.00     22590
macro avg       1.00      1.00      1.00     22590

