In [None]:
!pip install plot_keras_history
!pip install git+https://www.github.com/keras-team/keras-contrib.git
!pip install tensorflow_addons

!pip install tensorflow==2.1.0
!pip install keras==2.3.1

In [21]:
import re
import string
import warnings
import numpy as np
import pandas as pd
from keras import Sequential
from keras import optimizers
from keras.models import Model
from keras.models import Input
import matplotlib.pyplot as plt
from keras_contrib import losses
from keras_contrib import metrics
from keras_contrib.layers import CRF
from plot_keras_history import plot_history
from keras_contrib.utils import save_load_utils
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense

In [2]:
data = pd.read_csv("ner_dataset.csv", encoding = "iso-8859-1", header = 0)

data = data.fillna(method = "ffill")
data["Sentence #"] = data["Sentence #"].apply(lambda x: x[9:])
data["Sentence #"] = data["Sentence #"].astype("int32")
data.drop("POS", axis = 1, inplace = True)
data

Unnamed: 0,Sentence #,Word,Tag
0,1,Thousands,O
1,1,of,O
2,1,demonstrators,O
3,1,have,O
4,1,marched,O
...,...,...,...
1048570,47959,they,O
1048571,47959,responded,O
1048572,47959,to,O
1048573,47959,the,O


In [3]:
word_counts = data.groupby("Sentence #")["Word"].agg(["count"])
word_counts = word_counts.rename(columns = {"count": "Word count"})
word_counts

Unnamed: 0_level_0,Word count
Sentence #,Unnamed: 1_level_1
1,24
2,30
3,14
4,15
5,25
...,...
47955,20
47956,24
47957,11
47958,11


In [4]:
len_max = word_counts.max()[0]
longest_sentence_id = word_counts[word_counts["Word count"] == len_max].index[0]
longest_sentence = data[data["Sentence #"] == longest_sentence_id]["Word"].str.cat(sep = ' ')
all_words = list(set(data["Word"].values))
all_tags = list(set(data["Tag"].values))

In [5]:
word2index = {word: idx + 2 for idx, word in enumerate(all_words)}
word2index["--UNKNOWN_WORD--"] = 0
word2index["--PADDING--"] = 1
index2word = {idx: word for word, idx in word2index.items()}

In [6]:
tag2index = {tag: idx + 1 for idx, tag in enumerate(all_tags)}
tag2index["--PADDING--"] = 0
index2tag = {idx: word for word, idx in tag2index.items()}

In [7]:
def to_tuples(data):
  iterator = zip(data["Word"].values.tolist(), data["Tag"].values.tolist())
  return [(word, tag) for word, tag in iterator]

sentences = data.groupby("Sentence #").apply(to_tuples).tolist()

In [8]:
sentences

[[('Thousands', 'O'),
  ('of', 'O'),
  ('demonstrators', 'O'),
  ('have', 'O'),
  ('marched', 'O'),
  ('through', 'O'),
  ('London', 'B-geo'),
  ('to', 'O'),
  ('protest', 'O'),
  ('the', 'O'),
  ('war', 'O'),
  ('in', 'O'),
  ('Iraq', 'B-geo'),
  ('and', 'O'),
  ('demand', 'O'),
  ('the', 'O'),
  ('withdrawal', 'O'),
  ('of', 'O'),
  ('British', 'B-gpe'),
  ('troops', 'O'),
  ('from', 'O'),
  ('that', 'O'),
  ('country', 'O'),
  ('.', 'O')],
 [('Families', 'O'),
  ('of', 'O'),
  ('soldiers', 'O'),
  ('killed', 'O'),
  ('in', 'O'),
  ('the', 'O'),
  ('conflict', 'O'),
  ('joined', 'O'),
  ('the', 'O'),
  ('protesters', 'O'),
  ('who', 'O'),
  ('carried', 'O'),
  ('banners', 'O'),
  ('with', 'O'),
  ('such', 'O'),
  ('slogans', 'O'),
  ('as', 'O'),
  ('"', 'O'),
  ('Bush', 'B-per'),
  ('Number', 'O'),
  ('One', 'O'),
  ('Terrorist', 'O'),
  ('"', 'O'),
  ('and', 'O'),
  ('"', 'O'),
  ('Stop', 'O'),
  ('the', 'O'),
  ('Bombings', 'O'),
  ('.', 'O'),
  ('"', 'O')],
 [('They', 'O'),
  ('ma

In [9]:
X = [[word[0] for word in sentence] for sentence in sentences]
y = [[word[1] for word in sentence] for sentence in sentences]

X = [[word2index[word] for word in sentence] for sentence in X]
y = [[tag2index[tag] for tag in sentence] for sentence in y]

X = [sentence + [word2index["--PADDING--"]] * (len_max - len(sentence)) for sentence in X]
y = [sentence + [tag2index["--PADDING--"]] * (len_max - len(sentence)) for sentence in y]

num_tags = len(tag2index)
y = [np.eye(num_tags)[sentence] for sentence in y]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [18]:
input_layer = Input(shape = (len_max,))
model = Embedding(len(index2word), 50, embeddings_initializer = "uniform", input_length = len_max)(input_layer)
model = Bidirectional(LSTM(50, return_sequences = True))(model)
model = TimeDistributed(Dense(100, activation = "relu"))(model)

crf_layer = CRF(units = num_tags)
output_layer = crf_layer(model)

ner_model = Model(input_layer, output_layer)

loss = losses.crf_loss
acc_metric = metrics.crf_accuracy

ner_model.compile(optimizer = 'nadam', loss = losses.crf_loss, metrics = [acc_metric])
ner_model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 104)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 104, 50)           1759000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 104, 100)          40400     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 104, 100)          10100     
_________________________________________________________________
crf_2 (CRF)                  (None, 104, 18)           2178      
Total params: 1,811,678
Trainable params: 1,811,678
Non-trainable params: 0
_________________________________________________________________


In [22]:
warnings.filterwarnings("ignore")
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 2)

history = ner_model.fit(X_train, y_train, batch_size = 256, epochs = 5, validation_data = (X_test, y_test), callbacks = [es])

Train on 38367 samples, validate on 9592 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
y_test = np.argmax(y_test, axis = 2)

In [24]:
y_pred = np.argmax(ner_model.predict(X_test), axis = 2)
acc = (y_pred == y_test).mean()
print("Accuracy: {:.4f}".format(acc))

Accuracy: 0.9925


In [25]:
sentence = "John lives in New York"

In [26]:
re_tok = re.compile(f"([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])")
sentence = re_tok.sub(r"  ", sentence).split()

pad_sentence = sentence + [word2index["--PADDING--"]] * (len_max - len(sentence))
pad_sentence = [word2index.get(w, 0) for w in pad_sentence]

pred = ner_model.predict(np.array([pad_sentence]))
pred = np.argmax(pred, axis = -1)

entity = ""
for w, p in zip(sentence, pred[0]):
  entity = entity + "{:15}: {:5}".format(w, index2tag[p]) + "\n"
print(entity)

John           : B-per
lives          : O    
in             : O    
New            : B-geo
York           : I-geo

