In [19]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.utils import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split

In [2]:
data=pd.read_csv("/content/ner_dataset.csv", encoding="latin1")
data=data.fillna(method="ffill")
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [4]:
print("Unique words in corpus:",data['Word'].nunique())
print("Unique tags in corpus:",data['Tag'].nunique())

Unique words in corpus: 23008
Unique tags in corpus: 17


In [5]:
words=list(set(data["Word"].values))
words.append("ENDPAD")
num_words=len(words)

In [6]:
tags=list(set(data["Tag"].values))
num_tags=len(tags)

In [7]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent=1
        self.data=data
        self.empty=False
        agg_func=lambda s:[(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped=self.data.groupby("Sentence #").apply(agg_func)
        self.sentences=[s for s in self.grouped]
    
    def get_next(self):
        try:
            s=self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent+=1
            return s
        except:
            return None

In [8]:
getter=SentenceGetter(data)
sentences=getter.sentences

In [9]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [11]:
word2idx={w:i + 1 for i, w in enumerate(words)}
tag2idx={t:i for i, t in enumerate(tags)}

In [12]:
word2idx

{'twice': 1,
 'reflects': 2,
 'beheaded': 3,
 'open-ended': 4,
 'professionalism': 5,
 'SCHROEDER': 6,
 'calmer': 7,
 'Hugh': 8,
 'Jean-Cyril': 9,
 'generals': 10,
 '28': 11,
 'claim': 12,
 'George': 13,
 'Vines': 14,
 'cancel': 15,
 'guaranteeing': 16,
 'Pekanbara': 17,
 'Completing': 18,
 'ingredients': 19,
 'Kajaki': 20,
 'hunts': 21,
 'expansion': 22,
 'Hong-Kong-based': 23,
 'contractual': 24,
 'tunnel': 25,
 '15': 26,
 'spot': 27,
 'Bode': 28,
 'Lefevre': 29,
 'Holocaust': 30,
 're-stating': 31,
 'errant': 32,
 '240': 33,
 'Fleihan': 34,
 'Out': 35,
 'positive': 36,
 'entirely': 37,
 'overhead': 38,
 'Pingtung': 39,
 'exceptional': 40,
 'pushed': 41,
 'upgrading': 42,
 'Redmond': 43,
 '1621': 44,
 'zoo': 45,
 '994': 46,
 'bet': 47,
 'Griffal': 48,
 '%': 49,
 'Likewise': 50,
 'dialog': 51,
 'reformists': 52,
 'hopefuls': 53,
 'Technology': 54,
 'editorials': 55,
 'Kissem': 56,
 'match-up': 57,
 'stockpiled': 58,
 'rifles': 59,
 'Rotterdam': 60,
 'shall': 61,
 'opposition': 62,
 'S

In [14]:
max_len=50
X=[[word2idx[w[0]] for w in s] for s in sentences]
X=pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)
y=[[tag2idx[w[2]] for w in s] for s in sentences]
y=pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [15]:
x_train, x_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state=1)

In [17]:
input_word=Input(shape=(max_len,))
model=Embedding(input_dim=num_words, output_dim=50, input_length=max_len)(input_word)
model=SpatialDropout1D(0.1)(model)
model=Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out=TimeDistributed(Dense(num_tags, activation="softmax"))(model)
model=Model(input_word, out)
model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 50)            1150450   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 50)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 50, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 50, 17)           3417      
 ibuted)                                                         
                                                             

In [18]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [20]:
chkpt=ModelCheckpoint("model_weights.h5", monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')
early_stopping=EarlyStopping(monitor='val_accuracy', min_delta=0, patience=1, verbose=0, mode='max', baseline=None, restore_best_weights=False)
history=model.fit(x=x_train, y=y_train, validation_data=(x_test,y_test), batch_size=32, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
model.evaluate(x_test, y_test)



[0.06279795616865158, 0.9825166463851929]

In [22]:
i=np.random.randint(0, x_test.shape[0]) 
p=model.predict(np.array([x_test[i]]))
p=np.argmax(p, axis=-1)
y_true = y_test[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for w, true, pred in zip(x_test[i], y_true, p[0]):
    print("{:15}{}\t{}".format(words[w-1], tags[true], tags[pred]))

Word           True 	 Pred

------------------------------
President      B-per	B-per
Bush           I-per	I-per
said           O	O
nations        O	O
around         O	O
the            O	O
world          O	O
need           O	O
to             O	O
stand          O	O
with           O	O
moderate       O	O
reformers      O	O
in             O	O
the            O	O
MIddle         B-geo	O
East           I-geo	I-geo
.              O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
deteriorated   O	O
d