# Named Entity Recognition with Python

In [1]:
import pandas as pd

data = pd.read_csv("ner_dataset.csv",encoding= 'unicode_escape')
data.head(5)

Unnamed: 0.1,Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Sentence: 1,Thousands,NNS,O
1,1,,of,IN,O
2,2,,demonstrators,NNS,O
3,3,,have,VBP,O
4,4,,marched,VBN,O


In [7]:
from itertools import chain

def make_dict_map(data, tokentag):
    token_to_idx = {}
    idx_to_token = {}
    #Checking for tokentag to filter   
    if tokentag == 'token':
        voc = list(set(data['Word'].to_list()))
    else:
        voc = list(set(data['Tag'].to_list()))

    #Creating dictionary for idx_to_token and token_to_idx
    idx_to_token = {idx:tok for  idx, tok in enumerate(voc)}
    token_to_idx = {tok:idx for  idx, tok in enumerate(voc)}
    return token_to_idx , idx_to_token


In [8]:
#Filtering the token and tag using make_dict_map function
token_to_idx, idx_to_token = make_dict_map(data, 'token')
tag_to_idx, idx_to_tag = make_dict_map(data, 'tag')

In [13]:
data['Word_idx'] = data['Word'].map(token_to_idx)
data['Tag_idx'] = data['Tag'].map(tag_to_idx)

#Filling the Nan values in the dataset
data_fillna = data.fillna(method='ffill', axis=0)

  data_fillna = data.fillna(method='ffill', axis=0)


In [16]:
# Groupby and collect columns

data_group = data_fillna.groupby(['Sentence #'],
                                 as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))

In [17]:
data_group.head(5)

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[11985, 34449, 33130, 12826, 16723, 102, 19925...","[2, 2, 2, 2, 2, 2, 10, 2, 2, 2, 2, 2, 10, 2, 2..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[6551, 9456, 26181, 24231, 11531, 34492, 29501...","[12, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[23665, 23072, 24299, 23207, 24118, 21634, 321...","[2, 2, 3, 2, 2, 2, 2, 2, 10, 2, 2, 2, 2, 2, 16..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[33589, 8803, 32143, 31486, 18496, 30076, 1406...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[21788, 13072, 12889, 28634, 28639, 14231, 321...","[10, 2, 2, 11, 0, 2, 3, 2, 10, 2, 12, 2, 12, 2..."


In [22]:
from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

2023-09-17 15:46:52.721363: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [40]:
#Function to extract train_tokens, test_tokens, val_tokens, train_tags,test_tags,val_tags

def get_train_test_val(data_group, datas):

    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    
    ntoken = len(list(set(datas['Word'].to_list())))
    ntag = len(list(set(datas['Tag'].to_list())))
    
    padtokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= ntoken-1)
    
    #Creating Pad Tags (y var) and converting into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    padtags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag_to_idx["O"])
    ntags = len(tag_to_idx)
    padtags = [to_categorical(i, num_classes=ntags) for i in padtags]
    
    #Splitting the train, test and validation set
    tokens, testtokens, tags, testtags = train_test_split(padtokens, padtags, test_size=0.1, train_size=0.9, random_state=2020)
    traintokens, valtokens, traintags, valtags = train_test_split(tokens,tags,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'length of train tokens :', len(traintokens),
        '\nlength of train tags   :', len(traintags),
        '\nlength of test tokens  :', len(testtokens),
        '\nlength of test tags    :', len(testtags),
        '\nlength of val tokens   :', len(valtokens),
        '\nlength of val tags     :', len(valtags),
    )
    
    return traintokens, testtokens, valtokens, traintags, testtags, valtags

In [41]:
traintokens, testtokens, valtokens, traintags, testtags, valtags= get_train_test_val(data_group, data)

length of train tokens : 32372 
length of train tags   : 32372 
length of test tokens  : 4796 
length of test tags    : 4796 
length of val tokens   : 10791 
length of val tags     : 10791


In [42]:
#Importing numpy library and tensorflow.keras library for model building.
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [43]:
traintokens

array([[25501, 24937, 28087, ..., 35177, 35177, 35177],
       [33380,  4316,  7381, ..., 35177, 35177, 35177],
       [15310, 24798,  2630, ..., 35177, 35177, 35177],
       ...,
       [29330,  4171,  5292, ..., 35177, 35177, 35177],
       [18328, 33272, 25967, ..., 35177, 35177, 35177],
       [24162, 27553, 28211, ..., 35177, 35177, 35177]], dtype=int32)

In [44]:
traintags

[array([[0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.]], dtype=float32),
 array([[0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.]], dtype=float32),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.]], dtype=float32),
 array([[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],


In [63]:
#Finding the input and output dimension for Data

input_dim = len(list(set(data['Word'].to_list()))) + 1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])

#Finding the length of tag_to_idx and saving in ntags variable
ntags = len(tag_to_idx)

In [64]:
#Function for the architecture of model.

def get_model():

    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    model.add(TimeDistributed(Dense(ntags, activation="relu")))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [65]:
#Function to find the loss of the model
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [70]:
#The Final output will be obtained after the 25 epochs as we set the loop to run 25 times
results = pd.DataFrame()
model = get_model()
results['with_add_lstm'] = train_model(traintokens, np.array(traintags), model)

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 104, 64)           2251456   
                                                                 
 bidirectional_6 (Bidirecti  (None, 104, 128)          66048     
 onal)                                                           
                                                                 
 lstm_13 (LSTM)              (None, 104, 64)           49408     
                                                                 
 time_distributed_2 (TimeDi  (None, 104, 17)           1105      
 stributed)                                                      
                                                                 
Total params: 2368017 (9.03 MB)
Trainable params: 2368017 (9.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
You must install

In [71]:
plot_model(model)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [73]:
model.save("model.keras")