<a href="https://colab.research.google.com/github/dilanbakr/namedEntityRecognition/blob/main/Ner_En_DeepLearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keras==2.2.4



In [None]:
! pip -q install git+https://www.github.com/keras-team/keras-contrib.git sklearn-crfsuite

[?25l[K     |▍                               | 10kB 27.9MB/s eta 0:00:01[K     |▉                               | 20kB 6.4MB/s eta 0:00:01[K     |█▎                              | 30kB 9.1MB/s eta 0:00:01[K     |█▊                              | 40kB 11.4MB/s eta 0:00:01[K     |██▏                             | 51kB 13.5MB/s eta 0:00:01[K     |██▋                             | 61kB 15.6MB/s eta 0:00:01[K     |███                             | 71kB 17.1MB/s eta 0:00:01[K     |███▌                            | 81kB 11.2MB/s eta 0:00:01[K     |████                            | 92kB 12.4MB/s eta 0:00:01[K     |████▍                           | 102kB 13.3MB/s eta 0:00:01[K     |████▉                           | 112kB 13.3MB/s eta 0:00:01[K     |█████▏                          | 122kB 13.3MB/s eta 0:00:01[K     |█████▋                          | 133kB 13.3MB/s eta 0:00:01[K     |██████                          | 143kB 13.3MB/s eta 0:00:01[K     |██████▌        

In [None]:
import tensorflow as tf
import keras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [None]:
data = pd.read_csv('ENDataset.csv', encoding = "latin1")
data = data.fillna(method="ffill")

print("Number of sentences: ", len(data.groupby(['Sentence #'])))

words = list(set(data["Word"].values))
n_words = len(words)
print("Number of words in the dataset: ", n_words)

tags = list(set(data["Tag"].values))
print("Tags:", tags)
n_tags = len(tags)
print("Number of Labels: ", n_tags)

print("What the dataset looks like:")

data.head(10)

Number of sentences:  114
Number of words in the dataset:  1061
Tags: ['O', 'I-LOC', 'I-MISC', '-', 'O ', 'I-ORG', 'I-PER', '...']
Number of Labels:  8
What the dataset looks like:


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Subordinated,NNP,O
1,Sentence: 1,Loan,NNP,O
2,Sentence: 1,Agreement,NNP,O
3,Sentence: 1,-,:,O
4,Sentence: 1,Silicium,NNP,I-ORG
5,Sentence: 1,de,IN,I-ORG
6,Sentence: 1,Provence,NNP,I-ORG
7,Sentence: 1,SAS,NNP,I-ORG
8,Sentence: 1,and,CC,O
9,Sentence: 1,Evergreen,NNP,I-ORG


In [None]:
class SentenceGetter(object):
    def __init__(self, data):
       
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [None]:
getter = SentenceGetter(data)
sent = getter.get_next()
sentences = getter.sentences

In [None]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding

In [None]:
idx2word = {i: w for w, i in word2idx.items()}

tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

idx2tag = {i: w for w, i in tag2idx.items()}

In [None]:
BATCH_SIZE = 50 
MAX_LEN = 30  
EMBEDDING = 20 

In [None]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])


y = [[tag2idx[w[2]] for w in s] for s in sentences]

y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["PAD"])

from keras.utils import to_categorical

y = [to_categorical(i, num_classes=n_tags+1) for i in y]  

from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
X_tr.shape, X_te.shape, np.array(y_tr).shape, np.array(y_te).shape

print('Raw Sample: ', ' '.join([w[0] for w in sentences[0]]))
print('Raw Label: ', ' '.join([w[2] for w in sentences[0]]))

Raw Sample:  Subordinated Loan Agreement - Silicium de Provence SAS and Evergreen Solar Inc . 7 - December 2007 [ HERBERT SMITH LOGO ] ...
Raw Label:  O O O O I-ORG I-ORG I-ORG I-ORG O I-ORG I-ORG I-ORG O O O O O O I-PER I-PER O O ...


In [None]:
X_tr.shape, X_te.shape, np.array(y_tr).shape, np.array(y_te).shape

((102, 30), (12, 30), (102, 30, 9), (12, 30, 9))

In [None]:
print('Raw Sample: ', ' '.join([w[0] for w in sentences[0]]))
print('Raw Label: ', ' '.join([w[2] for w in sentences[0]]))
print('After processing, sample:', X[0])
print('After processing, labels:', y[0])

Raw Sample:  Subordinated Loan Agreement - Silicium de Provence SAS and Evergreen Solar Inc . 7 - December 2007 [ HERBERT SMITH LOGO ] ...
Raw Label:  O O O O I-ORG I-ORG I-ORG I-ORG O I-ORG I-ORG I-ORG O O O O O O I-PER I-PER O O ...
After processing, sample: [ 247  944  710  705  407  679  326  506  133  566  124  109  219   40
  705  929  192  518 1001  162  546  147  626    0    0    0    0    0
    0    0]
After processing, labels: [[0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF


input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, 
                  input_length=MAX_LEN, mask_zero=True)(input)  
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.2))(model)  
model = Dense(200, activation="relu")(model)  
crf = CRF(n_tags+1)  
out = crf(model)  

model = Model(input, out)
model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 30, 20)            21260     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 30, 100)           28400     
_________________________________________________________________
dense_1 (Dense)              (None, 30, 200)           20200     
_________________________________________________________________
crf_1 (CRF)                  (None, 30, 9)             1908      
Total params: 71,768
Trainable params: 71,768
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(X_tr, np.array(y_tr), batch_size=BATCH_SIZE, epochs=100, verbose=1)




Epoch 1/100





Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


In [None]:
#X_tr, X_te, y_tr, y_te
y_pred = model.predict(X_te)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_te, -1)

In [None]:
y_pred = [[idx2tag[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag[i] for i in row] for row in y_test_true]

In [None]:
from sklearn.metrics import f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [None]:
report = flat_classification_report(y_pred=y_pred, y_true=y_test_true)
print(report)

              precision    recall  f1-score   support

       I-PER       1.00      1.00      1.00         4
           O       1.00      1.00      1.00       303
         PAD       1.00      1.00      1.00        53

    accuracy                           1.00       360
   macro avg       1.00      1.00      1.00       360
weighted avg       1.00      1.00      1.00       360



In [None]:
i = np.random.randint(0,X_te.shape[0]) 
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_te[i], -1)

print("Sample number {} of {} (Test Set)".format(i, X_te.shape[0]))
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_te[i], true, p[0]):
    if w != 0:
        print("{:15}: {:5} {}".format(words[w-2], idx2tag[t], idx2tag[pred]))

Sample number 1 of 12 (Test Set)
Word           ||True ||Pred
agreement      : O     O
contained      : O     O
in             : O     O
this           : O     O
Article        : O     O
shall          : O     O
not            : O     O
apply          : O     O
to             : O     O
liabilities    : O     O
which          : O     O
the            : O     O
Lender         : I-PER I-PER
may            : O     O
directly       : O     O
or             : O     O
indirectly     : O     O
suffer         : O     O
or             : O     O
by             : O     O
reason         : O     O
of             : O     O
the            : O     O
Lender         : I-PER I-PER
'              : O     O
s              : O     O
own            : O     O
negligence     : O     O
or             : O     O
misconduct     : O     O
