In [161]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tabulate
import time

In [2]:
df = pd.read_csv("ner.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47959 entries, 0 to 47958
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentence #  47959 non-null  object
 1   Sentence    47959 non-null  object
 2   POS         47959 non-null  object
 3   Tag         47959 non-null  object
dtypes: object(4)
memory usage: 1.5+ MB


In [4]:
df.head(10)

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
5,Sentence: 6,The party is divided over Britain 's participa...,"['DT', 'NN', 'VBZ', 'VBN', 'IN', 'NNP', 'POS',...","['O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', '..."
6,Sentence: 7,The London march came ahead of anti-war protes...,"['DT', 'NNP', 'NN', 'VBD', 'RB', 'IN', 'JJ', '...","['O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', '..."
7,Sentence: 8,The International Atomic Energy Agency is to h...,"['DT', 'NNP', 'NNP', 'NNP', 'NNP', 'VBZ', 'TO'...","['O', 'B-org', 'I-org', 'I-org', 'I-org', 'O',..."
8,Sentence: 9,Iran this week restarted parts of the conversi...,"['NNP', 'DT', 'NN', 'VBD', 'NNS', 'IN', 'DT', ...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '..."
9,Sentence: 10,Iranian officials say they expect to get acces...,"['JJ', 'NNS', 'VBP', 'PRP', 'VBP', 'TO', 'VB',...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '..."


In [5]:
df.isna().sum()

Sentence #    0
Sentence      0
POS           0
Tag           0
dtype: int64

In [6]:
sentences = df['Sentence'].to_numpy()
tags = df['Tag'].apply(lambda x: x[1:-1].replace("'", "").replace(",", "")).to_numpy()
pos = df['POS'].apply(lambda x: x[1:-1].replace("'", "").replace(",", "")).to_numpy()

In [7]:
df["word_count"] = df["Sentence"].str.split().str.len()
print(f"Max Sentence length: {df['word_count'].max()}")
df.drop(columns=["word_count"], inplace=True)

Max Sentence length: 104


In [8]:
print(f"Total Number of Words/POS tags/IOB tags : {len(sentences)}")

Total Number of Words/POS tags/IOB tags : 47959


## Doing 80/20 Test Train Split

In [9]:
senteces_train = sentences[:38366]
tags_train = tags[:38366]
pos_train = pos[:38366]

senteces_test = sentences[38366:]
tags_test = tags[38366:]
pos_test = pos[38366:]

In [10]:
MAX_LEN = 110

## Getting Vector Representation

In [11]:
sentence_vectorizer = tf.keras.layers.TextVectorization(
        standardize=None,
        output_mode = 'int',
        output_sequence_length = MAX_LEN)
sentence_vectorizer.adapt(senteces_train)
vocab = sentence_vectorizer.get_vocabulary()

IOB_vectorizer = tf.keras.layers.TextVectorization(
        standardize=None,
        output_mode = 'int',
        output_sequence_length = MAX_LEN)
IOB_vectorizer.adapt(tags_train)
IOB_ids = IOB_vectorizer.get_vocabulary()


In [12]:
print(f"Number of unique IOB TAGS in the training: {len(IOB_ids)}\n")

print(f"POS tags: {tags_train[0]}\n")

print(f"POS TAGs vectorized: {IOB_vectorizer(tags_train[0])}")

Number of unique IOB TAGS in the training: 19

POS tags: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O

POS TAGs vectorized: [2 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 9 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [13]:
print(f"Number of unique words in the training vocab: {len(vocab)}\n")

print(f"Sentence: {senteces_train[0]}\n")

print(f"Sentence vectorized: {sentence_vectorizer(senteces_train[0])}")

Number of unique words in the training vocab: 31817

Sentence: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

Sentence vectorized: [1070    6 1106   17 1769  229  498    7  528    2  156    5   61    9
  631    2  940    6  189   89   23   16   52    3    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


In [14]:
sentence_vec = sentence_vectorizer(senteces_train)
IOB_vec = IOB_vectorizer(tags_train)

In [15]:
print(f"Shape of every converted vector: {sentence_vec.shape} | {IOB_vec.shape}")

Shape of every converted vector: (38366, 110) | (38366, 110)


In [82]:
tags = []
for i in range(len(IOB_ids)):
    tags.append(str(IOB_ids[i]))
    

### Embedding the vector representation

In [18]:
#setting the size for each to the length of the vocabulary from the vectorization
VOCAB_SIZE = len(vocab)
IOB_SIZE = len(IOB_ids)

In [19]:
# choosing a embedding dim for creating a embedidng matrix.
EMB_DIM = 100
EMB_DIM_POS = 32

In [20]:
sentence_input = tf.keras.layers.Input(shape=(MAX_LEN,), dtype="int32")
#pos_input = tf.keras.layers.Input(shape=(MAX_LEN,), dtype="int32", name="pos_input")

In [193]:
sentence_input.shape

(None, 110)

In [195]:
sentence_emb = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_DIM, mask_zero=True)(sentence_input)
sentence_emb.shape

(None, 110, 100)

In [204]:
def get_embeddings():
    sentence_emb = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_DIM, mask_zero=True)(sentence_input)
    positional_encoder  = tf.keras.layers.Embedding(input_dim=MAX_LEN, output_dim=EMB_DIM)

    #----------------------------------Static Positional Encoding----------------------------------#
    #def get_positional_encoding(MAX_LEN, EMB_DIM):
    #    pos = np.arange(MAX_LEN)[:, np.newaxis]
    #    i = np.arange(EMB_DIM)[np.newaxis, :]
     #   angle_rates = 1 / np.power(10000.0, (2 * (i // 2)) / np.float32(EMB_DIM))
      #  angle_rads = pos * angle_rates
      #  sines = np.sin(angle_rads[:, 0::2])
       # coses = np.cos(angle_rads[:, 1::2])
        #pos_encoding = np.concatenate([sines, coses], axis=-1)
        #return tf.cast(pos_encoding, dtype=tf.float32)

    # ------------------------------------------------------------------------------------------------#

    #-----making use of learnable positional encoding-----------------------------------------------#
    
    position_ = tf.range(MAX_LEN)
    position_emb = positional_encoder(position_)
    #position_emb = get_positional_encoding(MAX_LEN=MAX_LEN, EMB_DIM=EMB_DIM)
    position_emb = tf.expand_dims(position_emb,0)
    position_emb = tf.keras.layers.Lambda(
        lambda inputs: tf.tile(
            inputs[0],                         
            [tf.shape(inputs[1])[0], 1, 1]     
        )
    )([position_emb, sentence_input])   
    embeddings = tf.keras.layers.Add()([sentence_emb, position_emb])    
    return embeddings

In [22]:
#merged_emb = tf.keras.layers.concatenate([sentence_emb, pos_tags_emb])
#masked_emb = tf.keras.layers.Masking(mask_value=0)(merged_emb)

In [23]:
#merged_emb.shape

In [24]:
#masked_emb.shape

# Model Building

### 1D CNN As BASE MODEL
### RRN and MLP as Comparison Model

In [205]:
def build_cnn_model():
    emb = get_embeddings()
    layer_out = tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding="same", activation="relu")(emb)
    layer_out = tf.keras.layers.Dropout(0.3)(layer_out)
    layer_out = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding="same", activation="relu")(layer_out)
    layer_out = tf.keras.layers.Dropout(0.3)(layer_out)
    layer_out = tf.keras.layers.Dense(units=128, activation="relu")(layer_out)
    output = tf.keras.layers.Dense(IOB_SIZE, activation="softmax")(layer_out)
    model = tf.keras.Model(inputs=[sentence_input], outputs=output)
    print(model.summary())
    return model

In [206]:
def build_simple_rnn_model():
    emb = get_embeddings()
    rnn_out = tf.keras.layers.SimpleRNN(100, return_sequences=True)(emb)
    output = tf.keras.layers.Dense(IOB_SIZE, activation='softmax')(rnn_out)
    model = tf.keras.Model(inputs=[sentence_input], outputs=output)
    print(model.summary())
    return model

In [207]:
def build_mlp_model():
    x = get_embeddings()
    mlp_hidden = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(64, activation='relu'))(x)
    output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(IOB_SIZE, activation='softmax'))(mlp_hidden)
    model = tf.keras.Model(inputs=[sentence_input], outputs=output)
    print(model.summary())
    return model

In [208]:
def get_model_metrics(test, tags_to_predict, model):
    test = sentence_vectorizer(test)
    tags = IOB_vectorizer(tags_to_predict)
    loss, acc = model.evaluate(x=test, y=tags, verbose=2)
    return loss, acc
    
    
def get_real_prediction(sentence, model):
    predict_sentences = sentence_vectorizer([sentence])
    predictions_prob = model.predict(predict_sentences)
    prediction = np.argmax(predictions_prob,axis=-1)
    prediction = prediction[0]
    prediction = prediction[predict_sentences[0] != 0]
    prediction = [str(IOB_ids[idx]) for idx in prediction]
    sentence = sentence.split()
    table = [[word, tag] for word, tag in zip(sentence, prediction)]
    return prediction

    
def viz_result(sentence,prediction):
    sentence = sentence.split()
    print(f"Sentence : {sentence}\n")
    print(f"Tags : {' '.join(prediction)}\n")
    table = [[word, tag] for word, tag in zip(sentence, prediction)]
    print(tabulate.tabulate(table, headers=['Word', 'Tag'], tablefmt='rounded_outline'))

## Model Training

In [220]:
def compile_and_build_cnn():
    model_1 = build_cnn_model()
    model_1.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    start = time.time()
    cnn_history = model_1.fit(x=sentence_vec,y= IOB_vec, epochs=2, batch_size=64)
    end = time.time()
    total = end - start
    print("\n")
    print(f"Total Time to Train: {total:.2f} seconds")
    print(f"Saving Model!")
    model_1.save_weights('cnn_model.weights.h5')
    return model_1,cnn_history

In [227]:
def compile_and_build_rnn():
    model_2 = build_simple_rnn_model()
    model_2.compile(optimizer=tf.keras.optimizers.Adam(0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    start = time.time()
    rnn_history = model_2.fit(x=sentence_vec,y= IOB_vec, epochs=2, batch_size=64)
    end = time.time()
    total = end - start
    
    print("\n")
    print(f"\nTotal Time to Train: {total:.2f} seconds")
    print(f"Saving Model ! ")
    model_2.save_weights('rnn_model.weights.h5')
    return model_2,rnn_history

In [228]:
def compile_and_build_mlp():
    model_3 = build_mlp_model()
    model_3.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    start = time.time()
    mlp_history = model_3.fit(x=sentence_vec,y=IOB_vec,epochs=2,batch_size=64)
    end = time.time()
    total = end - start
    print("\n")
    print(f"(Total Time to Train: {total:.2f} seconds")
    print(f"Saving Model ! ")
    model_3.save_weights('mlp_model.weights.h5')
    return model_3,mlp_history

In [229]:
model_1,cnn_history = compile_and_build_cnn()

None
Epoch 1/2
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 66ms/step - accuracy: 0.9368 - loss: 0.3054
Epoch 2/2
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 66ms/step - accuracy: 0.9929 - loss: 0.0254


Total Time to Train: 83.13 seconds
Saving Model!


In [230]:
model_2,rnn_history = compile_and_build_rnn()

None
Epoch 1/2
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 70ms/step - accuracy: 0.9488 - loss: 0.2376
Epoch 2/2
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 69ms/step - accuracy: 0.9924 - loss: 0.0279



Total Time to Train: 87.37 seconds
Saving Model ! 


In [231]:
model_3,mlp_history = compile_and_build_mlp()

None
Epoch 1/2
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 106ms/step - accuracy: 0.9564 - loss: 0.3484
Epoch 2/2
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 102ms/step - accuracy: 0.9901 - loss: 0.0360


(Total Time to Train: 163.83 seconds
Saving Model ! 


## Model Performance on a given Sentence

#### Loading the Model Weights after saving. 

In [232]:
#loading the models saved weights
cnn_ = build_cnn_model()
cnn_.load_weights("cnn_model.weights.h5")

rnn_ = build_simple_rnn_model()
rnn_.load_weights("rnn_model.weights.h5")

mlp_ = build_mlp_model()
mlp_.load_weights("mlp_model.weights.h5")


None


None


None


In [234]:
r_sentence = 'Bob bought a ticket to NewYork City yesterday.'

pred_1 = get_real_prediction(sentence=r_sentence, model=cnn_)
pred_2 = get_real_prediction(sentence=r_sentence, model=rnn_)
pred_3 = get_real_prediction(sentence=r_sentence, model=mlp_)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step


In [235]:
print(pred_1)
print(pred_2)
print(pred_3)

['B-per', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O']
['B-per', 'O', 'O', 'O', 'O', 'O', 'I-geo', 'O']
['B-per', 'O', 'O', 'O', 'O', 'O', 'I-geo', 'O']


In [236]:
viz_result(sentence=r_sentence, prediction=pred_1)

Sentence : ['Bob', 'bought', 'a', 'ticket', 'to', 'NewYork', 'City', 'yesterday.']

Tags : B-per O O O O B-geo I-geo O

╭────────────┬───────╮
│ Word       │ Tag   │
├────────────┼───────┤
│ Bob        │ B-per │
│ bought     │ O     │
│ a          │ O     │
│ ticket     │ O     │
│ to         │ O     │
│ NewYork    │ B-geo │
│ City       │ I-geo │
│ yesterday. │ O     │
╰────────────┴───────╯


In [237]:
viz_result(sentence=r_sentence, prediction=pred_2)

Sentence : ['Bob', 'bought', 'a', 'ticket', 'to', 'NewYork', 'City', 'yesterday.']

Tags : B-per O O O O O I-geo O

╭────────────┬───────╮
│ Word       │ Tag   │
├────────────┼───────┤
│ Bob        │ B-per │
│ bought     │ O     │
│ a          │ O     │
│ ticket     │ O     │
│ to         │ O     │
│ NewYork    │ O     │
│ City       │ I-geo │
│ yesterday. │ O     │
╰────────────┴───────╯


In [238]:
viz_result(sentence=r_sentence, prediction=pred_3)

Sentence : ['Bob', 'bought', 'a', 'ticket', 'to', 'NewYork', 'City', 'yesterday.']

Tags : B-per O O O O O I-geo O

╭────────────┬───────╮
│ Word       │ Tag   │
├────────────┼───────┤
│ Bob        │ B-per │
│ bought     │ O     │
│ a          │ O     │
│ ticket     │ O     │
│ to         │ O     │
│ NewYork    │ O     │
│ City       │ I-geo │
│ yesterday. │ O     │
╰────────────┴───────╯


## Model Metrics on Test Set

In [240]:
get_model_metrics(test=senteces_test, tags_to_predict=tags_test, model=model_1)

300/300 - 4s - 14ms/step - accuracy: 0.9934 - loss: 0.0219


(0.021892352029681206, 0.9934338331222534)

In [241]:
get_model_metrics(test=senteces_test, tags_to_predict=tags_test, model=model_2)

300/300 - 6s - 20ms/step - accuracy: 0.9921 - loss: 0.0261


(0.026128478348255157, 0.9921217560768127)

In [242]:
get_model_metrics(test=senteces_test, tags_to_predict=tags_test, model=model_3)

300/300 - 9s - 29ms/step - accuracy: 0.9894 - loss: 0.0383


(0.038306012749671936, 0.9894407391548157)