In [1]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Load the token-level NER corpus; the file is decoded as ISO-8859-1 instead of
# pandas' default encoding.
data = pd.read_csv("./datasets/ner_dataset.csv", encoding = "ISO-8859-1")
# NOTE(review): a notebook cell only renders its last expression, so this
# shape lookup is evaluated and then discarded.
data.shape
data

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


In [3]:
# Drop the part-of-speech column. errors="ignore" keeps the cell re-runnable:
# on a second execution "POS" is already gone and a plain drop would raise.
data = data.drop(columns=["POS"], errors="ignore")
# Only the first token of each sentence carries the "Sentence #" label, so
# forward-fill it onto the following rows. DataFrame.ffill() replaces the
# deprecated fillna(method="ffill") spelling.
# NOTE(review): the fill applies to every column, so a missing Word/Tag would
# silently inherit the previous row's value - confirm only "Sentence #" has gaps.
data = data.ffill()
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,Sentence: 1,of,O
2,Sentence: 1,demonstrators,O
3,Sentence: 1,have,O
4,Sentence: 1,marched,O


In [4]:
def GetNextSentenceAndTags(data):
    """Collect the words and the tags of every sentence into two parallel Series.

    Parameters
    ----------
    data : pandas.DataFrame
        Token-level frame with "Sentence #", "Word" and "Tag" columns,
        one row per token.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(grouped_words, grouped_tags)``. Both are indexed by the
        "Sentence #" value and hold, per sentence, the list of its words
        (respectively tags) in row order. Because ``groupby`` sorts its keys,
        the sentences come out in lexicographic label order
        ("Sentence: 1", "Sentence: 10", "Sentence: 100", ...).
    """
    # Group once and pick the column *before* applying: this avoids grouping
    # the frame twice and avoids applying a function over group frames that
    # still contain the grouping column (deprecated in recent pandas).
    by_sentence = data.groupby("Sentence #")
    grouped_words = by_sentence["Word"].apply(list)
    grouped_tags = by_sentence["Tag"].apply(list)
    return grouped_words, grouped_tags

# group[0] holds the per-sentence word lists, group[1] the matching tag lists.
group = GetNextSentenceAndTags(data)
# Preview only the first few sentences; rendering every sentence of the corpus
# floods the notebook output.
group[0].head().to_list()

[['Thousands',
  'of',
  'demonstrators',
  'have',
  'marched',
  'through',
  'London',
  'to',
  'protest',
  'the',
  'war',
  'in',
  'Iraq',
  'and',
  'demand',
  'the',
  'withdrawal',
  'of',
  'British',
  'troops',
  'from',
  'that',
  'country',
  '.'],
 ['Iranian',
  'officials',
  'say',
  'they',
  'expect',
  'to',
  'get',
  'access',
  'to',
  'sealed',
  'sensitive',
  'parts',
  'of',
  'the',
  'plant',
  'Wednesday',
  ',',
  'after',
  'an',
  'IAEA',
  'surveillance',
  'system',
  'begins',
  'functioning',
  '.'],
 ['Helicopter',
  'gunships',
  'Saturday',
  'pounded',
  'militant',
  'hideouts',
  'in',
  'the',
  'Orakzai',
  'tribal',
  'region',
  ',',
  'where',
  'many',
  'Taliban',
  'militants',
  'are',
  'believed',
  'to',
  'have',
  'fled',
  'to',
  'avoid',
  'an',
  'earlier',
  'military',
  'offensive',
  'in',
  'nearby',
  'South',
  'Waziristan',
  '.'],
 ['They',
  'left',
  'after',
  'a',
  'tense',
  'hour-long',
  'standoff',
  'wi

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Distinct surface forms, in order of first appearance in the corpus.
words = data.Word.unique()

# Lower-case every form; dict.fromkeys collapses case-duplicates while keeping
# first-seen order. Ids are handed out from 1 because 0 is reserved for padding.
lowered_forms = dict.fromkeys(token.lower() for token in words)
vocabulary = {token: position for position, token in enumerate(lowered_forms, start=1)}
vocabulary["<PAD>"] = 0

# Lower-cased tag -> integer id, numbered from 0 in order of first appearance.
classes = {tag.lower(): tag_id for tag_id, tag in enumerate(data.Tag.unique())}

def ProcessWordsAsNumber(group, groupIndex, mapping):
    """Replace, in place, every token of ``group[groupIndex]`` by its integer id.

    Parameters
    ----------
    group : sequence
        Container whose ``groupIndex``-th element is an iterable of mutable
        token lists (a pandas Series of lists or a plain list of lists).
    groupIndex : int
        Which element of ``group`` to convert.
    mapping : dict
        Lower-cased token -> integer id. Tokens missing from the mapping are
        replaced by ``mapping["<PAD>"]``.

    Returns
    -------
    The same ``group`` object, with the selected lists rewritten in place.

    Raises
    ------
    KeyError
        If a token is unknown and ``mapping`` has no "<PAD>" entry.
    """
    # Iterate over the container itself rather than indexing it with integers:
    # integer lookups on a Series with string labels depend on a positional
    # fallback that recent pandas versions deprecate.
    for sentence in group[groupIndex]:
        for position, token in enumerate(sentence):
            if not isinstance(token, str):
                # Already converted by an earlier call (e.g. the cell was
                # re-run); leave the id untouched instead of crashing on .lower().
                continue
            key = token.lower()
            sentence[position] = mapping[key] if key in mapping else mapping["<PAD>"]
    return group

# Encode the word lists (slot 0) with the vocabulary and the tag lists (slot 1)
# with the tag ids. Both calls rewrite the lists stored in `group` in place.
ProcessWordsAsNumber(group, groupIndex=0, mapping=vocabulary)
ProcessWordsAsNumber(group, groupIndex=1, mapping=classes)

(Sentence #
 Sentence: 1        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
 Sentence: 10       [122, 123, 124, 43, 125, 8, 126, 127, 8, 128, ...
 Sentence: 100      [903, 904, 350, 905, 219, 906, 12, 10, 907, 24...
 Sentence: 1000     [43, 813, 130, 46, 4358, 4359, 4360, 32, 1784,...
 Sentence: 10000    [151, 803, 1133, 7067, 8742, 168, 338, 91, 146...
                                          ...                        
 Sentence: 9995     [343, 521, 8361, 9434, 9435, 148, 168, 485, 10...
 Sentence: 9996     [62, 422, 91, 122, 208, 1134, 2433, 46, 2308, ...
 Sentence: 9997     [1224, 108, 67, 7331, 1197, 4382, 349, 91, 170...
 Sentence: 9998     [506, 3018, 91, 838, 4, 778, 401, 2716, 2, 10,...
 Sentence: 9999     [10, 561, 786, 76, 7681, 10, 1765, 2, 298, 795...
 Length: 47959, dtype: object,
 Sentence #
 Sentence: 1        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...
 Sentence: 10       [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
 Sentence: 100      [0, 0, 7, 0, 0,

In [6]:
# Bring every sentence to 50 entries, padding at the end. Inputs are padded
# with the "<PAD>" word id, labels with the id of the "o" tag.
x = pad_sequences(
    sequences=group[0],
    maxlen=50,
    padding="post",
    value=vocabulary["<PAD>"],
)
y = pad_sequences(
    sequences=group[1],
    maxlen=50,
    padding="post",
    value=classes["O".lower()],
)
x,y

(array([[   1,    2,    3, ...,    0,    0,    0],
        [ 122,  123,  124, ...,    0,    0,    0],
        [ 903,  904,  350, ...,    0,    0,    0],
        ...,
        [1224,  108,   67, ...,    0,    0,    0],
        [ 506, 3018,   91, ...,    0,    0,    0],
        [  10,  561,  786, ...,    0,    0,    0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [2, 0, 0, ..., 0, 0, 0],
        [0, 0, 7, ..., 0, 0, 0],
        ...,
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 5, 6, ..., 0, 0, 0]]))

In [21]:
from tensorflow import keras
import tensorflow_hub as hub
elmo_model = hub.load("https://tfhub.dev/google/elmo/3")
batch_size = 128
max_len= 50
def ElmoEmbedding(x):
    """Embed a batch of token sequences with the ELMo "tokens" signature.

    Parameters
    ----------
    x : tensor
        Batch of sequences; assumed to be shaped (batch, max_len) - TODO confirm.

    Returns
    -------
    The "elmo" entry of the module's output dictionary.
    """
    # NOTE(review): `x` holds vocabulary ids here, so the cast hands ELMo the
    # ids rendered as text rather than the original words - presumably the
    # module expects the word strings themselves; confirm and feed raw tokens.
    # It is also unclear whether tf.cast supports integer -> string at all
    # (tf.strings.as_string is the dedicated op) - verify.
    # The previous axis-less tf.squeeze was dropped: on a (batch, max_len)
    # input it only had an effect for a batch of one, where it removed the
    # batch dimension itself.
    tokens = tf.cast(x, tf.string)
    # Size sequence_len from the batch actually received instead of the fixed
    # `batch_size`; the last training batch and predict() calls are smaller.
    sequence_len = tf.fill([tf.shape(x)[0]], max_len)
    # NOTE(review): this is the hub.Module calling convention; an object
    # returned by hub.load may instead need
    # elmo_model.signatures["tokens"](tokens=..., sequence_len=...) - confirm
    # against the installed tensorflow_hub version.
    return elmo_model(inputs={"tokens": tokens, "sequence_len": sequence_len},
                      signature="tokens",
                      as_dict=True)["elmo"]

model = keras.Sequential([
            keras.layers.Lambda(ElmoEmbedding, output_shape=(max_len,1024)),
            keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
            keras.layers.Dense(64, activation="relu"),
            keras.layers.Dense(len(classes), activation="softmax"),
])

# build() expects the full input shape including the batch axis; (max_len,)
# alone would be read as a batch of size max_len with no feature axis.
model.build(input_shape=(None, max_len))
model.summary()

# The last layer already applies softmax, so the loss must be told it receives
# probabilities; from_logits=True would apply a second softmax inside the loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])
history = model.fit(
            x=x,
            y=y,
            epochs=10,
            batch_size=batch_size,
            verbose=True,
            validation_split=.2
        )
print(history.history['accuracy'][-1])

TypeError: build() got an unexpected keyword argument 'dtype'

In [None]:
# Shapes of the padded word-id matrix and the padded tag-id matrix.
x.shape, y.shape

In [None]:
# Number of entries in the word-to-id mapping (the count includes "<PAD>").
len(vocabulary)

In [None]:
def ProcessWordsAsNumber(group, groupIndex, mapping):
    """Replace, in place, every token of ``group[groupIndex]`` by its integer id.

    Parameters
    ----------
    group : sequence
        Container whose ``groupIndex``-th element is an iterable of mutable
        token lists (a pandas Series of lists or a plain list of lists).
    groupIndex : int
        Which element of ``group`` to convert.
    mapping : dict
        Lower-cased token -> integer id. Tokens missing from the mapping are
        replaced by ``mapping["<PAD>"]``.

    Returns
    -------
    The same ``group`` object, with the selected lists rewritten in place.

    Raises
    ------
    KeyError
        If a token is unknown and ``mapping`` has no "<PAD>" entry.
    """
    # Iterate over the container itself rather than indexing it with integers:
    # integer lookups on a Series with string labels depend on a positional
    # fallback that recent pandas versions deprecate.
    for sentence in group[groupIndex]:
        for position, token in enumerate(sentence):
            if not isinstance(token, str):
                # Already converted by an earlier call (e.g. the cell was
                # re-run); leave the id untouched instead of crashing on .lower().
                continue
            key = token.lower()
            sentence[position] = mapping[key] if key in mapping else mapping["<PAD>"]
    return group

phrase = "Hello my name is Clóvis from Chatbot Maker. My name is Thompson, how are you ?"
# Split punctuation into its own tokens (keeping hyphenated words whole), as
# the training corpus does; a plain str.split() would leave "Maker." and
# "Thompson," glued together and therefore out of vocabulary. Lower-casing is
# handled inside ProcessWordsAsNumber.
phrase = re.findall(r"\w+(?:-\w+)*|[^\w\s]", phrase)
# Wrap the single sentence so it matches the group[groupIndex][sentence][token]
# layout expected by ProcessWordsAsNumber, then keep the encoded sentence list.
phrase = ProcessWordsAsNumber([[phrase]],0,vocabulary)[0]
phrase = pad_sequences(maxlen=50, sequences=phrase, padding="post", value=vocabulary["<PAD>"])
# Run the model once and reuse the result; previously the argmax was computed
# and thrown away, and predict() was executed a second time.
probabilities = model.predict(phrase)
predicted_tag_ids = np.argmax(probabilities, axis=2)
predicted_tag_ids, probabilities

In [None]:
# Show the tag-to-id mapping, e.g. to translate predicted ids back to tags.
classes