In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use matplotlib's "ggplot" style sheet for every figure drawn in this notebook.
plt.style.use("ggplot")

In [None]:
# Read the tab-separated NER table from the working directory and forward-fill
# missing cells, so every row inherits the last value seen in its column.
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")` spelling;
# chaining keeps the cell re-runnable from the file alone.
data = pd.read_csv("./ner_dataset.csv", sep='\t').ffill()
data.tail(10)

In [None]:
# Vocabulary: every distinct value of the "Word" column, followed by the
# extra "ENDPAD" entry.
words = [*set(data["Word"].values), "ENDPAD"]
n_words = len(words)
n_words

In [None]:
# Distinct labels of the "Tag" column. The list is sorted because the
# iteration order of a set of strings changes between interpreter runs; an
# unsorted list would give every tag a different index after each kernel
# restart, making tag2idx (and any saved model) non-reproducible.
tags = sorted(set(data["Tag"].values))
n_tags = len(tags)
n_tags

In [None]:
class SentenceGetter(object):
    """Group a token-level table into sentences of ``(word, pos, tag)`` tuples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "Sentence", "Word", "POS" and "Tag". Rows that
        share a "Sentence" value are collected into one sentence.

    Attributes
    ----------
    n_sent : int
        Number used by ``get_next`` to build the key of the next sentence;
        starts at 1.
    data : pandas.DataFrame
        The table passed to the constructor.
    empty : bool
        Initialised to False; not otherwise used by this class.
    grouped : pandas.Series
        One list of ``(word, pos, tag)`` tuples per "Sentence" value.
    sentences : list
        The same lists, in the order in which ``grouped`` yields them.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Only the three token columns are handed to the aggregation; the
        # grouping column itself is not needed inside each group.
        token_columns = self.data.groupby("Sentence")[["Word", "POS", "Tag"]]
        self.grouped = token_columns.apply(self._to_triples)
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _to_triples(s):
        """Zip one sentence's Word/POS/Tag columns into a list of tuples."""
        return list(zip(s["Word"].values.tolist(),
                        s["POS"].values.tolist(),
                        s["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence, or None when there is none under that key.

        The sentence is looked up under the string form of ``n_sent``
        (``"1"``, ``"2"``, ...). ``n_sent`` only advances when a sentence is
        found. Only a missing key yields None; any other failure propagates
        to the caller instead of being silently swallowed.
        """
        key = "{}".format(self.n_sent)
        if key not in self.grouped.index:
            return None
        s = self.grouped.loc[key]
        self.n_sent += 1
        return s

In [None]:
# Group the whole table into sentences once; the cells below read
# `getter.sentences` and call `getter.get_next()`.
getter = SentenceGetter(data)

In [None]:
# Sanity check: fetch the next sentence from the getter (None if the lookup
# fails) and print its (word, pos, tag) tuples.
sent = getter.get_next()
print(sent)

In [None]:
# All sentences as lists of (word, pos, tag) tuples; the source for X and y below.
sentences = getter.sentences


In [None]:
# Number of tokens every sentence is padded or cut to before it enters the model.
max_len = 50
# Look-up table from tag string to its position in `tags`.
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
# Spot-check the integer index assigned to the "B-PER" tag.
tag2idx["B-PER"]


In [None]:
# The ELMo embedding from TensorFlow Hub takes strings as input, so we keep
# the raw tokens of each sentence here and pad them to the desired length in
# the next cell.

X = [[token for token, _pos, _tag in sentence] for sentence in sentences]

In [None]:
# Bring every sentence to exactly `max_len` tokens: longer sentences keep
# their first `max_len` tokens, shorter ones are filled up with "__PAD__".
# Slicing plus an explicit pad replaces the former index loop, which relied on
# a bare `except:` to detect the end of the sentence.
new_X = [seq[:max_len] + ["__PAD__"] * (max_len - len(seq)) for seq in X]
X = new_X

In [None]:
# This is what an input sample looks like now: a list of `max_len` token
# strings, padded with "__PAD__".

print(X[1])

In [None]:
# Do the same for the tag sequences, but map each tag string to its integer
# index first.

y = [[tag2idx[w[2]] for w in s] for s in sentences]
from keras.preprocessing.sequence import pad_sequences
# truncating="post" is required: X keeps the FIRST `max_len` tokens of a long
# sentence, while pad_sequences drops values from the front by default, which
# would pair those tokens with the tags of the sentence's LAST `max_len` words.
y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                  truncating="post", value=tag2idx["O"])
y[1]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the sentences as the test set; the fixed random_state makes
# the split repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=2018)

In [None]:
# Sentences per batch. ElmoEmbedding builds its `sequence_len` tensor with
# exactly this many entries, so training, validation and prediction inputs
# are all sized in units of batch_size.
batch_size = 24

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

In [None]:
#!pip3 install --upgrade --force-reinstall tensorflow-gpu
# Create a TensorFlow session and register it with the Keras backend, so the
# hub module initialised below and the Keras model run in the same session.
sess = tf.Session()
K.set_session(sess)

In [None]:
# Load the ELMo v2 module from TensorFlow Hub (requested as trainable), then
# run the global-variable and lookup-table initialisers in the shared session.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [None]:
# Now we create a function that takes a sequence of strings and returns a sequence of 
# 1024-dimensional vectors of the ELMo embedding. 
# We will later use this function with the Lambda layer of keras to get the embedding sequence.

def ElmoEmbedding(x):
    """Embed a batch of padded token sequences with the hub ELMo module.

    `x` is cast to strings and squeezed, then passed to `elmo_model` through
    its "tokens" signature; the "elmo" entry of the returned dict is the
    result.

    NOTE: `sequence_len` is a constant holding `max_len` repeated `batch_size`
    times, so every batch fed through this function must contain exactly
    `batch_size` rows, and padding tokens are counted as part of the sequence.
    """
    tokens = tf.squeeze(tf.cast(x, tf.string))
    lengths = tf.constant([max_len] * batch_size)
    outputs = elmo_model(inputs={"tokens": tokens, "sequence_len": lengths},
                         signature="tokens",
                         as_dict=True)
    return outputs["elmo"]

In [None]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

In [None]:

# Network: string tokens -> ELMo embedding -> two stacked bidirectional LSTMs
# joined by a residual connection -> per-token softmax over the tags.
input_text = Input(shape=(max_len,), dtype=tf.string)
# The Lambda layer wraps ElmoEmbedding so it can be used as a Keras layer.
embedding = Lambda(ElmoEmbedding, output_shape=(None, 1024))(input_text)
# First biLSTM: 512 units per direction, one output per timestep.
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
# Second biLSTM with the same configuration, stacked on the first.
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
# The same Dense softmax layer is applied at every timestep, giving one
# distribution over the n_tags labels per token.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

In [None]:
# Wrap the graph from the string input to the per-token tag distribution.
model = Model(input_text, out)

In [None]:
# The sparse loss lets the integer tag ids in y serve as targets directly,
# without one-hot encoding.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Carve a validation set off the end of the training split. Both parts are
# sized to whole multiples of batch_size because ElmoEmbedding builds its
# sequence_len tensor for exactly batch_size rows per batch.
# NOTE: this cell overwrites X_tr / y_tr; re-run the train/test split cell
# before running it again.
appr = int(np.floor(0.9*len(X_tr)/batch_size)*batch_size)  # number of training rows
ippr = int(np.floor(0.1*len(X_tr)/batch_size)*batch_size)  # number of validation rows
# appr and ippr are already row counts, so they must not be multiplied by
# batch_size again: doing so made both slices cover the whole array, which
# turned the validation set into a copy of the training set and left a
# partial final batch. `len(...) - ippr` (rather than `-ippr`) keeps the
# validation slice empty when ippr is 0.
X_tr, X_val = X_tr[:appr], X_tr[len(X_tr) - ippr:]
y_tr, y_val = y_tr[:appr], y_tr[len(y_tr) - ippr:]
# Add a trailing axis of size 1 to the label arrays.
y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
y_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)

In [None]:
# Train for 5 epochs, evaluating on the validation split after each one.
# The token lists are converted to numpy arrays of strings for Keras.
history = model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_val), y_val),
                    batch_size=batch_size, epochs=5, verbose=1)

In [None]:
# Per-epoch metrics recorded by fit(), as a DataFrame for plotting.
hist = pd.DataFrame(history.history)

In [None]:
# Training vs. validation accuracy per epoch. Each curve carries a label so
# that plt.legend() has entries to show (it was previously called with no
# labelled artists), and the axes are named so the figure stands on its own.
plt.figure(figsize=(12,12))
plt.plot(hist["accuracy"], label="training accuracy")
plt.plot(hist["val_accuracy"], label="validation accuracy")
plt.title("Learning curves")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Show predicted vs. true tags for one test sentence. A full batch of
# batch_size sentences starting at index i is fed to the model, because
# ElmoEmbedding only accepts batches of exactly that size; only the first
# prediction of the batch (sentence i) is kept.
i = 19
p = model.predict(np.array(X_te[i:i+batch_size]), batch_size=batch_size)[0]
p = np.argmax(p, axis=-1)  # most probable tag index per token
# Header and rows share one template so the columns line up.
print("{:15}:{:5} ({})".format("Word", "Pred", "True"))
print("="*30)
for w, true, pred in zip(X_te[i], y_te[i], p):
    # Skip the padding positions appended to short sentences.
    if w != "__PAD__":
        print("{:15}:{:5} ({})".format(w, tags[pred], tags[true]))