In [31]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

data_train = pd.read_csv("/content/gdrive/My Drive/DoDef/task2_train_temp.tsv", quotechar = "~",delimiter = "\t", encoding="utf-8-sig").fillna(method="ffill")
data_test = pd.read_csv("/content/gdrive/My Drive/DoDef/task2_test_temp.tsv",quotechar = "~", delimiter = "\t", encoding="utf-8-sig").fillna(method="ffill")

In [33]:
words_train = list(set(data_train["Word"].values))
words_train.append("ENDPAD")
n_words_train = len(words_train); n_words_train

words_test = list(set(data_test["Word"].values))
words_test.append("ENDPAD")
n_words_test = len(words_test); n_words_test

1493

In [34]:
tags_train = list(set(data_train["Tag"].values))
n_tags_train = len(tags_train); n_tags_train

tags_test = list(set(data_test["Tag"].values))
n_tags_test = len(tags_test); n_tags_test

3

In [0]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [0]:
getter_train = SentenceGetter(data_train)
getter_test = SentenceGetter(data_test)

In [0]:
sentences_train = getter_train.sentences
sentences_test = getter_test.sentences

In [0]:
max_len = 50
tag2idx_train = {t: i for i, t in enumerate(tags_train)}
tag2idx_test = {t: i for i, t in enumerate(tags_test)}

In [0]:
X_train = [[w[0] for w in s] for s in sentences_train]
X_test = [[w[0] for w in s] for s in sentences_test]

In [0]:
new_X = []
for seq in X_train:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("__PAD__")
    new_X.append(new_seq)
X_train = new_X


new_X = []
for seq in X_test:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("__PAD__")
    new_X.append(new_seq)
X_test = new_X

In [0]:
y_train = [[tag2idx_train[w[2]] for w in s] for s in sentences_train]
y_test = [[tag2idx_test[w[2]] for w in s] for s in sentences_test]

In [0]:
from keras.preprocessing.sequence import pad_sequences
y_train = pad_sequences(maxlen=max_len, sequences=y_train, padding="post", value=tag2idx_train["O"])
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2idx_test["O"])

In [0]:
X_tr, X_te, y_tr, y_te = X_train,X_test,y_train,y_test

In [0]:
batch_size = 32

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

In [0]:
sess = tf.Session()
K.set_session(sess)

In [0]:
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [0]:
def ElmoEmbedding(x):
    return elmo_model(inputs={
                            "tokens": tf.squeeze(tf.cast(x, tf.string)),
                            "sequence_len": tf.constant(batch_size*[max_len])
                      },
                      signature="tokens",
                      as_dict=True)["elmo"]

In [0]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

In [50]:
input_text = Input(shape=(max_len,), dtype=tf.string)
embedding = Lambda(ElmoEmbedding, output_shape=(None, 1024))(input_text)
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
out = TimeDistributed(Dense(n_tags_test, activation="softmax"))(x)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
model = Model(input_text, out)

In [0]:
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [0]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy",get_f1])

In [0]:
X_tr, X_val = X_tr[:1056], X_tr[-1*batch_size:]
y_tr, y_val = y_tr[:1056], y_tr[-1*batch_size:]
y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
y_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)

In [55]:
history = model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_val), y_val),
                    batch_size=batch_size, epochs=5, verbose=1)

Train on 1056 samples, validate on 32 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [56]:
from sklearn.metrics import f1_score
og = []
predicted = []
for i in range(201):
  p = model.predict(np.array(X_te[i:i+batch_size]))[0]
  p = np.argmax(p, axis=-1)
  # print("{:15} {:5}: ({})".format("Word", "Pred", "True"))
  # print("="*30)
  for w, true, pred in zip(X_te[i], y_te[i], p):
      if w != "__PAD__":
          og.append(tags_test[true])
          predicted.append(tags_test[pred])
          # print("{:15}:{:5} ({})".format(w, tags_test[pred], tags_test[true]))
  # print (i)
print ("macro : " + str(f1_score(og, predicted, average='macro')))
print ("micro : " + str(f1_score(og, predicted, average='micro')))
print ("weighted : " + str(f1_score(og, predicted, average='weighted')))
print ("None : " + str(f1_score(og, predicted, average=None)))


macro : 0.6136762744421499
micro : 0.8955641688199828
weighted : 0.877362116455116
None : [0.58661417 0.30909091 0.94532374]


In [58]:
from sklearn.metrics import classification_report
print(classification_report(og, predicted, target_names=["B-ADR","I-ADR","O"]))

              precision    recall  f1-score   support

       B-ADR       0.69      0.51      0.59       291
       I-ADR       0.63      0.20      0.31       332
           O       0.91      0.98      0.95      4021

    accuracy                           0.90      4644
   macro avg       0.74      0.57      0.61      4644
weighted avg       0.88      0.90      0.88      4644



In [59]:
for i in range(len(og)):
  if og[i] == "B-ADR" or og[i] == "I-ADR":
    og[i] = "ADR"

for i in range(len(predicted)):
  if predicted[i] == "B-ADR" or predicted[i] == "I-ADR":
    predicted[i] = "ADR"
  
print(classification_report(og, predicted, target_names=["ADR","O"]))

              precision    recall  f1-score   support

         ADR       0.76      0.39      0.52       623
           O       0.91      0.98      0.95      4021

    accuracy                           0.90      4644
   macro avg       0.83      0.69      0.73      4644
weighted avg       0.89      0.90      0.89      4644

