## Drug NER with LSTM and CRF

In [None]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import re

from keras.models import Sequential, Model
from keras.models import load_model
from keras.layers import Dense, Activation, Dropout, Embedding, Bidirectional
from keras.layers import LSTM
from keras.layers import TimeDistributed, Input
from keras.layers import Conv1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint
import gensim.models.word2vec as w2v
import h5py

from tqdm import tqdm_notebook as tqdm
from keras.utils import np_utils
from keras_contrib.layers import CRF
import matplotlib.pyplot as plt

import os
import csv
import sys
import gensim
import random
import pickle

In [None]:
def __read_csv(file: str) -> tuple:
    """Read a '|'-delimited file of tokenised sentences and their tag sequences.

    Every non-empty row is expected to hold two fields, each a Python list
    literal: the tokens of one sentence and the matching per-token labels.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A ``(sentences, labels, count)`` tuple where ``sentences`` and
        ``labels`` are parallel lists (one entry per row) and ``count`` is the
        number of rows read.

    Raises:
        ValueError, SyntaxError: If a field is not a valid Python literal.
        IndexError: If a non-empty row has fewer than two fields.
    """
    # Local import keeps this block self-contained.
    import ast

    # Rows can be very long, so lift the csv module's per-field size cap.
    # sys.maxsize does not fit in a C long on every platform (e.g. Windows),
    # in which case field_size_limit raises OverflowError; back off until the
    # value is accepted.
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            break
        except OverflowError:
            limit //= 2

    sentences, labels = [], []
    # newline='' is what the csv module requires for correct newline handling.
    with open(file, newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='|', quotechar='"')
        for item in reader:
            if not item:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            # SECURITY: the fields were previously passed to eval(), which
            # executes arbitrary code found in the data file. literal_eval
            # only accepts Python literals (lists, strings, numbers, ...).
            sentences.append(ast.literal_eval(item[0]))
            labels.append(ast.literal_eval(item[1]))
    return (sentences, labels, len(sentences))

In [None]:
def read_input_files(train_file: str ='./train_val_seg.csv',
                     test_file: str ='./test_seg.csv') -> tuple:
    """Load the training and test files and return them as one corpus.

    Args:
        train_file: Path of the training/validation CSV file.
        test_file: Path of the test CSV file.

    Returns:
        A ``(sentences, labels, train_num, test_num)`` tuple. ``sentences``
        and ``labels`` hold the training rows followed by the test rows;
        ``train_num`` and ``test_num`` are the row counts of the two files.
    """
    train_sentences, train_labels, train_num = __read_csv(train_file)
    test_sentences, test_labels, test_num = __read_csv(test_file)
    # Training rows come first so the corpus can later be split at train_num.
    all_sentences = train_sentences + test_sentences
    all_labels = train_labels + test_labels
    return (all_sentences, all_labels, train_num, test_num)

In [None]:
# Dimensionality of the word vectors (and of the Keras embedding layer).
EMBEDDING_DIM = 120

def fasttext_emb(sentences, modelpath, force=False):
    """Return a FastText embedding model, training it only when necessary.

    Args:
        sentences: Iterable of tokenised sentences (lists of words) used to
            train the model.
        modelpath: File the model is loaded from / saved to.
        force: When true, retrain and overwrite even if ``modelpath`` exists.

    Returns:
        The loaded or newly trained ``gensim.models.FastText`` model.
    """
    if not force and os.path.exists(modelpath):
        print ("Word embedding model exists, skipping model %s." % modelpath)
        # The file is written by FastText.save() below, so it must be read
        # back with FastText.load() rather than Word2Vec.load().
        return gensim.models.FastText.load(modelpath)

    print ("Generating word embedding model %s" % modelpath)
    # NOTE(review): gensim 3.x documents word_ngrams as a 0/1 switch; 5 is
    # kept as in the original -- confirm the intended value.
    model = gensim.models.FastText(sentences, size=EMBEDDING_DIM, window=5, min_count=1,
                                   workers=4, sg=1, word_ngrams=5, iter=10, seed=23)
    # Make sure the target directory exists before saving.
    model_dir = os.path.dirname(modelpath)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model.save(modelpath)
    return model

In [None]:
# Load the corpus (training rows first, then test rows) together with the
# number of rows that came from each file.
sentences, labels, train_num, test_num = read_input_files()
# Load the saved FastText model from ./emb/emb.bin, or train and save it.
model = fasttext_emb(sentences, './emb/emb.bin')
# Only the word vectors are needed from here on; note that the name `model`
# is rebound to the Keras network in a later cell.
word_vectors = model.wv

In [None]:
# One row per sentence: the token list and its parallel list of tag labels.
df = pd.DataFrame(index=range(len(sentences))).assign(
    sentences=sentences,
    labels=labels,
)


In [None]:
# Show the longest sentence and a few length percentiles to help choose a
# truncation length (maxlen).
print("Finding proper maxlen:")
sentence_lengths = df['sentences'].apply(len)
print("max:", max(sentence_lengths))
for caption, fraction in (("99-percentile:", 0.99),
                          ("95-percentile:", 0.95),
                          ("90-percentile:", 0.90)):
    # interpolation='lower' reports a length that actually occurs in the data.
    print(caption, sentence_lengths.quantile(fraction, interpolation='lower'))


In [None]:
# Cap every sequence at the 99th-percentile sentence length.
# Series.quantile is documented to return a float; maxlen is used below as a
# slice bound, a padding count and a layer shape, so make it a plain int.
maxlen = int(df['sentences'].apply(len).quantile(0.99, interpolation='lower'))

def truncate_to_maxlen(l: list) -> list:
    """Return at most the first ``maxlen`` items of ``l`` (module-level maxlen)."""
    return l[:maxlen]

df['sentences'] = df['sentences'].apply(truncate_to_maxlen)
df['labels'] = df['labels'].apply(truncate_to_maxlen)
# Sanity check: the longest sentence and the longest label sequence must both
# be exactly maxlen after truncation.
assert max(df['sentences'].apply(len)) == max(df['labels'].apply(len)) == maxlen, \
        "Error: Length mismatch after truncating!"
print("After truncating, new maxlen is:", max(df['sentences'].apply(len)))
# Use a plain 0..n-1 index so rows can be addressed by position later on.
df.index = range(len(df))

In [None]:
# Assign an index to each unique word in the corpus.
# The vocabulary is rebuilt only when no trained model exists; otherwise the
# pickled one is reused so word indices match those the saved weights were
# trained with.
if not os.path.exists('sci_ner_drug_model.h5'):
    all_words = [word for sentence in sentences for word in sentence]
    # value_counts() returns a pandas Series of unique words and their frequencies
    words_index = pd.Series(all_words).value_counts()
    # change the frequencies to range(), so that each word corresponds
    # to an index number
    words_index[:] = range(1, len(words_index) + 1)
    # index 0 is reserved (it is the padding value masked by the embedding layer)
    words_index[''] = 0
    with open("./emb/words_index.pkl", "wb") as f:
        pickle.dump(words_index, f)
else:
    # NOTE: pickle.load executes code embedded in the file; only load a
    # words_index.pkl that this notebook produced itself.
    with open("./emb/words_index.pkl", "rb") as f:
        words_index = pickle.load(f)

# Row i of the matrix holds the embedding of the word with index i.
embedding_matrix = np.zeros((len(words_index) + 1, EMBEDDING_DIM))
missing_vectors = 0
for word, i in words_index.items():
    if word not in word_vectors:
        # Words without a vector (e.g. the reserved '' entry) stay all-zeros.
        continue
    try:
        embedding_matrix[i] = word_vectors[word]
    except KeyError:
        # Should not happen because min_count is 1 during embedding training;
        # keep the all-zero row but do not hide the fact.
        missing_vectors += 1
if missing_vectors:
    print("Warning: %d words have no embedding vector and keep an all-zero row." % missing_vectors)

# Tag -> class id. 'X' (id 3) is the label given to padding positions.
tag = pd.Series({'B':0, 'I':1, 'O':2, 'X':3})
# Encode each sentence as word indices, right-padded with 0 up to maxlen.
df['x'] = df['sentences'].apply(lambda s: np.array(list(words_index[s]) + [0] * (maxlen - len(s))))

# Shuffle the training rows only. The rows are later split positionally at
# train_num, so the test rows (which follow the training rows) must stay in
# place; shuffling the whole frame would mix test sentences into the training
# part and produce a different "test set" on every run.
train_idx = list(range(train_num))
np.random.shuffle(train_idx)
idx = train_idx + list(range(train_num, len(df)))
df = df.loc[idx]

def trans_one(labels):
    """One-hot encode a tag sequence and pad it with 'X' up to maxlen.

    Args:
        labels: List of tag strings ('B', 'I', 'O' or 'X') for one sentence.

    Returns:
        Array with one one-hot row of width 4 per position (maxlen positions
        in total); padding positions are encoded as [0, 0, 0, 1].
    """
    tmpLabels = map(lambda s: np_utils.to_categorical(s, 4), tag[labels].values.reshape((-1, 1)))
    tmpLabels = list(tmpLabels)
    tmpLabels.extend([np.array([[0, 0, 0, 1]])] * (maxlen - len(labels)))
    return np.array(tmpLabels)
df['y'] = df['labels'].apply(trans_one)

In [None]:
# Number of rows (sentences) in the prepared DataFrame.
print(len(df['sentences']))

In [None]:
# Number of sentences in the raw corpus list, for comparison with the DataFrame.
len(sentences)

In [None]:
# Shape of the embedding matrix: (len(words_index) + 1, EMBEDDING_DIM).
print(embedding_matrix.shape)

### BiLSTM model with word-level embeddings and a CRF output layer

In [None]:
#BiLSTM + CRF
# Input: one padded sequence of word indices per sentence.
sequence = Input(shape=(maxlen,),dtype='int32')
# Frozen pretrained embeddings; mask_zero=True makes downstream layers ignore
# the padding index 0.
embedded = Embedding(len(words_index)+1, EMBEDDING_DIM, weights=[embedding_matrix],
                     input_length=maxlen, trainable=False, mask_zero=True)(sequence)
blstm = Bidirectional(LSTM(32, return_sequences=True))(embedded)
dropout = Dropout(0.1)(blstm)
dense = TimeDistributed(Dense(32, activation='relu'))(dropout)

# One CRF output unit per tag (B, I, O, X).
crf = CRF(4)
crf_output = crf(dense)

model = Model(inputs=sequence, outputs=crf_output)

model.compile(loss=crf.loss_function,
              optimizer='adam',
              metrics=[crf.accuracy])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None").
model.summary()

batch_size = 64

#generate input data
x = np.array(list(df['x']))
y = np.array(list(df['y']))
#adjust the shape of labels
y = y.reshape((-1, maxlen, 4))

# Positional split: the first train_num rows are used for training, the
# remaining rows for the final evaluation.
x_train, y_train = x[:train_num], y[:train_num]
x_test, y_test = x[train_num:], y[train_num:]

if os.path.exists('sci_ner_drug_model.h5'):
    print("Pretrained weights found, skipping training.")
    print("To retrain the model, delete the 'sci_ner_drug_model.h5' file in the current directory.")
    model.load_weights('sci_ner_drug_model.h5')
    # No training happened in this run, so there is no history to plot; bind
    # the name explicitly instead of leaving it undefined or stale.
    history = None
else:
    # Keep the weights of the epoch with the lowest validation loss.
    filepath="sci_ner_drug_model_best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    # validation_split=0.4 makes Keras hold out the last 40% of x_train.
    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=64, callbacks=callbacks_list,
                        validation_split=0.4)
    # Saves the model as it is after the final epoch (the best-epoch weights
    # are in the checkpoint file above).
    model.save('sci_ner_drug_model.h5')
print("Evaluation on the withheld test set:")
# evaluate() returns the loss followed by the compiled metrics, i.e.
# [loss, crf accuracy] here, despite the variable's name.
acc = model.evaluate(x_test, y_test, batch_size = batch_size)
print(acc)

In [None]:
# Plot training & validation curves.
# `history` is only produced by model.fit(); when pretrained weights were
# loaded instead, it is undefined (or None) and there is nothing to plot.
fit_history = globals().get('history')
if fit_history is None:
    print("No training history available (training was skipped); nothing to plot.")
else:
    curves = fit_history.history
    # (train key, validation key, figure title, y-axis label)
    for train_key, val_key, title, ylabel in (
            ('crf_viterbi_accuracy', 'val_crf_viterbi_accuracy',
             'Model CRF accuracy', 'CRF Accuracy'),
            ('loss', 'val_loss', 'Model loss', 'Loss')):
        plt.plot(curves[train_key])
        plt.plot(curves[val_key])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Val'], loc='upper left')
        plt.show()

In [None]:
# Plot training & validation curves.
# NOTE: this cell repeats the plotting cell directly above it.
# `history` is only produced by model.fit(); when pretrained weights were
# loaded instead, it is undefined (or None) and there is nothing to plot.
fit_history = globals().get('history')
if fit_history is None:
    print("No training history available (training was skipped); nothing to plot.")
else:
    curves = fit_history.history
    # (train key, validation key, figure title, y-axis label)
    for train_key, val_key, title, ylabel in (
            ('crf_viterbi_accuracy', 'val_crf_viterbi_accuracy',
             'Model CRF accuracy', 'CRF Accuracy'),
            ('loss', 'val_loss', 'Model loss', 'Loss')):
        plt.plot(curves[train_key])
        plt.plot(curves[val_key])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Val'], loc='upper left')
        plt.show()