# Bi-LSTM-CRF Model

In [4]:
# Upload the local helper module (prepro.py) into the Colab runtime so that the
# later `from prepro import readstring` can be resolved.
from google.colab import files
uploaded = files.upload()

Saving prepro.py to prepro.py


In [2]:
# Install the third-party packages imported below: keras-contrib (CRF layer, loss
# and metrics), sklearn_crfsuite (flat_* evaluation metrics) and Keras itself.
# NOTE(review): only keras is pinned (2.2.4). keras-contrib is installed from the
# repository's default branch and sklearn_crfsuite is unpinned, so a fresh run may
# resolve to versions other than the ones shown in the recorded output.
!sudo pip install git+https://www.github.com/keras-team/keras-contrib.git
!pip install sklearn_crfsuite
!pip install keras==2.2.4

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-mmvybom3
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-mmvybom3
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-cp36-none-any.whl size=101065 sha256=31b4613892f65e8ff787968626efb00f558ee02e8a3c1519083ae0df847d8017
  Stored in directory: /tmp/pip-ephem-wheel-cache-125ie3s2/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib
Installing collected packages: keras-contrib
Successfully installed keras-contrib-2.0.8
Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Co

In [0]:
import pandas as pd
import re
import numpy as np
import urllib.request
from sklearn.model_selection import train_test_split

#custom packages
from prepro import readstring

#keras and tensorflow packages
from keras.layers.merge import add
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
from keras_contrib.metrics import crf_accuracy


from sklearn_crfsuite.metrics import flat_classification_report,flat_f1_score,flat_precision_score

In [0]:
# Download the train/test splits from the author's GitHub repository
train_url = "https://raw.githubusercontent.com/deanhoperobertson/Named-Enitty-Recognition/master/Data/train.txt"
test_url = "https://raw.githubusercontent.com/deanhoperobertson/Named-Enitty-Recognition/master/Data/test.txt"

# Read each response inside a `with` block so the HTTP connection is closed as
# soon as the body has been consumed, then decode the raw bytes as UTF-8 text.
with urllib.request.urlopen(train_url) as response:
    train = response.read().decode('utf-8')
with urllib.request.urlopen(test_url) as response:
    test = response.read().decode('utf-8')

def readstring(filename, meth):
    """Parse CoNLL-style text into sentences of ``[token, tag]`` pairs.

    Note that this definition replaces the ``readstring`` imported from
    ``prepro`` earlier in the notebook.

    Parameters
    ----------
    filename : str
        Despite the name, this is the already-loaded file *content*, not a
        path. Each non-blank line holds space-separated columns; the first
        column is taken as the token and the last column as the tag.
    meth : str
        When equal to "numbers" (case-insensitive) every token is passed
        through ``hasNumbers`` before being stored; any other value keeps the
        token unchanged.

    Returns
    -------
    list
        One list per sentence, each containing ``[token, tag]`` lists.
        Sentences are delimited by blank lines and by lines starting with
        ``-DOCSTART``.
    """
    mask_digits = meth.lower() == "numbers"
    sentences = []
    sentence = []
    for raw_line in filename.split('\n'):
        # Drop the carriage return left behind by Windows line endings so that
        # "\r" is not mistaken for a token.
        line = raw_line.rstrip('\r')
        # Blank or whitespace-only lines, and document markers, end a sentence.
        if not line.strip() or line.startswith('-DOCSTART'):
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue
        splits = line.split(' ')
        token = hasNumbers(splits[0]) if mask_digits else splits[0]
        sentence.append([token, splits[-1].strip()])
    # Flush the last sentence when the text does not end with a blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

def hasNumbers(inputString):
    """Mask a token that contains a digit.

    Returns the placeholder "__" when ``inputString`` contains at least one
    character matched by the regex digit class; otherwise returns
    ``inputString`` unchanged.
    """
    contains_digit = re.search(r'\d', inputString) is not None
    return "__" if contains_digit else inputString

# Preprocess the raw text: parse each split into sentences of [token, tag]
# pairs, with digit-containing tokens masked ("numbers" mode).
train_data, test_data = (readstring(raw_text, "numbers") for raw_text in (train, test))

# Create the corpus: training sentences followed by test sentences, as a new
# list so that neither input list is modified.
corpus = train_data + test_data

In [0]:
def reformat_data(data,meth):
  """Flatten tagged sentences into one space-joined string per sentence.

  `data` is an iterable of sentences, each a sequence of items indexable at
  0 and 1 (here: [token, tag] pairs). When `meth` equals "data"
  (case-insensitive) index 0 of every item is used; for any other value
  index 1 is used.
  """
  column = 0 if meth.lower() == "data" else 1
  return [' '.join(item[column] for item in sentence) for sentence in data]

def get_max_length(corpus):
  """Return the length of the longest sentence in `corpus` as an int.

  Raises ValueError when `corpus` is empty, since there is nothing to take
  the maximum of.
  """
  return int(max(map(len, corpus)))

def number_of_tags(corpus):
  """Count the distinct tags (index 1 of every item) across all sentences."""
  distinct_tags = {item[1] for sentence in corpus for item in sentence}
  return len(distinct_tags)


# Padding length: number of tokens in the longest sentence of train + test.
MAX_LEN = get_max_length(corpus)
# Number of distinct tag strings seen across train + test.
N_tags = number_of_tags(corpus)

# NOTE: `train` and `test` are rebound here, from the raw downloaded text to
# lists holding one space-joined token string per sentence.
train = reformat_data(train_data,"data")
test = reformat_data(test_data,"data")

In [0]:
# Create a word-level tokenizer: input is lower-cased, "}" is the only filtered
# character, and 'UNK' is registered as the out-of-vocabulary token.
token_word = text.Tokenizer(char_level=False, lower=True, filters="}", oov_token='UNK')
# The vocabulary is fitted on the training sentences only.
token_word.fit_on_texts(train)

# Convert text to sequences of token indices and pad them at the end ("post")
# to MAX_LEN so that every row has the same length.
X_train = sequence.pad_sequences(token_word.texts_to_sequences(train), maxlen=MAX_LEN,padding="post")
X_test = sequence.pad_sequences(token_word.texts_to_sequences(test), maxlen=MAX_LEN,padding="post")

In [0]:
# NOTE: `train` and `test` are rebound again, now to one space-joined string of
# tags per sentence.
train = reformat_data(train_data,"tags")
test = reformat_data(test_data,"tags")

# Create a tokenizer for the tags: case is preserved (lower=False) and the tag
# vocabulary is fitted on the training tags only.
token_tag = text.Tokenizer(char_level=False, lower=False, filters="}")
token_tag.fit_on_texts(train)

# Convert tag strings to sequences of tag indices and pad them at the end
# ("post") to MAX_LEN, mirroring the word sequences.
Y = sequence.pad_sequences(token_tag.texts_to_sequences(train), maxlen=MAX_LEN,padding="post")
# One-hot encode the labels; N_tags+1 classes leaves room for index 0, which is
# the value found at padded positions.
Y_train = [to_categorical(i, num_classes=N_tags+1) for i in Y]

# `Y` is reused as a scratch variable for the test split.
Y = sequence.pad_sequences(token_tag.texts_to_sequences(test), maxlen=MAX_LEN,padding="post")
# One-hot encode the test labels the same way.
Y_test = [to_categorical(i, num_classes=N_tags+1) for i in Y]

# Give index 0 a readable name so padded positions can be decoded back to a tag.
token_tag.index_word[0]="PAD"
# Labels to report on during evaluation: every tag except 'O' and 'PAD'.
sub_label = list(token_tag.index_word.values())
sub_label.remove('O')
sub_label.remove('PAD')

## Building the Bi-LSTM-CRF Model

In [0]:
# Model definition
EMBEDDING=50  # dimensionality of the learned word embeddings

# Named `input_layer` rather than `input` so that the Python builtin `input()`
# is not shadowed for the rest of the notebook session.
input_layer = Input(shape=(MAX_LEN,))
# Vocabulary size is +1 because index 0 is reserved for padding, which
# mask_zero=True tells downstream layers to skip.
emb = Embedding(len(token_word.word_index) + 1, output_dim=EMBEDDING,
                input_length=MAX_LEN, mask_zero=True, trainable=True)(input_layer)

# Two stacked bidirectional LSTMs, each returning the full sequence so that
# every time step can be labelled.
lstm1 = Bidirectional(LSTM(units=300, return_sequences=True,
                           recurrent_dropout=0.1, dropout=0.2))(emb)

lstm2 = Bidirectional(LSTM(units=300, return_sequences=True,
                           recurrent_dropout=0.1, dropout=0.2))(lstm1)

# Per-time-step dense projection feeding the CRF. Kept in its own name so that
# `model` only ever refers to the final Model object.
dense_out = TimeDistributed(Dense(50, activation="relu"))(lstm2)
crf = CRF(N_tags+1)  # CRF layer; +1 output class for the padding index
out = crf(dense_out)  # output
model = Model(input_layer, out)

In [28]:
# Compile with the keras_contrib CRF loss and track the CRF Viterbi accuracy
# metric, then print the layer-by-layer summary.
model.compile(optimizer="adam", loss=crf_loss,metrics=[crf_viterbi_accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 124)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 124, 50)           858250    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 124, 600)          842400    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 124, 600)          2162400   
_________________________________________________________________
time_distributed_2 (TimeDist (None, 124, 50)           30050     
_________________________________________________________________
crf_2 (CRF)                  (None, 124, 9)            558       
Total params: 3,893,658
Trainable params: 3,893,658
Non-trainable params: 0
_________________________________________________________________


In [29]:
%%time
# The %%time cell magic above has to stay on the first line of the cell.
BATCH_SIZE = 200
EPOCHS=20  # upper bound; early stopping below may end training sooner
from keras.callbacks import EarlyStopping
# Stop once the validation loss has failed to improve for 2 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

# 20% of the training samples are held out by Keras for validation.
# NOTE(review): no random seed is set in the visible cells, so results are
# expected to vary from run to run.
history = model.fit(X_train, np.array(Y_train), 
                    batch_size=BATCH_SIZE, epochs=EPOCHS, 
                    validation_split=0.2, 
                    verbose=1,
                    callbacks=[early_stopping])

Train on 11232 samples, validate on 2809 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
CPU times: user 15min 50s, sys: 1min 22s, total: 17min 12s
Wall time: 10min 11s


## Predict on Training Set

In [30]:
# Run the trained model on the training inputs.
pred = model.predict(X_train, verbose=1)



In [31]:
# Collapse the last axis to class indices, for the model output and for the
# one-hot training labels alike.
preds = np.argmax(pred, axis=-1)
y_tr_true = np.argmax(Y_train, -1)

# Convert each index back to its tag string (index 0 decodes to 'PAD').
pred_tag = [[token_tag.index_word[i] for i in row] for row in preds]
y_tr_true_tag = [[token_tag.index_word[i] for i in row] for row in y_tr_true]


# Per-label report restricted to sub_label, i.e. every tag except 'O' and 'PAD'.
report = flat_classification_report(y_pred=pred_tag, y_true=y_tr_true_tag,labels=sub_label)
print(report)
# Micro-averaged F1 score over the same labels, printed as a percentage.
score=flat_f1_score(y_pred=pred_tag, y_true=y_tr_true_tag,average='micro',labels=sub_label)
print("F1=%.2f"%(score*100))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       I-PER       0.98      0.94      0.96     11128
       I-ORG       0.83      0.81      0.82     10001
       I-LOC       0.88      0.93      0.91      8286
      I-MISC       0.86      0.77      0.82      4556
      B-MISC       0.00      0.00      0.00        37
       B-ORG       1.00      0.96      0.98        24
       B-LOC       0.00      0.00      0.00        11

   micro avg       0.90      0.88      0.89     34043
   macro avg       0.65      0.63      0.64     34043
weighted avg       0.90      0.88      0.89     34043

F1=88.66


## Predict on Test Set

In [32]:
# Run the trained model on the test inputs (this rebinds `pred`).
pred = model.predict(X_test, verbose=1)



In [33]:
# Test Eval
# Collapse the last axis to class indices. Dedicated test_* names are used so
# that the training-set evaluation variables (preds, pred_tag, y_tr_true,
# y_tr_true_tag, report, score) are not overwritten with test-set values.
test_preds = np.argmax(pred, axis=-1)
y_test_true = np.argmax(Y_test, -1)

# Convert each index back to its tag string (index 0 decodes to 'PAD').
test_pred_tag = [[token_tag.index_word[i] for i in row] for row in test_preds]
y_test_true_tag = [[token_tag.index_word[i] for i in row] for row in y_test_true]

# Per-label report restricted to sub_label, i.e. every tag except 'O' and 'PAD'.
test_report = flat_classification_report(y_pred=test_pred_tag, y_true=y_test_true_tag, labels=sub_label)
print(test_report)

# Micro-averaged F1 score over the same labels, printed as a percentage.
test_score = flat_f1_score(y_pred=test_pred_tag, y_true=y_test_true_tag, average='micro', labels=sub_label)
print("F1=%.2f"%(test_score*100))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       I-PER       0.89      0.76      0.82      2773
       I-ORG       0.52      0.62      0.57      2491
       I-LOC       0.79      0.79      0.79      1919
      I-MISC       0.69      0.58      0.63       909
      B-MISC       0.00      0.00      0.00         9
       B-ORG       0.00      0.00      0.00         5
       B-LOC       0.00      0.00      0.00         6

   micro avg       0.71      0.70      0.71      8112
   macro avg       0.41      0.39      0.40      8112
weighted avg       0.73      0.70      0.71      8112

F1=70.69
