<a href="https://colab.research.google.com/github/deanhoperobertson/Masters-/blob/master/Thesis/Code/Bi_LSTM_CRF_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bi-LSTM-CRF Model

In [1]:
from google.colab import files
src = list(files.upload().values())[0]
open('mylib.py','wb').write(src)
import mylib

Saving prepro.py to prepro (1).py


In [2]:
!sudo pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-tmiextlj
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-tmiextlj
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cache-fhk_a2m3/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib


In [3]:
!pip install sklearn_crfsuite



In [4]:
import pandas as pd
import numpy as np
import urllib.request
from sklearn.model_selection import train_test_split

#cusotm packages
from prepro import readfile, readstring

#keras and tensorflow packages
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss


#evaluation
from sklearn_crfsuite.metrics import flat_classification_report,flat_f1_score,flat_precision_score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

Using TensorFlow backend.


In [0]:
#import data from my github repo
train_url = "https://raw.githubusercontent.com/deanhoperobertson/Masters-/master/Thesis/Code/Data/train.txt"
test_url = "https://raw.githubusercontent.com/deanhoperobertson/Masters-/master/Thesis/Code/Data/test.txt"
train = urllib.request.urlopen(train_url).read()
test = urllib.request.urlopen(test_url).read()
train = train.decode('utf-8')
test = test.decode('utf-8')

#preproces the txt file
train = readstring(train)
test = readstring(test)

#create corpus
corpus = train.copy()
corpus.extend(test)

In [6]:
words = []
tags = []
for sentence in corpus:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])
        
words=list(set(words))
n_words = len(words)
print("Number of words in the dataset: ", n_words)
tags = list(set(tags))
n_tags = len(tags)
print("Number of Labels: ", n_tags)

Number of words in the dataset:  27316
Number of Labels:  9


In [7]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding
# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}
print("The word 'rejects' is identified by the index: {}".format(word2idx["rejects"]))

The word 'rejects' is identified by the index: 5934


In [8]:
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}
print("The labels B-LOC (location) is identified by the index: {}".format(tag2idx["B-LOC"]))

The labels B-LOC (location) is identified by the index: 6


In [9]:
#Find the maxium length of the all the sentences in the corpus
length = []
for sentence in corpus:
  length.append(len(sentence))

MAX_LEN= max(length)
print("The maxium length of sentence is:",max(length))

The maxium length of sentence is: 124


In [10]:
# Convert each sentence from list of Token to list of word_index
X = [[word2idx[w[0]] for w in s] for s in train]

# Padding each sentence to have the same lenght
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])
X

array([[ 1520,  5934,  3388, ...,     0,     0,     0],
       [13175, 14713,     0, ...,     0,     0,     0],
       [15260,  9454,     0, ...,     0,     0,     0],
       ...,
       [ 2209,  8687, 25488, ...,     0,     0,     0],
       [ 8472, 22530,     0, ...,     0,     0,     0],
       [ 1864,  3294,  6496, ...,     0,     0,     0]], dtype=int32)

In [11]:
# Convert Tag/Label to tag_index
y = [[tag2idx[w[1]] for w in s] for s in train]

# Padding each sentence to have the same lenght
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["PAD"])
y

array([[8, 5, 9, ..., 0, 0, 0],
       [2, 1, 0, ..., 0, 0, 0],
       [6, 5, 0, ..., 0, 0, 0],
       ...,
       [8, 5, 8, ..., 0, 0, 0],
       [5, 5, 0, ..., 0, 0, 0],
       [8, 5, 8, ..., 0, 0, 0]], dtype=int32)

In [0]:

# One-Hot encode categorical labels
y = [to_categorical(i, num_classes=n_tags+1) for i in y]

In [13]:
#split into test and train subsets
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
X_tr.shape,np.array(y_tr).shape, X_te.shape, np.array(y_te).shape

((12636, 124), (12636, 124, 10), (1405, 124), (1405, 124, 10))

## Building LSTM-CRF Model

In [14]:
# Model definition
EMBEDDING=40

input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input)

model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)

model = TimeDistributed(Dense(10, activation="relu"))(model)
crf = CRF(n_tags+1)  # CRF layer
out = crf(model)  # output
model = Model(input, out)

W0714 16:02:34.721079 140584038975360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0714 16:02:34.744913 140584038975360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0714 16:02:34.749391 140584038975360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0714 16:02:34.894826 140584038975360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0714 16:02:34.906877 

In [15]:
model.compile(optimizer="rmsprop", loss=crf_loss)#, metrics=["accuracy"])
model.summary()

W0714 16:02:35.652342 140584038975360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 124)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 124, 40)           1092720   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 124, 100)          36400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 124, 10)           1010      
_________________________________________________________________
crf_1 (CRF)                  (None, 124, 10)           230       
Total params: 1,130,360
Trainable params: 1,130,360
Non-trainable params: 0
_________________________________________________________________


In [16]:
%%time
BATCH_SIZE = 500
EPOCHS=10

history = model.fit(X_tr, np.array(y_tr), batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.1, verbose=1)

W0714 16:02:38.067074 140584038975360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 11372 samples, validate on 1264 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 4min 16s, sys: 25.2 s, total: 4min 41s
Wall time: 2min 53s


## Evaluation

In [17]:
pred = model.predict(X_tr, verbose=1)



In [18]:
# TRain Eval
pred_cat = model.predict(X_tr)
pred = np.argmax(pred_cat, axis=-1)
y_tr_true = np.argmax(y_tr, -1)

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_tr_true_tag = [[idx2tag[i] for i in row] for row in y_tr_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=y_tr_true_tag)
print(report)

              precision    recall  f1-score   support

       B-LOC       0.47      0.94      0.63      6469
      B-MISC       0.67      0.17      0.27      3121
       B-ORG       0.72      0.51      0.60      5691
       B-PER       0.91      0.60      0.73      5867
       I-LOC       0.54      0.44      0.49      1050
      I-MISC       0.09      0.13      0.11      1062
       I-ORG       0.84      0.29      0.43      3335
       I-PER       0.83      0.74      0.79      4022
           O       0.98      1.00      0.99    153058
         PAD       1.00      1.00      1.00   1383189

    accuracy                           0.99   1566864
   macro avg       0.71      0.58      0.60   1566864
weighted avg       0.99      0.99      0.99   1566864



In [19]:
X_pred = model.predict(X_tr, verbose=1)
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(X_pred)
test_labels = pred2label(y_tr)



In [20]:
print(flat_classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

       B-LOC       0.47      0.94      0.63      6469
      B-MISC       0.67      0.17      0.27      3121
       B-ORG       0.72      0.51      0.60      5691
       B-PER       0.91      0.60      0.73      5867
       I-LOC       0.54      0.44      0.49      1050
      I-MISC       0.09      0.13      0.11      1062
       I-ORG       0.84      0.29      0.43      3335
       I-PER       0.83      0.74      0.79      4022
           O       1.00      1.00      1.00   1536247

    accuracy                           0.99   1566864
   macro avg       0.68      0.54      0.56   1566864
weighted avg       0.99      0.99      0.99   1566864



In [23]:
#F1 Score
score1=flat_f1_score(test_labels, pred_labels,average='weighted')
print("Overalll F1 Score:",score1)

score2=flat_f1_score(test_labels, pred_labels,average='weighted',labels=setlabel)
print("Weighted F1 Score:",score2)

Overalll F1 Score: 0.9908501433759476
Weighted F1 Score: 0.5812756509624446
