<a href="https://colab.research.google.com/github/deanhoperobertson/Masters-/blob/master/Thesis/BiLSTM_Simple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from google.colab import files
src = list(files.upload().values())[0]
open('mylib.py','wb').write(src)
import mylib

Saving prepro.py to prepro (1).py


In [5]:
!pip install sklearn_crfsuite



In [0]:
import pandas as pd
import numpy as np
import urllib.request
from sklearn.model_selection import train_test_split

#cusotm packages
from prepro import readfile, readstring

#keras and tensorflow packages
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

#evaluation
from sklearn_crfsuite.metrics import flat_classification_report,flat_f1_score,flat_precision_score


In [0]:
#import data from my github repo
train_url = "https://raw.githubusercontent.com/deanhoperobertson/Masters-/master/Thesis/Code/Data/train.txt"
test_url = "https://raw.githubusercontent.com/deanhoperobertson/Masters-/master/Thesis/Code/Data/test.txt"
train = urllib.request.urlopen(train_url).read()
test = urllib.request.urlopen(test_url).read()
train = train.decode('utf-8')
test = test.decode('utf-8')

#preproces the txt file
train = readstring(train)
test = readstring(test)

#create corpus
corpus = train.copy()
corpus.extend(test)

In [0]:
words = []
tags = []
for sentence in corpus:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])       

In [18]:
words=list(set(words))
n_words = len(words)
print("Number of words in the dataset: ", n_words)
tags = list(set(tags))
n_tags = len(tags)
print("Number of Labels: ", n_tags)

Number of words in the dataset:  27316
Number of Labels:  9


In [19]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding
# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}
print("The word 'rejects' is identified by the index: {}".format(word2idx["rejects"]))

The word 'rejects' is identified by the index: 27066


In [20]:
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}
print("The labels B-LOC (location) is identified by the index: {}".format(tag2idx["B-LOC"]))

The labels B-LOC (location) is identified by the index: 8


In [32]:
tag2idx

{'B-LOC': 8,
 'B-MISC': 5,
 'B-ORG': 4,
 'B-PER': 3,
 'I-LOC': 6,
 'I-MISC': 9,
 'I-ORG': 2,
 'I-PER': 1,
 'O': 7,
 'PAD': 0}

In [21]:
#Find the maxium length of the all the sentences in the corpus
length = []
for sentence in corpus:
  length.append(len(sentence))

MAX_LEN= max(length)
print("The maxium length of sentence is:",max(length))

The maxium length of sentence is: 124


In [22]:
# Convert each sentence from list of Token to list of word_index
X = [[word2idx[w[0]] for w in s] for s in train]

# Padding each sentence to have the same lenght
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])
X

array([[17334, 27066, 16442, ...,     0,     0,     0],
       [26781, 19924,     0, ...,     0,     0,     0],
       [ 5157, 23435,     0, ...,     0,     0,     0],
       ...,
       [25902,  9027, 18349, ...,     0,     0,     0],
       [10792, 26450,     0, ...,     0,     0,     0],
       [ 4239,   617, 10029, ...,     0,     0,     0]], dtype=int32)

In [23]:
# Convert Tag/Label to tag_index
y = [[tag2idx[w[1]] for w in s] for s in train]

# Padding each sentence to have the same lenght
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["PAD"])
y

array([[4, 7, 5, ..., 0, 0, 0],
       [3, 1, 0, ..., 0, 0, 0],
       [8, 7, 0, ..., 0, 0, 0],
       ...,
       [4, 7, 4, ..., 0, 0, 0],
       [7, 7, 0, ..., 0, 0, 0],
       [4, 7, 4, ..., 0, 0, 0]], dtype=int32)

In [0]:

# One-Hot encode
y = [to_categorical(i, num_classes=n_tags+1) for i in y]

In [0]:
#split into test and train subsets
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

In [26]:
X_tr.shape, X_te.shape, np.array(y_tr).shape, np.array(y_te).shape

((12636, 124), (1405, 124), (12636, 124, 10), (1405, 124, 10))

In [27]:
# Model definition
EMBEDDING=40

input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input)

model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)

out = TimeDistributed(Dense(10, activation="softmax"))(model)
model = Model(input, out)

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 124)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 124, 40)           1092720   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 124, 100)          36400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 124, 10)           1010      
Total params: 1,130,130
Trainable params: 1,130,130
Non-trainable params: 0
_________________________________________________________________


In [0]:
class Metrics(Callback):
    
    def on_train_begin(self, logs=None):
        self.val_precisions = []
        self.val_f1s = []
        self.val_recalls = []
    
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_predict = (np.asarray(self.model.predict(self.model.validation_data[0]))).round()
        val_targ = self.model.validation_data[1]
        _val_precision = np.dot(val_targ, val_predict)/sum(val_predict)
        _val_recall = np.dot(val_targ, val_predict)/sum(val_targ)
        _val_f1 = (2*_val_precision*_val_recall)/(_val_precision + _val_recall)
        self.val_precisions.append(_val_precision)
        self.val_recalls.append(_val_recall)
        self.val_f1s.append(_val_f1)
        print(" — val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return 

In [0]:
metrics=BinaryClassificationMetrics()

In [86]:
%%time
BATCH_SIZE = 500
EPOCHS=20


history = model.fit(X_tr, np.array(y_tr), batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.1, verbose=2)

Train on 11372 samples, validate on 1264 samples
Epoch 1/10
 - 43s - loss: 0.0500 - acc: 0.9892 - val_loss: 0.1706 - val_acc: 0.9569
Epoch 2/10
 - 43s - loss: 0.0425 - acc: 0.9906 - val_loss: 0.1695 - val_acc: 0.9554
Epoch 3/10
 - 43s - loss: 0.0365 - acc: 0.9923 - val_loss: 0.1597 - val_acc: 0.9604
Epoch 4/10
 - 43s - loss: 0.0309 - acc: 0.9935 - val_loss: 0.1593 - val_acc: 0.9600
Epoch 5/10
 - 44s - loss: 0.0269 - acc: 0.9943 - val_loss: 0.1590 - val_acc: 0.9595
Epoch 6/10
 - 44s - loss: 0.0234 - acc: 0.9951 - val_loss: 0.1574 - val_acc: 0.9611
Epoch 7/10
 - 43s - loss: 0.0205 - acc: 0.9956 - val_loss: 0.1517 - val_acc: 0.9630
Epoch 8/10
 - 43s - loss: 0.0181 - acc: 0.9962 - val_loss: 0.1526 - val_acc: 0.9642
Epoch 9/10
 - 43s - loss: 0.0157 - acc: 0.9967 - val_loss: 0.1524 - val_acc: 0.9640
Epoch 10/10
 - 43s - loss: 0.0139 - acc: 0.9971 - val_loss: 0.1514 - val_acc: 0.9640
CPU times: user 13min 10s, sys: 1min 4s, total: 14min 15s
Wall time: 7min 14s


In [0]:
model.save("model.1")

## Model Evaluation

#### Training Dataset

In [87]:
# TRain Eval
pred_cat = model.predict(X_tr)
pred = np.argmax(pred_cat, axis=-1)
y_tr_true = np.argmax(y_tr, -1)

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_tr_true_tag = [[idx2tag[i] for i in row] for row in y_tr_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=y_tr_true_tag)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-LOC       0.34      0.98      0.51      6440
      B-MISC       0.53      0.96      0.68      3083
       B-ORG       0.35      0.98      0.52      5701
       B-PER       0.44      0.99      0.61      5904
       I-LOC       0.13      0.94      0.23      1009
      I-MISC       0.05      0.93      0.10      1048
       I-ORG       0.16      0.96      0.28      3357
       I-PER       0.16      0.99      0.27      4047
           O       0.11      1.00      0.19    152163
         PAD       0.00      0.00      0.00   1384112

   micro avg       0.12      0.12      0.12   1566864
   macro avg       0.23      0.87      0.34   1566864
weighted avg       0.02      0.12      0.03   1566864



### Test Dataset

In [88]:
# TRain Eval
pred_cat = model.predict(X_te)
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, -1)

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_te_true_tag = [[idx2tag[i] for i in row] for row in y_te_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-LOC       0.26      0.88      0.41       700
      B-MISC       0.33      0.84      0.47       355
       B-ORG       0.21      0.87      0.33       620
       B-PER       0.32      0.92      0.48       696
       I-LOC       0.14      0.69      0.24       148
      I-MISC       0.03      0.82      0.06       107
       I-ORG       0.09      0.81      0.16       347
       I-PER       0.09      0.89      0.16       481
           O       0.11      0.99      0.20     17415
         PAD       0.00      0.00      0.00    153351

   micro avg       0.12      0.12      0.12    174220
   macro avg       0.16      0.77      0.25    174220
weighted avg       0.02      0.12      0.03    174220

