<a href="https://colab.research.google.com/github/deanhoperobertson/Masters-/blob/master/NER_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import files
src = list(files.upload().values())[0]
open('mylib.py','wb').write(src)
import mylib

Saving prepro.py to prepro.py


In [0]:
import pandas as pd
import numpy as np
import urllib.request
from sklearn.model_selection import train_test_split

#cusotm packages
from prepro import readfile, readstring

#keras and tensorflow packages
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

#evaluation
from sklearn_crfsuite.metrics import flat_classification_report

In [0]:
#import data from my github repo
train_url = "https://raw.githubusercontent.com/deanhoperobertson/Masters-/master/Thesis/Code/train.txt"
test_url = "https://raw.githubusercontent.com/deanhoperobertson/Masters-/master/Thesis/Code/test.txt"
train = urllib.request.urlopen(train_url).read()
test = urllib.request.urlopen(test_url).read()
train = train.decode('utf-8')
test = test.decode('utf-8')

#preproces the txt file
train = readstring(train)
test = readstring(test)

MAX_LEN =113

In [0]:
words = []
tags = []
for sentence in train:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])

In [7]:
words=list(set(words))
n_words = len(words)
print("Number of words in the dataset: ", n_words)

Number of words in the dataset:  23623


In [8]:
tags = list(set(tags))
n_tags = len(tags)
print("Number of Labels: ", n_tags)

Number of Labels:  9


In [9]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding
# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}
print("The word 'rejects' is identified by the index: {}".format(word2idx["rejects"]))

The word 'rejects' is identified by the index: 15942


In [32]:
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}
print("The labels B-LOC (location) is identified by the index: {}".format(tag2idx["B-LOC"]))

The labels B-LOC (location) is identified by the index: 1


In [0]:
# Convert each sentence from list of Token to list of word_index
X = [[word2idx[w[0]] for w in s] for s in train]

# Padding each sentence to have the same lenght
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])

In [12]:
X

array([[12018, 15942, 16864, ...,     0,     0,     0],
       [14948, 14472,     0, ...,     0,     0,     0],
       [18475,  1134,     0, ...,     0,     0,     0],
       ...,
       [16958,  7696, 18492, ...,     0,     0,     0],
       [13008, 15484,     0, ...,     0,     0,     0],
       [ 4130,  5172, 21924, ...,     0,     0,     0]], dtype=int32)

In [0]:
# Convert Tag/Label to tag_index
y = [[tag2idx[w[1]] for w in s] for s in train]

# Padding each sentence to have the same lenght
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["PAD"])


In [0]:
# One-Hot encode
y = [to_categorical(i, num_classes=n_tags+1) for i in y]

In [0]:
#split into test and train subsets
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

In [16]:
X_tr.shape, X_te.shape, np.array(y_tr).shape, np.array(y_te).shape

((12636, 113), (1405, 113), (12636, 113, 10), (1405, 113, 10))

##Build Bi-LSTM Model

In [20]:
# Model definition
EMBEDDING=40

input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input)

model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)

out = TimeDistributed(Dense(10, activation="softmax"))(model)
model = Model(input, out)

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 113)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 113, 40)           945000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 113, 100)          36400     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 113, 10)           1010      
Total params: 982,410
Trainable params: 982,410
Non-trainable params: 0
_________________________________________________________________


In [21]:
BATCH_SIZE = 500
EPOCHS=5


history = model.fit(X_tr, np.array(y_tr), batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.1, verbose=2)

Instructions for updating:
Use tf.cast instead.
Train on 11372 samples, validate on 1264 samples
Epoch 1/5
 - 20s - loss: 1.4127 - acc: 0.7988 - val_loss: 0.9190 - val_acc: 0.8319
Epoch 2/5
 - 16s - loss: 0.8415 - acc: 0.8329 - val_loss: 0.7809 - val_acc: 0.8319
Epoch 3/5
 - 16s - loss: 0.7297 - acc: 0.8331 - val_loss: 0.6821 - val_acc: 0.8321
Epoch 4/5
 - 16s - loss: 0.6264 - acc: 0.8332 - val_loss: 0.5788 - val_acc: 0.8322
Epoch 5/5
 - 16s - loss: 0.5161 - acc: 0.8355 - val_loss: 0.4785 - val_acc: 0.8376


### Model Evaulation

In [0]:

# Eval
pred_cat = model.predict(X_te)
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, -1)

In [27]:
# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_te_true_tag = [[idx2tag[i] for i in row] for row in y_te_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00       706
      B-MISC       0.00      0.00      0.00       343
       B-ORG       0.01      0.02      0.02       708
       B-PER       0.21      0.17      0.19       649
       I-LOC       0.00      0.00      0.00       114
      I-MISC       0.01      0.01      0.01       107
       I-ORG       0.00      0.00      0.00       403
       I-PER       0.00      0.00      0.00       454
           O       0.11      1.00      0.20     17164
         PAD       0.00      0.00      0.00    138117

   micro avg       0.11      0.11      0.11    158765
   macro avg       0.04      0.12      0.04    158765
weighted avg       0.01      0.11      0.02    158765



In [34]:
y_te_true

array([[8, 6, 8, ..., 0, 0, 0],
       [8, 8, 8, ..., 0, 0, 0],
       [3, 8, 8, ..., 0, 0, 0],
       ...,
       [3, 8, 8, ..., 0, 0, 0],
       [8, 8, 8, ..., 0, 0, 0],
       [3, 8, 8, ..., 0, 0, 0]])

array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8])

In [39]:
print('After processing, sample:', X[0])

After processing, sample: [12018 15942 16864 10476 13388 17859  7231  3711 15150     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]


In [40]:
print('After processing, labels:', y[0])

After processing, labels: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
