<a href="https://colab.research.google.com/github/deanhoperobertson/Afrikaans-Named-Entity-Recognition/blob/master/BiLSTM-CRF%20Model%20(Wiki%20300D).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Afrikaans Bi-LSTM-CRF Model

In [41]:
!sudo pip install git+https://www.github.com/keras-team/keras-contrib.git
!pip install sklearn_crfsuite
!pip install keras==2.2.4

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-tx2a00id
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-tx2a00id
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-cp36-none-any.whl size=101065 sha256=8ead72d51e61dcc24f972922796f4c8f42b317539bd658c842f3ba0154212360
  Stored in directory: /tmp/pip-ephem-wheel-cache-r2zjf1mn/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib


In [0]:
import pandas as pd
import re
import numpy as np
import urllib.request

#keras and tensorflow packages
from keras.layers.merge import add
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,concatenate
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
from keras_contrib.metrics import crf_accuracy


#evaluation
from sklearn_crfsuite.metrics import flat_classification_report,flat_f1_score,flat_precision_score
from sklearn.model_selection import train_test_split


In [0]:
def hasNumbers(text):
    if text.isdigit():
        return "1"
    elif re.search(r'\d',text) and re.search(r'\,|\.',text):
        return "1" 
    else:
        if re.search(r'\d', text):
            return(re.sub('\d','D', text))
        else:
            return text

def readstring(filename, meth):
    f = filename.split('\n')
    sentences = []
    sentence = []
    for line in f:
        if line == '\r' or line.startswith('-DOCSTART') or line  =="":
            if len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
            continue
        splits = line.split('\t')
        if meth.lower()=="numbers":
          sentence.append([hasNumbers(splits[0]), splits[-1].strip()])
        else:
          sentence.append([splits[0], splits[-1].strip('\r')])

    if len(sentence) > 0:
        sentences.append(sentence)
        sentence = []
    return sentences

In [0]:
#import data from my github repo
data_url = "https://raw.githubusercontent.com/deanhoperobertson/Named-Enitty-Recognition/master/Data/Afrikaans/Train.txt"
data = urllib.request.urlopen(data_url).read()
data = data.decode('utf-8')

# #preproces the txt file
data = readstring(data,"Numbers")

train_data,test_data=train_test_split(data,train_size=.8, random_state=0)


In [0]:
def reformat_data(data,meth):
  if meth.lower() == "data":
    i=0
  else: i=1
  train = []
  output= []
  for sentence in data:
    words=[]
    for x in sentence:
      words.append(x[i])
    train.append(words)

  for i in train:
    string = ' '.join(i)
    output.append(string)
  return output

def get_max_length(corpus):
  length = []
  for sentence in corpus:
    length.append(len(sentence))
  return int(max(length))

def number_of_tags(corpus):
  tags=[]
  for sentence in corpus:
    for tag in sentence:
      tags.append(tag[1])
  return int(len(list(set(tags))))


MAX_LEN = get_max_length(data)
N_tags = number_of_tags(data)

train = reformat_data(train_data,"data")
test = reformat_data(test_data,"data")

In [0]:
# create a tokenizer
token_word = text.Tokenizer(char_level=False, lower=True, filters="}", oov_token='UNK')
token_word.fit_on_texts(train)

# convert text to sequence of tokens and pad them to ensure equal length vectors 
X_train = sequence.pad_sequences(token_word.texts_to_sequences(train), maxlen=MAX_LEN,padding="post")
X_test = sequence.pad_sequences(token_word.texts_to_sequences(test), maxlen=MAX_LEN,padding="post")

In [0]:
train = reformat_data(train_data,"tags")
test = reformat_data(test_data,"tags")

# create a tokenizer
token_tag = text.Tokenizer(char_level=False, lower=False, filters="}")
token_tag.fit_on_texts(train)

# convert text to sequence of tokens and pad them to ensure equal length vectors 
Y = sequence.pad_sequences(token_tag.texts_to_sequences(train), maxlen=MAX_LEN,padding="post")
# One-Hot encode categorical labels
Y_train = [to_categorical(i, num_classes=N_tags+1) for i in Y]

Y = sequence.pad_sequences(token_tag.texts_to_sequences(test), maxlen=MAX_LEN,padding="post")
# One-Hot encode categorical labels
Y_test = [to_categorical(i, num_classes=N_tags+1) for i in Y]

#add padding 
token_tag.index_word[0]="PAD"
sub_label = list(token_tag.index_word.values())
sub_label.remove('OUT')
sub_label.remove('PAD')

## Adding Embeddings

In [67]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [69]:

#root_path = "/content/drive/My Drive/wiki.af.300.vec"
root_path = "/content/drive/My Drive/wiki.af.vec"

EMBEDDING=300
embeddings_index={}
f = open(root_path, encoding = "utf-8")
for line in f:
    values = line.split()
    word = ''.join(values[:-EMBEDDING])
    coefs = np.asarray(values[-EMBEDDING:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 95882 word vectors.


In [0]:
#create emedding matrix
word_index = token_word.word_index
embedding_matrix = np.zeros((len(token_word.word_index) + 1, EMBEDDING))
lower =0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
      embedding_vector = embeddings_index.get(word.lower)
      if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        lower +=1
      else: 
        pass
    else:
      embedding_matrix[i] = embedding_vector

In [71]:
checks=[]
words=[]
for i in range(0,len(token_word.word_index)+1):
  if embedding_matrix[i][0] == 0.0:
    checks.append(1)
    words.append(list(token_word.word_index.items())[i-1][0])
  else:
    continue

print("Missing words from Embeddings: %d (%.2f%%) \nCase Changed to find: %d" %(len(checks),(len(checks)/len(token_word.word_index)*100),lower))

Missing words from Embeddings: 7004 (39.69%) 
Case Changed to find: 0


# Create Model

In [0]:
# Model definition

input = Input(shape=(MAX_LEN,))

model_2 = Embedding(len(token_word.word_index) + 1,output_dim=EMBEDDING,
                  weights=[embedding_matrix],input_length=MAX_LEN,
                  trainable=False,mask_zero=True)(input)

model_2 = Bidirectional(LSTM(units=300, return_sequences=True,
                           recurrent_dropout=0.1, dropout=0.2))(model_2)

model_2 = Bidirectional(LSTM(units=300, return_sequences=True,
                           recurrent_dropout=0.1, dropout=0.2))(model_2)

model_2 = TimeDistributed(Dense(50, activation="relu"))(model_2)
crf = CRF(N_tags+1)  # CRF layer
out = crf(model_2)  # output
model_2 = Model(input, out)

In [75]:
model_2.compile(optimizer='adam', loss=crf_loss,metrics=[crf_viterbi_accuracy])
model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 396)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 396, 300)          5294700   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 396, 600)          1442400   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 396, 600)          2162400   
_________________________________________________________________
time_distributed_2 (TimeDist (None, 396, 50)           30050     
_________________________________________________________________
crf_2 (CRF)                  (None, 396, 10)           630       
Total params: 8,930,180
Trainable params: 3,635,480
Non-trainable params: 5,294,700
__________________________________________________________

In [76]:
%%time
BATCH_SIZE = 200
EPOCHS=10
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
history = model_2.fit(X_train, np.array(Y_train), batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2, verbose=1,callbacks=[early_stopping])



Train on 5734 samples, validate on 1434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 45min 11s, sys: 4min 50s, total: 50min 2s
Wall time: 28min 52s


## Predict on Training Set

In [77]:
%%time
pred = model_2.predict(X_train, verbose=1)

CPU times: user 11min 20s, sys: 1min 46s, total: 13min 7s
Wall time: 7min 20s


In [78]:
# TRain Eval
preds = np.argmax(pred, axis=-1)
y_tr_true = np.argmax(Y_train, -1)

# Convert the index to tag
pred_tag = [[token_tag.index_word[i] for i in row] for row in preds]
y_tr_true_tag = [[token_tag.index_word[i] for i in row] for row in y_tr_true]
                 
                 
report = flat_classification_report(y_pred=pred_tag, y_true=y_tr_true_tag,labels=sub_label)
print(report)
#F1 Score
score=flat_f1_score(y_pred=pred_tag, y_true=y_tr_true_tag,average='micro',labels=sub_label)
print(score)

              precision    recall  f1-score   support

      I-MISC       0.78      0.71      0.74      5371
      B-MISC       0.72      0.51      0.60      5264
       B-ORG       0.72      0.58      0.64      2831
       I-ORG       0.73      0.73      0.73      2164
      I-PERS       0.84      0.78      0.81      1715
      B-PERS       0.82      0.72      0.77      1627
       B-LOC       0.77      0.60      0.67      1478
       I-LOC       0.78      0.48      0.59       288

   micro avg       0.76      0.64      0.70     20738
   macro avg       0.77      0.64      0.70     20738
weighted avg       0.76      0.64      0.69     20738

0.6955521271866748


# Predict On Test Set

In [79]:
%%time
pred = model_2.predict(X_test, verbose=1)

CPU times: user 2min 53s, sys: 27 s, total: 3min 20s
Wall time: 1min 51s


In [80]:
# Test Eval
#pred_cat = model.predict(X_tr)
preds = np.argmax(pred, axis=-1)
y_tr_true = np.argmax(Y_test, -1)

# Convert the index to tag
pred_tag = [[token_tag.index_word[i] for i in row] for row in preds]
y_tr_true_tag = [[token_tag.index_word[i] for i in row] for row in y_tr_true]
                 
                 
report = flat_classification_report(y_pred=pred_tag, y_true=y_tr_true_tag, labels=sub_label)
print(report)

score=flat_f1_score(y_pred=pred_tag, y_true=y_tr_true_tag,average='micro', labels=sub_label)
print(score)

              precision    recall  f1-score   support

      I-MISC       0.70      0.65      0.67      1357
      B-MISC       0.64      0.47      0.54      1267
       B-ORG       0.62      0.46      0.53       726
       I-ORG       0.68      0.58      0.63       551
      I-PERS       0.77      0.69      0.73       440
      B-PERS       0.80      0.62      0.70       412
       B-LOC       0.72      0.49      0.58       340
       I-LOC       0.68      0.26      0.38        50

   micro avg       0.69      0.56      0.62      5143
   macro avg       0.70      0.53      0.59      5143
weighted avg       0.69      0.56      0.61      5143

0.6161616161616161
