In [39]:
!pip install seqeval

In [40]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import numpy as np

from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)

In [41]:
MAX_LEN= 128
TRAIN_BATCH_SIZE = 32
VALID_BTCH_SIZE = 8
EPOCHS = 10
BERT_MODEL = 'bert-base-uncased'
MODEL_PATH = "model.bin"
TRAINING_FILE = "../input/entity-annotated-corpus/ner_dataset.csv"
TOKENIZER = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)

In [42]:
def get_train_data(text, tags):

    tokenized_text = []
    target_tags = []

    for index, token in enumerate(text):

        encoded_token = TOKENIZER.encode(
            token,
            add_special_tokens = False
        )

        encoded_token_len = len(encoded_token)

        tokenized_text.extend(encoded_token)
        target_tags.extend([tags[index]] * encoded_token_len)

    #truncation
    tokenized_text = tokenized_text[: MAX_LEN - 2]
    target_tags = target_tags[: MAX_LEN - 2]

    #[101] = [CLS] , [102] = [SEP]
    tokenized_text = [101] + tokenized_text + [102]
    target_tags = [0] + target_tags + [0]
    attention_mask = [1] * len(tokenized_text)
    token_type_ids = [0] * len(tokenized_text)

    #padding
    padding_len = int(MAX_LEN - len(tokenized_text))

    tokenized_text = tokenized_text + ([0] * padding_len)
    target_tags = target_tags + ([0] * padding_len)
    attention_mask = attention_mask + ([0] * padding_len)
    token_type_ids = token_type_ids + ([0] * padding_len)

    return (tokenized_text, target_tags, attention_mask,  token_type_ids)

In [43]:
class RetrieveSentence(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        function = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(function)
        self.sentences = [s for s in self.grouped]
    
    def retrieve(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [44]:
df_data = pd.read_csv(TRAINING_FILE,sep=",",encoding="latin1").fillna(method='ffill')
Sentences = RetrieveSentence(df_data)

sentences_list = [" ".join([s[0] for s in sent]) for sent in Sentences.sentences]
labels = [ [s[2] for s in sent] for sent in Sentences.sentences]

tags_2_val = list(set(df_data["Tag"]))
tag_2_idx = {t: i for i, t in enumerate(tags_2_val)}

In [45]:
tag_2_idx

{'O': 0,
 'B-tim': 1,
 'B-geo': 2,
 'I-art': 3,
 'B-eve': 4,
 'I-eve': 5,
 'I-gpe': 6,
 'B-org': 7,
 'B-nat': 8,
 'B-art': 9,
 'I-tim': 10,
 'I-nat': 11,
 'I-geo': 12,
 'B-per': 13,
 'B-gpe': 14,
 'I-org': 15,
 'I-per': 16}

In [46]:
id_labels[0]

[0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0]

In [47]:
id_labels = [[tag_2_idx.get(l) for l in lab] for lab in labels]
sentences_list = [sent.split() for sent in sentences_list]

del labels[41770]
del sentences_list[41770]
del id_labels[41770]

In [48]:
encoded_text = []
encoded_labels = []
attention_masks = []
token_type_ids = []

for i in range(len(sentences_list)):

    text, labels, att_mask, tok_type = get_train_data(text = sentences_list[i], tags = id_labels[i])
    encoded_text.append(text)
    encoded_labels.append(labels)
    attention_masks.append(att_mask)
    token_type_ids.append(tok_type)

In [65]:
#attention_masks[0]

In [66]:
#encoded_labels[0]

In [51]:
encoded_text = np.array(encoded_text)
encoded_labels = np.array(encoded_labels)
attention_masks = np.array(attention_masks)
token_type_ids = np.array(token_type_ids)

In [52]:
X_train, X_valid, Y_train, Y_valid = train_test_split(encoded_text, encoded_labels, random_state=20, test_size=0.1)
Mask_train, Mask_valid, Token_ids_train, Token_ids_valid = train_test_split(attention_masks,token_type_ids ,random_state=20, test_size=0.1)

In [53]:
print("X train shape: ", X_train.shape)
print("X_valid shape: ", X_valid.shape)
print("Y_train shape: ", Y_train.shape)
print("Y_valid: ", Y_valid.shape)
print("Mask_train shape: ", Mask_train.shape)
print("Mask_valid shape: ", Mask_valid.shape)
print("Token_ids_valid shape: ", Token_ids_valid.shape)
print("Token_ids_train shape: ", Token_ids_train.shape)

X train shape:  (43162, 128)
X_valid shape:  (4796, 128)
Y_train shape:  (43162, 128)
Y_valid:  (4796, 128)
Mask_train shape:  (43162, 128)
Mask_valid shape:  (4796, 128)
Token_ids_valid shape:  (4796, 128)
Token_ids_train shape:  (43162, 128)


In [54]:
def example_to_features(input_ids,attention_masks,token_type_ids,y):
  return {"input_ids": input_ids,
          "attention_mask": attention_masks,
          "token_type_ids": token_type_ids},y

train_ds = tf.data.Dataset.from_tensor_slices((X_train,Mask_train,Token_ids_train,Y_train)).map(example_to_features).shuffle(1000).batch(32)
test_ds=tf.data.Dataset.from_tensor_slices((X_valid,Mask_valid,Token_ids_valid,Y_valid)).map(example_to_features).batch(1)

In [55]:
config = BertConfig.from_pretrained(BERT_MODEL,num_labels=len(tags_2_val))
model = TFBertForTokenClassification.from_pretrained(BERT_MODEL, from_pt=bool(".bin" in BERT_MODEL), config=config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertForTokenClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['dropout_75', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
 model.layers[-1].activation = tf.keras.activations.softmax

In [57]:
model.summary()

Model: "tf_bert_for_token_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_75 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  13073     
Total params: 109,495,313
Trainable params: 109,495,313
Non-trainable params: 0
_________________________________________________________________


In [58]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

In [59]:
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [60]:
history = model.fit(train_ds, epochs=3, validation_data=test_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [76]:
sentence = "Hi , my name is Bob and I live in England"
inputs = TOKENIZER(sentence, return_tensors="tf")
input_ids = inputs["input_ids"]
inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
output = model(inputs)

In [77]:
output

(<tf.Tensor: shape=(13,), dtype=float32, numpy=
 array([2.9295006, 2.9295006, 2.9295006, 2.9295006, 2.9295006, 2.9295006,
        2.9295006, 2.9295006, 2.9295006, 2.9295006, 2.9295006, 2.9295006,
        2.9295006], dtype=float32)>,
 <tf.Tensor: shape=(1, 13, 17), dtype=float32, numpy=
 array([[[1.0000000e+00, 3.7605066e-09, 5.1076641e-09, 7.5045525e-09,
          3.3244767e-09, 3.5249690e-09, 3.2716247e-09, 4.2896966e-09,
          4.0232000e-09, 3.5823919e-09, 2.7173985e-09, 3.7711008e-09,
          3.6975756e-09, 5.0502962e-09, 2.8892921e-09, 5.4763913e-09,
          3.2028369e-09],
         [1.0000000e+00, 3.7605212e-09, 5.1076836e-09, 7.5045667e-09,
          3.3244829e-09, 3.5249690e-09, 3.2716372e-09, 4.2897046e-09,
          4.0232151e-09, 3.5824055e-09, 2.7174036e-09, 3.7711079e-09,
          3.6975900e-09, 5.0503060e-09, 2.8893030e-09, 5.4764224e-09,
          3.2028429e-09],
         [1.0000000e+00, 3.7605137e-09, 5.1076738e-09, 7.5045667e-09,
          3.3244767e-09, 3.5249

In [78]:
import transformers

In [79]:
transformers.__version__

'3.4.0'