In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
import os
import matplotlib.pyplot as plt
import time

import preprocess_utils as pre

In [2]:
# Read the CoNLL-2003 training split and show which feature columns the
# reader returned (the keys of the resulting mapping).
train_path = os.path.join("conll2003_data", "train.txt")
data = pre.read_iob_file(train_path)
print(f"Conll2003 dataset features: {set(data.keys())}")

Conll2003 dataset features: {'pos_tags', 'chunk_tags', 'tokens', 'entity_tags'}


In [3]:
# Keep only the token sequences and their entity tags from the training data.
train_text, train_labels = data["tokens"], data["entity_tags"]
print(f"Train dataset: {len(train_text)}")

Train dataset: 14018


In [4]:
# Validation split: same reader, same two columns as the training split.
valid_path = os.path.join("conll2003_data", "valid.txt")
data = pre.read_iob_file(valid_path)
valid_text, valid_labels = data["tokens"], data["entity_tags"]
print(f"Validation dataset: {len(valid_text)}")

Validation dataset: 3242


In [5]:
# Test split: same reader, same two columns as the training split.
test_path = os.path.join("conll2003_data", "test.txt")
data = pre.read_iob_file(test_path)
test_text, test_labels = data["tokens"], data["entity_tags"]
print(f"Test dataset: {len(test_text)}")

Test dataset: 3450


In [6]:
# Sanity check: show the first training sentence next to its tags.
first_sentence, first_labels = train_text[0], train_labels[0]
print(f"Sentence: {first_sentence}")
print(f"Labels:   {first_labels}")

Sentence: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb']
Labels:   ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O']


In [7]:
def get_unique_labels(labels):
    """Return the set of distinct labels found across all sentences.

    Args:
        labels: iterable of per-sentence label iterables.

    Returns:
        A ``set`` containing every label that occurs at least once.
    """
    return {label for sentence_labels in labels for label in sentence_labels}

# Sorted label vocabulary and the two lookup directions. Ids start at 1, so
# id 0 is never assigned to a real label.
unique_labels = sorted(get_unique_labels(train_labels))
label_to_idx = dict(zip(unique_labels, range(1, len(unique_labels) + 1)))
idx_to_label = {idx: label for label, idx in label_to_idx.items()}

label_to_idx

{'B-LOC': 1,
 'B-MISC': 2,
 'B-ORG': 3,
 'B-PER': 4,
 'I-LOC': 5,
 'I-MISC': 6,
 'I-ORG': 7,
 'I-PER': 8,
 'O': 9}

In [9]:
# Length every packed sequence is brought to by the packer layer below.
MAX_SEQ_LENGTH = 30

# Load the BERT preprocessing model once and expose its tokenizer and its
# input packer as separate Keras layers.
preprocessor = hub.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
tokenizer = hub.KerasLayer(preprocessor.tokenize)
packer = hub.KerasLayer(
    preprocessor.bert_pack_inputs,
    arguments={"seq_length": MAX_SEQ_LENGTH},
)

In [10]:
# List the ids the tokenizer reserves for special tokens, one per line.
special_tokens = preprocessor.tokenize.get_special_tokens_dict()
print("Special tokens:")
for key in special_tokens:
    value = special_tokens[key]
    print(f"    {key:20}: {value}")

Special tokens:
    padding_id          : 0
    end_of_segment_id   : 102
    vocab_size          : 30522
    mask_id             : 103
    start_of_sequence_id: 101


In [11]:
# Ids of the sequence-start, segment-end and padding tokens, taken from the
# tokenizer's special-token dictionary.
TOKEN_START, TOKEN_END, TOKEN_PAD = (
    special_tokens[name]
    for name in ("start_of_sequence_id", "end_of_segment_id", "padding_id")
)

# Number of sentences per batch in the tf.data pipelines.
batch_size = 32

In [12]:
def merge_dims_and_get_tokens_length(tokens):
    """Flatten the two innermost dimensions and count subtokens per word.

    Args:
        tokens: ragged tensor produced by the tokenizer layer for one
            sentence (assumed to have one outer row per input word —
            TODO confirm against the tokenizer's output signature).

    Returns:
        A pair ``(tokens, num_subtokens_per_token)``: the tensor with its last
        two dimensions merged, and an int32 vector giving the length of each
        resulting row, i.e. into how many subtokens each word was divided.
    """
    tokens = tokens.merge_dims(-2, -1)
    # row_lengths() yields every row's size in one vectorized op, replacing a
    # per-row tf.map_fn(tf.size). The cast keeps the int32 dtype callers
    # previously received.
    num_subtokens_per_token = tf.cast(tokens.row_lengths(), tf.int32)
    return tokens, num_subtokens_per_token

# Lookup table from label string to integer id, usable inside tf.data maps.
# Ids run from 1 to len(unique_labels); the loss and metric in this notebook
# treat label id 0 as "ignore".
# NOTE(review): with num_oov_buckets=1 an unseen label is sent to an OOV
# bucket id; this may coincide with the last real label id — confirm against
# the StaticVocabularyTable documentation.
label_ids = tf.range(1, len(unique_labels) + 1, dtype=tf.int64)
init = tf.lookup.KeyValueTensorInitializer(keys=unique_labels, values=label_ids)
table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=1)

def preprocess_dataset(tokens_info, labels):
    """Pack one tokenized sentence for BERT and align its labels to subtokens.

    Args:
        tokens_info: pair ``(tokens, num_subtokens_per_token)``; presumably the
            output of ``merge_dims_and_get_tokens_length`` — verify against
            the caller.
        labels: label strings of the sentence, one per word (assumed; they are
            looked up in ``table`` and repeated per subtoken below).

    Returns:
        ``(packed_tokens, aligned_labels_ids)``: the packer's output dict with
        its three entries flattened to 1-D, and a label-id tensor with the
        same shape as ``input_word_ids`` that is 0 wherever the token is a
        start, end or padding token.
    """
    tokens, num_subtokens_per_token = tokens_info

    # The BERT packer works on batches, so a leading batch dimension is added.
    tokens = tf.expand_dims(tokens, axis=0)
    packed_tokens = packer([tokens])
    # The packer returns its output as a batch. Here a single sentence (a list
    # of words) is processed rather than a list of sentences, so each entry is
    # flattened to 1-D; batching of sentences happens later in the pipeline.
    packed_tokens["input_word_ids"] = tf.reshape(packed_tokens["input_word_ids"], (-1,))
    packed_tokens["input_type_ids"] = tf.reshape(packed_tokens["input_type_ids"], (-1,))
    packed_tokens["input_mask"] = tf.reshape(packed_tokens["input_mask"], (-1,))

    # Convert labels to ids and align them to the number of subtokens:
    # each word's label id is repeated once per subtoken of that word.
    labels_ids = table.lookup(labels)
    aligned_labels_ids = tf.repeat(labels_ids, num_subtokens_per_token)
    # The first token in the packed tokens is TOKEN_START and the last is
    # TOKEN_END; the packed tokens are also padded to MAX_SEQ_LENGTH.
    # logical_pos is True only at positions holding none of those three ids.
    logical_pos = tf.logical_and(
        tf.not_equal(packed_tokens["input_word_ids"], TOKEN_START),
        tf.not_equal(packed_tokens["input_word_ids"], TOKEN_END)
    )
    logical_pos = tf.logical_and(
        logical_pos,
        tf.not_equal(packed_tokens["input_word_ids"], TOKEN_PAD)
    )

    shape = packed_tokens["input_word_ids"].shape
    # When there are MAX_SEQ_LENGTH or more labels, keep only the first
    # MAX_SEQ_LENGTH - 2: the packer reserves two positions for TOKEN_START
    # and TOKEN_END and drops the subtokens that no longer fit.
    # NOTE(review): scatter_nd needs as many updates as True positions in
    # logical_pos; this relies on the packer keeping at most
    # MAX_SEQ_LENGTH - 2 subtokens — confirm.
    labels_end_index = MAX_SEQ_LENGTH - 2
    # Write the label ids at the real-token positions; all other positions
    # stay 0.
    aligned_labels_ids = tf.scatter_nd(
        tf.where(logical_pos),
        aligned_labels_ids[:labels_end_index],
        shape
    )
    return packed_tokens, aligned_labels_ids


In [13]:
def make_ner_dataset(sentences, sentence_labels, batch_size):
    """Build a batched, cached tf.data pipeline for one dataset split.

    The same steps were previously copy-pasted for the train, validation and
    test splits; they now live in one place.

    Args:
        sentences: list of sentences, each a list of word strings.
        sentence_labels: list of per-sentence label-string lists, parallel to
            ``sentences``.
        batch_size: number of sentences per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(packed_tokens, aligned_label_ids)``
        batches as produced by ``preprocess_dataset``.
    """
    # Sentences have different lengths, hence ragged tensors.
    text_dataset = (
        tf.data.Dataset.from_tensor_slices(tf.ragged.constant(sentences))
        # The parameter is named `words` so it does not shadow the imported
        # `text` (tensorflow_text) module.
        .map(lambda words: tokenizer(words))
        .map(merge_dims_and_get_tokens_length)
    )
    labels_dataset = tf.data.Dataset.from_tensor_slices(
        tf.ragged.constant(sentence_labels)
    )
    return (
        tf.data.Dataset.zip((text_dataset, labels_dataset))
        .map(preprocess_dataset)
        .batch(batch_size)
        .cache()
    )


# One pipeline per split, all built the same way.
train_dataset = make_ner_dataset(train_text, train_labels, batch_size)
valid_dataset = make_ner_dataset(valid_text, valid_labels, batch_size)
test_dataset = make_ner_dataset(test_text, test_labels, batch_size)

In [14]:
class IgnorePaddingSparseCategoricalCrossentropyLoss(tf.keras.losses.Loss):
    """Sparse categorical cross-entropy averaged over non-zero labels only.

    Positions whose true label id is 0 contribute nothing to the loss; the
    summed loss is divided by the number of remaining positions.

    Args:
        from_logits: forwarded to ``SparseCategoricalCrossentropy``; whether
            ``y_pred`` holds logits rather than probabilities.
        **kwargs: forwarded to ``tf.keras.losses.Loss``.
    """

    def __init__(self, from_logits=False, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can round-trip the setting.
        self.from_logits = from_logits
        # Reduction.NONE keeps per-position losses so they can be masked.
        self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=from_logits,
            reduction=tf.keras.losses.Reduction.NONE
        )

    def call(self, y_true, y_pred):
        """Return the scalar mean loss over positions where ``y_true != 0``."""
        loss = self.loss_fn(y_true, y_pred)
        mask = tf.cast(tf.not_equal(y_true, 0), dtype=loss.dtype)
        # divide_no_nan returns 0 instead of NaN when a batch contains no
        # labelled position (mask sums to zero).
        return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

    def get_config(self):
        """Include ``from_logits`` so the loss can be rebuilt from its config."""
        config = super().get_config()
        config["from_logits"] = self.from_logits
        return config

In [15]:
class IgnorePaddingSparseCategoricalAccuracy(tf.keras.metrics.Metric):
    """Token-level accuracy computed only where the true label id is non-zero.

    State:
        count: running number of correctly predicted non-zero-label positions.
        total: running number of non-zero-label positions.
    """

    def __init__(self, name="accuracy", **kwargs):
        super().__init__(name=name, **kwargs)
        self.total = self.add_weight(name="total", initializer="zeros")
        self.count = self.add_weight(name="count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate correct/total counts for one batch.

        Args:
            y_true: integer label ids; 0 marks positions to ignore.
            y_pred: per-class scores; the class axis is the last one.
            sample_weight: accepted for API compatibility; not used.
        """
        predicted = tf.math.argmax(y_pred, axis=-1)
        # argmax yields int64; cast the labels so tf.equal never sees
        # mismatched integer dtypes.
        y_true = tf.cast(y_true, predicted.dtype)
        mask = tf.not_equal(y_true, 0)
        hits = tf.logical_and(mask, tf.equal(y_true, predicted))
        self.count.assign_add(tf.reduce_sum(tf.cast(hits, self.count.dtype)))
        self.total.assign_add(tf.reduce_sum(tf.cast(mask, self.total.dtype)))

    def result(self):
        """Return correct / total, or 0 when nothing has been counted yet."""
        return tf.math.divide_no_nan(self.count, self.total)

    def reset_state(self):
        """Zero both counters."""
        self.total.assign(0.0)
        self.count.assign(0.0)

In [17]:
# Pretrained BERT encoder from TF Hub (a small uncased English variant, per
# the handle name). trainable=True lets its weights be updated during fit.
encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/2",
    trainable=True,
)

In [18]:
# The encoder takes three int32 inputs, each of length MAX_SEQ_LENGTH.
encoder_inputs = {
    name: tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int32)
    for name in ("input_word_ids", "input_mask", "input_type_ids")
}
encoder_outputs = encoder(encoder_inputs)

# Per-token classification head: one score per label id plus one for id 0.
token_classifier = tf.keras.layers.TimeDistributed(
    tf.keras.layers.Dense(len(unique_labels) + 1)
)
outputs = token_classifier(encoder_outputs["sequence_output"])

model = tf.keras.Model(inputs=encoder_inputs, outputs=outputs)

In [19]:
# Print the layer-by-layer overview of the model (output shapes, parameter
# counts and layer connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     {'sequence_output':  11170561    ['input_2[0][0]',                
                                 (None, 30, 256),                 'input_3[0][0]',            

In [23]:
# Training schedule configuration.
epochs = 1
train_data_size = len(train_text)
# Ceiling division: the pipeline's batch() keeps the final partial batch, so
# an epoch has one more step than a floored division would give.
steps_per_epoch = (train_data_size + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# NOTE: warmup_steps is computed but no warmup is applied by the schedule
# below; it is kept for cells that may rely on it.
warmup_steps = int(0.1 * num_train_steps)
initial_learning_rate = 2e-5

# Linear decay (PolynomialDecay with its default power) from the initial
# learning rate down to 0 over the whole training run.
linear_decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    end_learning_rate=0,
    decay_steps=num_train_steps
)

# The schedule is now actually handed to the optimizer; previously it was
# built but Adam received the constant 2e-5.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=linear_decay),
    loss=IgnorePaddingSparseCategoricalCrossentropyLoss(from_logits=True),
    metrics=[IgnorePaddingSparseCategoricalAccuracy()]
)

In [None]:
# Train the model. validation_data is required so that `history.history`
# contains "val_loss", which the plotting cell reads.
history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=epochs,
    # Write TensorBoard logs (with per-epoch weight histograms) to ./logs.
    callbacks=[tf.keras.callbacks.TensorBoard(log_dir="logs", histogram_freq=1)]
)



In [None]:
# Plot the per-epoch training loss and, when present, the validation loss.
fig, ax = plt.subplots(figsize=(10, 6), layout="constrained")

# Markers keep a single-epoch run visible (a one-point line draws nothing).
ax.plot(history.history["loss"], marker="o", label="Training loss")
# "val_loss" only exists when fit() was given validation data; skip it
# instead of raising KeyError.
if "val_loss" in history.history:
    ax.plot(history.history["val_loss"], marker="o", label="Validation loss")
ax.set_title("Training and validation loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [3]:
# Reload the saved model. It was compiled with a custom loss and metric, so
# Keras needs both classes to restore the compiled state.
# NOTE(review): no visible cell writes the "ner_model" directory — confirm
# where the model is saved before relying on this cell.
ner_model = tf.keras.models.load_model(
    "ner_model",
    custom_objects={
        "IgnorePaddingSparseCategoricalCrossentropyLoss": IgnorePaddingSparseCategoricalCrossentropyLoss,
        "IgnorePaddingSparseCategoricalAccuracy": IgnorePaddingSparseCategoricalAccuracy,
    },
)

