# Overview
* https://keras.io/examples/nlp/ner_transformers/

# Dependencies

In [1]:
# general
import os

# data
import numpy as np
import pandas as pd

# ml
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datasets import load_dataset
from collections import Counter
from conlleval import evaluate

# Constants

In [2]:
PATH_CSV = "../nba_api_chatbot/data/questions.csv"
PATH_TXT = "../nba_api_chatbot/data/questions.txt"
PATH_VOCABULARY = "../nba_api_chatbot/data/vocabulary.txt"

PATH_MODEL = "../nba_api_chatbot/models/ner"

# Classes

In [3]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [4]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings

In [5]:
class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super(NERModel, self).__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x

# Data

In [6]:
# conll_data = load_dataset("conll2003")

In [7]:
# conll_data['train'].to_pandas()

In [8]:
# def export_to_file(export_file_path, data):
#     with open(export_file_path, "w") as f:
#         for record in data:
#             ner_tags = record["ner_tags"]
#             tokens = record["tokens"]
#             if len(tokens) > 0:
#                 f.write(
#                     str(len(tokens))
#                     + "\t"
#                     + "\t".join(tokens)
#                     + "\t"
#                     + "\t".join(map(str, ner_tags))
#                     + "\n"
#                 )

# os.mkdir("data")
# export_to_file("./data/conll_train.txt", conll_data["train"])
# export_to_file("./data/conll_val.txt", conll_data["validation"])

In [9]:
questions = pd.read_csv(PATH_CSV, header=None)
questions.columns = ["question", "player_full_name", "stat_col", "tokens", "ner_tags"]
questions

Unnamed: 0,question,player_full_name,stat_col,tokens,ner_tags
0,how many GP did Alaa Abdelnaby have,Alaa Abdelnaby,GP,"['how', 'many', 'GP', 'did', 'Alaa', 'Abdelnab...","[1, 1, 4, 1, 2, 3, 1]"
1,how many GP does Alaa Abdelnaby have,Alaa Abdelnaby,GP,"['how', 'many', 'GP', 'does', 'Alaa', 'Abdelna...","[1, 1, 4, 1, 2, 3, 1]"
2,how many games played did Alaa Abdelnaby have,Alaa Abdelnaby,GP,"['how', 'many', 'games', 'played', 'did', 'Ala...","[1, 1, 4, 5, 1, 2, 3, 1]"
3,how many games played does Alaa Abdelnaby have,Alaa Abdelnaby,GP,"['how', 'many', 'games', 'played', 'does', 'Al...","[1, 1, 4, 5, 1, 2, 3, 1]"
4,how many games did Alaa Abdelnaby have,Alaa Abdelnaby,GP,"['how', 'many', 'games', 'did', 'Alaa', 'Abdel...","[1, 1, 4, 1, 2, 3, 1]"
...,...,...,...,...,...
1048501,how many PT does Matt Zunic have,Matt Zunic,PTS,"['how', 'many', 'PT', 'does', 'Matt', 'Zunic',...","[1, 1, 4, 1, 2, 3, 1]"
1048502,how many points did Matt Zunic have,Matt Zunic,PTS,"['how', 'many', 'points', 'did', 'Matt', 'Zuni...","[1, 1, 4, 1, 2, 3, 1]"
1048503,how many points does Matt Zunic have,Matt Zunic,PTS,"['how', 'many', 'points', 'does', 'Matt', 'Zun...","[1, 1, 4, 1, 2, 3, 1]"
1048504,how many point did Matt Zunic have,Matt Zunic,PTS,"['how', 'many', 'point', 'did', 'Matt', 'Zunic...","[1, 1, 4, 1, 2, 3, 1]"


In [10]:
def make_tag_lookup_table():
    iob_labels = ["B", "I"]
    # ner_labels = ["PER", "ORG", "LOC", "MISC"]
    ner_labels = ["PLAYER", "STAT"]
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ["-".join([a, b]) for a, b in all_labels]
    # all_labels = ["[PAD]", "O"] + all_labels
    all_labels = ["[PAD]", "O"] + all_labels
    return dict(zip(range(0, len(all_labels) + 1), all_labels))


mapping = make_tag_lookup_table()
print(mapping)

{0: '[PAD]', 1: 'O', 2: 'B-PLAYER', 3: 'I-PLAYER', 4: 'B-STAT', 5: 'I-STAT'}


In [11]:
def convert_text_to_list(text, type = str):
    if type == str:
        return [x[1:-1] for x in text.strip("[]").split(", ")]
    elif type == int:
        return [int(x) for x in text.strip("[]").split(", ")]
    else:
        raise Exception(f"type not supported: {str(type)}")

questions['tokens_parsed'] = questions.apply(lambda x: convert_text_to_list(x['tokens']), axis=1)
questions['ner_tags_parsed'] = questions.apply(lambda x: convert_text_to_list(x['ner_tags'], type=int), axis=1)

In [12]:
# all_tokens = sum(conll_data["train"]["tokens"], [])
# all_tokens = sum(tokens, [])
all_tokens = [token for question in questions['tokens_parsed'].tolist() for token in question]
all_tokens_array = np.array(list(map(str.lower, all_tokens)))

In [13]:
all_tags = [tag for ner_tags in questions['ner_tags_parsed'].tolist() for tag in ner_tags]
all_tags_array = np.array(list(all_tags))

In [14]:
set(all_tags_array)

{1, 2, 3, 4, 5}

In [15]:
# all_tokens = sum(conll_data["train"]["tokens"], [])
# all_tokens_array = np.array(list(map(str.lower, all_tokens)))

counter = Counter(all_tokens_array)
print(len(counter))

num_tags = len(mapping)
vocab_size = 20000

# We only take (vocab_size - 2) most commons words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# The StringLook class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(
    vocabulary=vocabulary
)

4221


In [16]:
# save vocabulary
with open(PATH_VOCABULARY, 'w') as f:
    for item in vocabulary:
        f.write("%s\n" % item)

In [17]:
# text = "how many field goals did michael jordan have"
# tokens = text.split(" ")
# lookup_layer(tf.strings.lower(tokens))

In [18]:
all_data = tf.data.TextLineDataset(PATH_TXT)
num_lines = sum(1 for line in open(PATH_TXT))
num_lines

1048506

In [19]:
print(list(all_data.take(1).as_numpy_iterator()))

[b'7\thow\tmany\tGP\tdid\tAlaa\tAbdelnaby\thave\t1\t1\t4\t1\t2\t3\t1']


In [20]:
# https://towardsdatascience.com/how-to-split-a-tensorflow-dataset-into-train-validation-and-test-sets-526c8dd29438
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, test_split=0.2, shuffle=True, shuffle_size=10000):
    
    assert (train_split + test_split) == 1
    
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        ds = ds.shuffle(shuffle_size, seed=12)
    
    train_size = int(train_split * ds_size)
    
    train_ds = ds.take(train_size)
    test_ds = ds.skip(train_size)
    
    return train_ds, test_ds

In [21]:
train_data, test_data = get_dataset_partitions_tf(all_data, num_lines)

In [22]:
def map_record_to_training_data(record):
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    # tags += 1
    return tokens, tags


def lowercase_and_convert_to_ids(tokens):
    tokens = tf.strings.lower(tokens)
    return lookup_layer(tokens)


# We use `padded_batch` here because each record in the dataset has a
# different length.
batch_size = 32
train_dataset = (
    train_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)
test_dataset = (
    test_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)

ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)

In [23]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=keras.losses.Reduction.NONE
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)


loss = CustomNonPaddingTokenLoss()

In [24]:
ner_model.compile(optimizer="adam", loss=loss)
ner_model.fit(train_dataset, epochs=10)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a4d1813130>

# Predict

In [34]:
def tokenize_and_convert_to_ids(text):
    tokens = text.split()
    return lowercase_and_convert_to_ids(tokens)


# Sample inference using the trained model
sample_input = tokenize_and_convert_to_ids(
    "how many field goals did michael jordan have"
)
sample_input = tf.reshape(sample_input, shape=[1, -1])
print(sample_input)

output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping[i] for i in prediction]

# eu -> B-ORG, german -> B-MISC, british -> B-MISC
print(prediction)

tf.Tensor([[  1   2  19  23   4  90 108   3]], shape=(1, 8), dtype=int64)
['O', 'O', 'B-STAT', 'I-STAT', 'O', 'B-PLAYER', 'I-PLAYER', 'O']


In [35]:
mapping

{0: '[PAD]', 1: 'O', 2: 'B-PLAYER', 3: 'I-PLAYER', 4: 'B-STAT', 5: 'I-STAT'}

# Model Summary

In [36]:
ner_model.summary()

Model: "ner_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_and_position_embeddin  multiple                 644096    
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  multiple                 21120     
 merBlock)                                                       
                                                                 
 dropout_2 (Dropout)         multiple                  0         
                                                                 
 dense_2 (Dense)             multiple                  2112      
                                                                 
 dropout_3 (Dropout)         multiple                  0         
                                                         

# Save Model

In [37]:
ner_model.save(PATH_MODEL)



INFO:tensorflow:Assets written to: ../nba-api-chatbot/models/ner\assets


INFO:tensorflow:Assets written to: ../nba-api-chatbot/models/ner\assets


# Calculate Metrics
* from conlleval import evaluate
    * did not work for me
* so I pulled the essential funcs from:<br>
https://github.com/sighsmile/conlleval/blob/master/conlleval.py

In [38]:
all_true_tag_ids, all_predicted_tag_ids = [], []

for x, y in test_dataset:
    
    output = ner_model.predict(x)
    predictions = np.argmax(output, axis=-1)
    predictions = np.reshape(predictions, [-1])

    true_tag_ids = np.reshape(y, [-1])

    mask = (true_tag_ids > 0) & (predictions > 0)
    true_tag_ids = true_tag_ids[mask]
    predicted_tag_ids = predictions[mask]

    all_true_tag_ids.append(true_tag_ids)
    all_predicted_tag_ids.append(predicted_tag_ids)

all_true_tag_ids = np.concatenate(all_true_tag_ids)
all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
real_tags = [mapping[tag] for tag in all_true_tag_ids]

# Evaluate

In [30]:
# evaluate(real_tags, predicted_tags, True)

In [39]:
from collections import defaultdict

true_seqs = real_tags
pred_seqs = predicted_tags

def is_chunk_start(prev_tag, tag):
    """
    check if a new chunk started between the previous and current word
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)

    if prefix2 == 'O':
        return False
    if prefix1 == 'O':
        return prefix2 != 'O'

    if chunk_type1 != chunk_type2:
        return True

    return prefix2 in ['B', 'S'] or prefix1 in ['E', 'S']

def is_chunk_end(prev_tag, tag):
    """
    check if the previous chunk ended between the previous and current word
    e.g. 
    (B-PER, I-PER) -> False
    (B-LOC, O)  -> True
    Note: in case of contradicting tags, e.g. (B-PER, I-LOC)
    this is considered as (B-PER, B-LOC)
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)

    if prefix1 == 'O':
        return False
    if prefix2 == 'O':
        return prefix1 != 'O'

    if chunk_type1 != chunk_type2:
        return True

    return prefix2 in ['B', 'S'] or prefix1 in ['E', 'S']

def split_tag(chunk_tag):
    """
    split chunk tag into IOBES prefix and chunk_type
    e.g. 
    B-PER -> (B, PER)
    O -> (O, None)
    """
    if chunk_tag == 'O':
        return ('O', None)
    return chunk_tag.split('-', maxsplit=1)

def count_chunks(true_seqs, pred_seqs):
    """
    true_seqs: a list of true tags
    pred_seqs: a list of predicted tags
    return: 
    correct_chunks: a dict (counter), 
                    key = chunk types, 
                    value = number of correctly identified chunks per type
    true_chunks:    a dict, number of true chunks per type
    pred_chunks:    a dict, number of identified chunks per type
    correct_counts, true_counts, pred_counts: similar to above, but for tags
    """
    correct_chunks = defaultdict(int)
    true_chunks = defaultdict(int)
    pred_chunks = defaultdict(int)

    correct_counts = defaultdict(int)
    true_counts = defaultdict(int)
    pred_counts = defaultdict(int)

    prev_true_tag, prev_pred_tag = 'O', 'O'
    correct_chunk = None

    for true_tag, pred_tag in zip(true_seqs, pred_seqs):
        if true_tag == pred_tag:
            correct_counts[true_tag] += 1
        true_counts[true_tag] += 1
        pred_counts[pred_tag] += 1

        _, true_type = split_tag(true_tag)
        _, pred_type = split_tag(pred_tag)

        if correct_chunk is not None:
            true_end = is_chunk_end(prev_true_tag, true_tag)
            pred_end = is_chunk_end(prev_pred_tag, pred_tag)

            if pred_end and true_end:
                correct_chunks[correct_chunk] += 1
                correct_chunk = None
            elif pred_end != true_end or true_type != pred_type:
                correct_chunk = None

        true_start = is_chunk_start(prev_true_tag, true_tag)
        pred_start = is_chunk_start(prev_pred_tag, pred_tag)

        if true_start and pred_start and true_type == pred_type:
            correct_chunk = true_type
        if true_start:
            true_chunks[true_type] += 1
        if pred_start:
            pred_chunks[pred_type] += 1

        prev_true_tag, prev_pred_tag = true_tag, pred_tag
    if correct_chunk is not None:
        correct_chunks[correct_chunk] += 1

    return (correct_chunks, true_chunks, pred_chunks, 
        correct_counts, true_counts, pred_counts)

(correct_chunks, true_chunks, pred_chunks, correct_counts, true_counts, pred_counts) = count_chunks(true_seqs, pred_seqs)

In [40]:
def calc_metrics(tp, p, t, percent=True):
    """
    compute overall precision, recall and FB1 (default values are 0.0)
    if percent is True, return 100 * original decimal value
    """
    precision = tp / p if p else 0
    recall = tp / t if t else 0
    fb1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    if percent:
        return 100 * precision, 100 * recall, 100 * fb1
    else:
        return precision, recall, fb1

def get_result(correct_chunks, true_chunks, pred_chunks,
    correct_counts, true_counts, pred_counts, verbose=True):
    """
    if verbose, print overall performance, as well as preformance per chunk type;
    otherwise, simply return overall prec, rec, f1 scores
    """
    # sum counts
    sum_correct_chunks = sum(correct_chunks.values())
    sum_true_chunks = sum(true_chunks.values())
    sum_pred_chunks = sum(pred_chunks.values())

    sum_correct_counts = sum(correct_counts.values())
    sum_true_counts = sum(true_counts.values())

    nonO_correct_counts = sum(v for k, v in correct_counts.items() if k != 'O')
    nonO_true_counts = sum(v for k, v in true_counts.items() if k != 'O')

    chunk_types = sorted(list(set(list(true_chunks) + list(pred_chunks))))

    # compute overall precision, recall and FB1 (default values are 0.0)
    prec, rec, f1 = calc_metrics(sum_correct_chunks, sum_pred_chunks, sum_true_chunks)
    res = (prec, rec, f1)
    if not verbose:
        return res

    # print overall performance, and performance per chunk type
    
    print("processed %i tokens with %i phrases; " % (sum_true_counts, sum_true_chunks), end='')
    print("found: %i phrases; correct: %i.\n" % (sum_pred_chunks, sum_correct_chunks), end='')
        
    print("accuracy: %6.2f%%; (non-O)" % (100*nonO_correct_counts/nonO_true_counts))
    print("accuracy: %6.2f%%; " % (100*sum_correct_counts/sum_true_counts), end='')
    print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" % (prec, rec, f1))

    # for each chunk type, compute precision, recall and FB1 (default values are 0.0)
    for t in chunk_types:
        prec, rec, f1 = calc_metrics(correct_chunks[t], pred_chunks[t], true_chunks[t])
        print("%17s: " %t , end='')
        print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" %
                    (prec, rec, f1), end='')
        print("  %d" % pred_chunks[t])

    return res
    # you can generate LaTeX output for tables like in
    # http://cnts.uia.ac.be/conll2003/ner/example.tex
    # but I'm not implementing this
    
# result = get_result(correct_chunks, true_chunks, pred_chunks,
#         correct_counts, true_counts, pred_counts)

In [41]:
def evaluate(true_seqs, pred_seqs, verbose=True):
    (correct_chunks, true_chunks, pred_chunks,
        correct_counts, true_counts, pred_counts) = count_chunks(true_seqs, pred_seqs)
    result = get_result(correct_chunks, true_chunks, pred_chunks,
        correct_counts, true_counts, pred_counts, verbose=verbose)
    return result

evaluate(real_tags, predicted_tags, True)

processed 1639707 tokens with 419404 phrases; found: 419591 phrases; correct: 419217.
accuracy:  99.98%; (non-O)
accuracy:  99.99%; precision:  99.91%; recall:  99.96%; FB1:  99.93
           PLAYER: precision:  99.82%; recall:  99.91%; FB1:  99.87  209889
             STAT: precision: 100.00%; recall: 100.00%; FB1: 100.00  209702


(99.91086558100626, 99.95541291928546, 99.9331342856632)