In [None]:
import json
# from src.flair_ner import flair_batch_predict, save_as_flair_fmt, load_flair_corpus, train_flair_ner, load_flair_ner

# Load the train / test splits. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so decoding does not depend on the platform locale.
with open("data/train.json", encoding="utf-8") as f:
    train = json.load(f)

with open("data/test.json", encoding="utf-8") as f:
    test = json.load(f)

# Random model

It's always a good idea to start with a nonsense model to make sure the evaluation is working and to set a baseline.

If our approaches have similar results to the non-sense model, then something might be wrong.

In [None]:
import random
from src.utils import extract_spans, evaluate_ner

# Tag inventory the random baseline samples from (one tag per token).
labels_to_predict = ['O', 'B-SVC', 'B-ENV']

# Fixed seed so the baseline is reproducible from run to run.
random.seed(42)

random_predictions = []
for sample in test:
    random_predictions.append({
        "token": sample['tokens'],
        # One uniformly random tag per token, then converted into spans.
        "labels": extract_spans([random.choice(labels_to_predict) for _ in sample['tokens']]),
    })

# Peek at the first two predictions.
random_predictions[:2]


In [None]:
# Score the random baseline's predictions against the test set.
random_score = evaluate_ner(test, random_predictions)

# Regex model

Simple models such as regexes or heuristics can work surprisingly well, especially in cases where we don't have sufficient data.

In this example, we don't actually need training data, since we should be able to just generate a list of the services and environments we have.

In [None]:
from src.regex_ner import RegexModel

# Gather every distinct service / environment string labelled in the training set.
services = set()
environments = set()

# Route each span to its set by the entity tag stored at span[2].
entity_sets = {'SVC': services, 'ENV': environments}

for sample in train:
    for span in sample['labels']:
        # span[0]:span[1] is used as a token slice; the tokens are re-joined with spaces.
        surface_form = " ".join(sample['tokens'][span[0]:span[1]])
        bucket = entity_sets.get(span[2])
        if bucket is not None:
            bucket.add(surface_form)

services, environments

In [None]:

# Build the regex model from the known service / environment names.
regex_model = RegexModel()
regex_model.fit(services, environments)

regex_bio = regex_model.predict(test)

# Index 1 of each predicted item is passed on as the BIO tag
# (presumably (token, tag) pairs -- confirm against RegexModel.predict).
regex_predictions = []
for sample, tagged in zip(test, regex_bio):
    regex_predictions.append({
        "token": sample['tokens'],
        "labels": extract_spans([item[1] for item in tagged]),
    })

regex_score = evaluate_ner(test, regex_predictions)


# Deep Learning model

This is a transformer-based model, similar to the technology behind LLMs/Generative AI.

In [None]:
from src.flair_ner import flair_batch_predict, save_as_flair_fmt, load_flair_corpus, load_flair_ner, train_flair_ner

# Write both splits to disk in flair's format, then load the "data" folder as a corpus.
for split, out_path in ((train, "data/train.txt"), (test, "data/test.txt")):
    save_as_flair_fmt(split, out_path)

corpus = load_flair_corpus("data")


In [None]:
# Directory holding the flair model; a later cell loads f"{save_path}/best-model.pt".
save_path = "models/flair-ner"

# Train the model
# train_flair_ner(corpus, save_path, max_epochs=200, base_model="prajjwal1/bert-mini")

# get pre-trained model from Google Cloud Storage
# (wget's -O sets the output file; lowercase -o only redirects wget's log messages.
#  IPython expands {save_path} inside "!" commands, so no f-string is needed.
#  Make sure the target directory exists before downloading.)
# !wget https://storage.googleapis.com/edwardqian-dev/workshop/best-model.pt -O {save_path}/best-model.pt

In [None]:
# Load the saved flair tagger from the model directory.
flair_model = load_flair_ner(f"{save_path}/best-model.pt")

from flair.data import Sentence

# Each test sample becomes one Sentence built from its space-joined tokens.
test_sentences = []
for sample in test:
    test_sentences.append(Sentence(" ".join(sample["tokens"])))

# Predict one sentence per batch, then score against the test labels.
flair_pred = flair_batch_predict(flair_model, test_sentences, batch_size=1)

flair_score = evaluate_ner(test, flair_pred)

# Generative AI Model

What if we use generative AI to solve this problem?

In [None]:
# The live GPT call is left commented out; the BIO tags are hard-coded below instead
# (presumably captured from an earlier gpt_ner run -- confirm).
# from src.gpt_ner import gpt_ner
# gpt_bio = gpt_ner(str([" ".join(d["tokens"]) for d in test]), model="gpt-4-turbo")


# One list of BIO tags per test sentence; a later cell zips these with `test`.
gpt_bio = [['B-SVC', 'I-SVC', 'O', 'O', 'O', 'B-ENV'],
 ['B-SVC', 'I-SVC', 'B-ENV', 'I-ENV', 'I-ENV', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'I-ENV', 'O'],
 ['B-SVC', 'I-SVC', 'O', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'I-SVC', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'O', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'I-SVC', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'I-SVC', 'I-SVC', 'O']]

In [None]:
# Pair every test sample with its hard-coded BIO sequence and convert the tags to spans.
gpt_predictions = []
for sample, bio_tags in zip(test, gpt_bio):
    gpt_predictions.append({
        "token": sample['tokens'],
        "labels": extract_spans(bio_tags),
    })
# Bare expression kept as-is; it is not the last statement of the cell,
# so with default notebook settings it is not rendered.
gpt_predictions

gpt_score = evaluate_ner(test, gpt_predictions)

In [None]:
# Looking at all the results

# Print each model's title followed by its score, in a fixed order.
for model_title, model_score in (
    ("Random Model", random_score),
    ("Regex Model", regex_score),
    ("Flair Model", flair_score),
    ("GPT Model", gpt_score),
):
    print(model_title)
    print(model_score)

## Inspect the incorrect predictions

Looking at scores is great, but they don't tell the whole story.

It is ALWAYS a good idea to look at the actual data.

In [None]:
# Looking at some sample incorrect predictions

def print_sample_errors(true, pred, n=5):
    """Print up to ``n`` examples whose predicted spans differ from the true spans.

    Examples are compared pairwise in order. For each pair the spans are
    rendered as ``"<span text> (<tag>)"`` strings, using the tokens of the
    *true* example for both sides, and the two sorted lists are compared.

    Args:
        true: Iterable of gold examples. Each is a mapping with ``"tokens"``
            (list of str) and ``"labels"`` (iterable of spans indexable as
            ``(start, end, tag)``, where ``start:end`` is a token slice).
        pred: Iterable of predictions aligned with ``true``. Each is a
            mapping with a ``"labels"`` entry in the same span format.
        n: Maximum number of mismatching examples to print. ``None`` prints
            every mismatch; zero or a negative value prints nothing.

    Returns:
        None. Output is written to stdout.
    """
    # Guard first: a countdown compared with ``== 0`` would never stop when it
    # starts at zero or below.
    if n is not None and n <= 0:
        return

    shown = 0
    for t, p in zip(true, pred):
        tokens = t["tokens"]
        t_spans = [f"{' '.join(tokens[s[0]:s[1]])} ({s[2]})" for s in t["labels"]]
        p_spans = [f"{' '.join(tokens[s[0]:s[1]])} ({s[2]})" for s in p["labels"]]

        # Order-insensitive comparison: only the set of rendered spans matters.
        if sorted(t_spans) == sorted(p_spans):
            continue

        print(f"Text: {' '.join(tokens)}")
        print(f"True: {t_spans}")
        print(f"Pred: {p_spans}")
        print()

        shown += 1
        if n is not None and shown >= n:
            break

# Show sample mismatches on the test set for each model, under a heading per model.
for model_name, model_predictions in (
    ("regex_model", regex_predictions),
    ("flair_model", flair_pred),
    ("gpt_model", gpt_predictions),
):
    print(model_name)
    print_sample_errors(test, model_predictions)

## Sample production data

Our previous test data is randomly split from the training data.

This data is good for development, but is not a true measure of data that the model has not seen before.

Let's see what happens if we evaluate our model on a completely different dataset.

This data helps us bridge the gap between offline and online predictions.

In [None]:
# Hand-written evaluation examples. Each entry has "tokens" plus "labels" given as
# (start, end, tag) spans; elsewhere in this notebook spans are used as the token
# slice tokens[start:end], i.e. the end index is exclusive.
prod_eval_labels = [
    # NOTE(review): (0, 2) covers "Kube API" and leaves out "Server" -- confirm this is intended.
    {"tokens": ["Kube", "API", "Server", "Down", "in", "preprod"],
     "labels": [(0, 2, "SVC"), (5, 6, "ENV")]},
    {"tokens": ["Metrics", "Server", "Scaling", "Issues", "for", "test-na-4"],
     "labels": [(0, 2, "SVC"), (5, 6, "ENV")]},
    {"tokens": ["Redshift", "me-south-3", "Data", "Corruption"],
     "labels": [(0, 1, "SVC"), (1, 2, "ENV")]},
    {"tokens": ["Elasticsearch", "Cluster", "Overload"],
     "labels": [(0, 1, "SVC")]},
    {"tokens": ["Kafka", "Stream", "Delays", "in", "sandbox", "env"],
     "labels": [(0, 1, "SVC"), (4, 5, "ENV")]},
    # The environment tokens here keep their trailing commas ("prod-2,", "dev,").
    {"tokens": ["Redis", "Cache", "Failure", "in", "prod-2,", "dev,", "and", "staging"],
     "labels": [(0, 2, "SVC"), (4, 5, "ENV"), (5, 6, "ENV"), (7, 8, "ENV")]},
    {"tokens": ["MongoDB", "Backup", "Failures"], "labels": [(0, 1, "SVC")]},
    {"tokens": ["Nginx", "Proxy", "Timeout"], "labels": [(0, 2, "SVC")]},
    {"tokens": ["Docker", "Registry", "Unreachable"], "labels": [(0, 2, "SVC")]},
    {"tokens": ["GitLab", "CI", "Pipeline", "Stuck"], "labels": [(0, 2, "SVC")]}
]

In [None]:
# Regex model
regex_prod_eval_bio = regex_model.predict(prod_eval_labels)

# Index 1 of each predicted item is passed on as the BIO tag
# (presumably (token, tag) pairs -- confirm against RegexModel.predict).
regex_prod_eval_predictions = []
for sample, tagged in zip(prod_eval_labels, regex_prod_eval_bio):
    regex_prod_eval_predictions.append({
        "token": sample['tokens'],
        "labels": extract_spans([item[1] for item in tagged]),
    })

prod_eval_regex_score = evaluate_ner(prod_eval_labels, regex_prod_eval_predictions)

In [None]:
from flair.data import Sentence
# Wrap each production sample's space-joined tokens in a flair Sentence.
prod_eval_flair_labels = []
for sample in prod_eval_labels:
    prod_eval_flair_labels.append(Sentence(" ".join(sample['tokens'])))

# Predict in batches of 4, then score against the hand-written spans.
prod_eval_flair_pred = flair_batch_predict(flair_model, prod_eval_flair_labels, batch_size=4)
prod_eval_flair_score = evaluate_ner(prod_eval_labels, prod_eval_flair_pred)

In [None]:
# The live GPT call is left commented out; the BIO tags are hard-coded below instead
# (presumably captured from an earlier gpt_ner run -- confirm).
# from src.gpt_ner import gpt_ner
# prod_eval_gpt_bio = gpt_ner(str([" ".join(d["tokens"]) for d in prod_eval_labels]), model="gpt-4-turbo")

# One list of BIO tags per production example, in the same order as prod_eval_labels.
prod_eval_gpt_bio = [['B-SVC', 'I-SVC', 'I-SVC', 'O', 'O', 'B-ENV'],
 ['B-SVC', 'I-SVC', 'O', 'O', 'O', 'B-ENV'],
 ['B-SVC', 'B-ENV', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'O'],
 ['B-SVC', 'I-SVC', 'O', 'O', 'B-ENV', 'I-ENV'],
 ['B-SVC', 'I-SVC', 'O', 'O', 'B-ENV', 'B-ENV', 'B-ENV', 'B-ENV'],
 ['B-SVC', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'O'],
 ['B-SVC', 'I-SVC', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O']]

# Convert each hard-coded BIO sequence to spans, paired with its production sample.
prod_eval_gpt_predictions = []
for sample, bio_tags in zip(prod_eval_labels, prod_eval_gpt_bio):
    prod_eval_gpt_predictions.append({
        "token": sample['tokens'],
        "labels": extract_spans(bio_tags),
    })

prod_eval_gpt_score = evaluate_ner(prod_eval_labels, prod_eval_gpt_predictions)

In [None]:
# Looking at some sample incorrect predictions

# print_sample_errors needs the per-sentence prediction dicts (each with a
# "labels" entry), not the values returned by evaluate_ner.
print("regex_model")
print_sample_errors(prod_eval_labels, regex_prod_eval_predictions)
print("flair_model")
print_sample_errors(prod_eval_labels, prod_eval_flair_pred)
print("gpt4")
print_sample_errors(prod_eval_labels, prod_eval_gpt_predictions)

# Model Selection
## Deciding on the model

1. Regex
This approach is really simple, easy to scale, and can be integrated into any stack fairly easily (we can just port the regex into any language).

Regex did well on the original test set but poorly on the out of distribution set.

2. Flair
This approach strongly prefers a Python backend, and we need specific scaffolding to scale and store the model.

The flair model did a little better than regex on the original test set and much better than the regex on the out of distribution set (although still quite poor).

3. GPT4
This approach requires very little actual training data and can be integrated into any stack, since it's just a request.

It's considerably more expensive per request and has much higher latency (prediction can take ~10 seconds).

GPT4 did reasonably well on the out-of-distribution dataset but fairly poorly on the original test dataset.

**Question**: what might be some reasons to choose one model over the other?