In [1]:
import json
# from src.flair_ner import flair_batch_predict, save_as_flair_fmt, load_flair_corpus, train_flair_ner, load_flair_ner

# Load the train/test splits. Later cells read each sample's 'tokens' and
# 'labels' entries. JSON text is UTF-8 by specification, so the encoding is
# pinned here instead of relying on the platform's locale default.
with open("data/train.json", encoding="utf-8") as f:
    train = json.load(f)

with open("data/test.json", encoding="utf-8") as f:
    test = json.load(f)

# Random model

It's always a good idea to start with a nonsense model to make sure the evaluation is working and to set a baseline.

If our approaches have similar results to the nonsense model, then something might be wrong.

In [2]:
import random
from src.utils import extract_spans, evaluate_ner

# Tags the random baseline may assign to a token.
labels_to_predict = ['O', 'B-SVC', 'B-ENV']

# Fixed seed so the baseline is the same on every run.
random.seed(42)


def _random_bio(tokens):
    """Draw one tag from `labels_to_predict` for every token, in order."""
    return [random.choice(labels_to_predict) for _ in tokens]


random_predictions = [
    {"token": sample['tokens'], "labels": extract_spans(_random_bio(sample['tokens']))}
    for sample in test
]

# Show the first two predictions as a sanity check.
random_predictions[:2]


[{'token': ['Tempo', 'Prod', 'Writes'], 'labels': [(0, 1, 'ENV')]},
 {'token': ['Cortex', 'us-prod', 'full', 'outage'],
  'labels': [(0, 1, 'ENV'), (1, 2, 'SVC')]}]

In [3]:
# Score the random baseline against the gold test labels. Judging by this
# cell's output, evaluate_ner prints the report itself; the returned value is
# kept so it can be printed again in the summary cell.
random_score = evaluate_ner(test, random_predictions)

              precision    recall  f1-score   support

         ENV     0.1250    0.2500    0.1667         8
         SVC     0.2500    0.2000    0.2222        10

   micro avg     0.1667    0.2222    0.1905        18
   macro avg     0.1875    0.2250    0.1944        18
weighted avg     0.1944    0.2222    0.1975        18



# Regex model

Simple models such as regexes or heuristics can sometimes work surprisingly well, especially when we don't have sufficient data.

In this example, we don't actually need training data, since we should be able to just generate a list of the services and environments we have.

In [4]:
from src.regex_ner import RegexModel

# Gather every distinct service / environment name labelled in the training set.
services = set()
environments = set()

# Each span's text goes to the set matching its tag; other tags are ignored.
names_by_tag = {'SVC': services, 'ENV': environments}

for sample in train:
    tokens = sample['tokens']
    for span in sample['labels']:
        bucket = names_by_tag.get(span[2])
        if bucket is None:
            continue
        # A span is (start, end, tag) with an exclusive end index.
        bucket.add(" ".join(tokens[span[0]:span[1]]))

services, environments

({'Alerting',
  'Alertmanager',
  'Billing',
  'Cloud Alerting',
  'CloudNAT',
  'CloudSQL',
  'Codecov',
  'Cortex',
  'CortexTank',
  'Drone',
  'Enterprise',
  'GEM',
  'Grafana',
  'Grafanacom',
  'Grafanas',
  'Graphite',
  'HG',
  'HG CloudSQL',
  'Hosted Grafana',
  'Ingester',
  'Logs',
  'Loki',
  'ML',
  'Machine Learning',
  'Metamonitoring',
  'Metrictank',
  'Mimir',
  'OnCall',
  'Prometheus',
  'Remote Write',
  'Ruler',
  'SM',
  'SMTP Proxy',
  'Stack Driver NAT',
  'Stack State',
  'Stacks',
  'Synthetic Monitoring',
  'Tempo',
  'WorldPing',
  'billing',
  'cortex',
  'cortex-blocks',
  'grafana com',
  'grafana-com',
  'ingester',
  'loki',
  'ruler',
  'synthetic-monitoring'},
 {'Azure',
  'EU',
  'Prod',
  'Production',
  'billing',
  'cortex-dedicated-08',
  'cortex-prod-04',
  'cortex-prod-10',
  'free cluster',
  'loki-prod',
  'loki-prod-3',
  'ops',
  'prod',
  'prod-eu-west-0',
  'prod-eu-west-0/loki-prod',
  'prod-us-central',
  'prod-us-central-0',
  'us-c

In [5]:

# Fit the dictionary-style model on the names collected from the training set.
regex_model = RegexModel()
regex_model.fit(services, environments)

regex_bio = regex_model.predict(test)

# Element [1] of each per-token prediction is used as the BIO tag
# (presumably predict() yields (token, tag) pairs - confirm in src.regex_ner).
regex_predictions = []
for sample, tagged_tokens in zip(test, regex_bio):
    bio_tags = [pair[1] for pair in tagged_tokens]
    regex_predictions.append(
        {"token": sample['tokens'], "labels": extract_spans(bio_tags)}
    )

regex_score = evaluate_ner(test, regex_predictions)


Tempo
['Tempo'] 0 0 Tempo Prod Writes
Prod
['Prod'] 1 1 Tempo Prod Writes
Cortex
['Cortex'] 0 0 Cortex us-prod full outage
prod
['prod'] 2 2 Cortex us-prod full outage
loki
['loki'] 0 0 loki-prod3 write outage
Alertmanager
['Alertmanager'] 0 0 Alertmanager UI not working
loki
['loki'] 0 0 loki ops ingester panics
ingester
['ingester'] 2 2 loki ops ingester panics
ops
['ops'] 1 1 loki ops ingester panics
us-central1
['us-central1'] 0 0 us-central1 prod checkpoint failed
prod
['prod'] 1 1 us-central1 prod checkpoint failed
Billing
['Billing'] 0 0 Billing Prometheus Down
Prometheus
['Prometheus'] 1 1 Billing Prometheus Down
Billing
['Billing'] 0 0 Billing Prometheus Down
cortex
['cortex'] 0 0 cortex-dedicated-04 write path
SM
['SM'] 0 0 SM Vultr Probe Outage
Cortex
['Cortex'] 0 0 Cortex Billing Ruler Issues
Billing
['Billing'] 1 1 Cortex Billing Ruler Issues
Ruler
['Ruler'] 2 2 Cortex Billing Ruler Issues
Billing
['Billing'] 1 1 Cortex Billing Ruler Issues
Alertmanager
['Alertmanager'] 0 

# Deep Learning model

This is a transformer-based model, built on technology similar to that behind LLMs/Generative AI.

In [6]:
from src.flair_ner import flair_batch_predict, save_as_flair_fmt, load_flair_corpus, load_flair_ner, train_flair_ner

# Write both splits in flair format, then load the directory as a corpus.
for split, out_path in ((train, "data/train.txt"), (test, "data/test.txt")):
    save_as_flair_fmt(split, out_path)

corpus = load_flair_corpus("data")


2024-06-03 21:50:33,182 Reading data from data
2024-06-03 21:50:33,183 Train: data/train.txt
2024-06-03 21:50:33,184 Dev: None
2024-06-03 21:50:33,184 Test: data/test.txt
2024-06-03 21:50:33,188 No dev split found. Using 0% (i.e. 10 samples) of the train split as dev data


In [7]:
# Directory holding the trained flair model (best-model.pt is loaded from here).
save_path = "models/flair-ner"

# Train the model
# train_flair_ner(corpus, save_path, max_epochs=200, base_model="prajjwal1/bert-mini")

# Or get the pre-trained model from Google Cloud Storage.
# wget's -O sets the output file (-o would only set the log file), and IPython
# expands {save_path} itself, so no f"..." wrapper is needed.
# !mkdir -p {save_path}
# !wget https://storage.googleapis.com/edwardqian-dev/workshop/best-model.pt -O {save_path}/best-model.pt

In [8]:
from flair.data import Sentence

# Load the tagger saved under `save_path`.
flair_model = load_flair_ner(f"{save_path}/best-model.pt")

# The flair model takes Sentence objects, so rebuild each sample's text from its tokens.
test_texts = (" ".join(d["tokens"]) for d in test)
test_sentences = [Sentence(text) for text in test_texts]

flair_pred = flair_batch_predict(
    flair_model,
    test_sentences,
    batch_size=1,
)

flair_score = evaluate_ner(test, flair_pred)

2024-06-03 21:50:33,611 SequenceTagger predicts: Dictionary with 9 tags: O, S-SVC, B-SVC, E-SVC, I-SVC, S-ENV, B-ENV, E-ENV, I-ENV
              precision    recall  f1-score   support

         ENV     0.5000    0.2500    0.3333         8
         SVC     0.6000    0.9000    0.7200        10

   micro avg     0.5789    0.6111    0.5946        18
   macro avg     0.5500    0.5750    0.5267        18
weighted avg     0.5556    0.6111    0.5481        18



# Generative AI Model

What if we use generative AI to solve this problem?

In [9]:
# The live GPT call is commented out; its result is pasted below as a literal.
# from src.gpt_ner import gpt_ner
# gpt_bio = gpt_ner(str([" ".join(d["tokens"]) for d in test]), model="gpt-4-turbo")

# One BIO tag sequence per test sample, in the same order as `test`.
# NOTE(review): every row here has exactly 6 tags, while the test sentences
# shown in this notebook's outputs have 3-4 tokens, and these tags score 0.0
# on every metric. The tags look misaligned with `test` - verify before
# drawing conclusions from the GPT score.
gpt_bio = [['B-SVC', 'I-SVC', 'O', 'O', 'O', 'B-ENV'],
 ['B-SVC', 'I-SVC', 'B-ENV', 'I-ENV', 'I-ENV', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'I-ENV', 'O'],
 ['B-SVC', 'I-SVC', 'O', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'I-SVC', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'O', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'I-SVC', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'O', 'O'],
 ['B-SVC', 'I-SVC', 'I-SVC', 'I-SVC', 'I-SVC', 'O']]

In [10]:
# Turn each GPT BIO tag sequence into span predictions for its test sample.
# zip() stops at the shorter input, so a missing or extra row in `gpt_bio`
# is dropped silently rather than raising.
gpt_predictions = [
    {"token": sample['tokens'], "labels": extract_spans(bio_tags)}
    for sample, bio_tags in zip(test, gpt_bio)
]

# The bare `gpt_predictions` expression that used to sit here was removed:
# it was not the cell's last expression, so it displayed nothing.
gpt_score = evaluate_ner(test, gpt_predictions)

              precision    recall  f1-score   support

         ENV     0.0000    0.0000    0.0000       8.0
         SVC     0.0000    0.0000    0.0000      10.0

   micro avg     0.0000    0.0000    0.0000      18.0
   macro avg     0.0000    0.0000    0.0000      18.0
weighted avg     0.0000    0.0000    0.0000      18.0



In [11]:
# Print every model's evaluation report one after another for comparison.
score_reports = (
    ("Random Model", random_score),
    ("Regex Model", regex_score),
    ("Flair Model", flair_score),
    ("GPT Model", gpt_score),
)

for model_name, report in score_reports:
    print(model_name)
    print(report)

Random Model
              precision    recall  f1-score   support

         ENV     0.1250    0.2500    0.1667         8
         SVC     0.2500    0.2000    0.2222        10

   micro avg     0.1667    0.2222    0.1905        18
   macro avg     0.1875    0.2250    0.1944        18
weighted avg     0.1944    0.2222    0.1975        18

Regex Model
              precision    recall  f1-score   support

         ENV     0.5714    0.5000    0.5333         8
         SVC     0.8333    1.0000    0.9091        10

   micro avg     0.7368    0.7778    0.7568        18
   macro avg     0.7024    0.7500    0.7212        18
weighted avg     0.7169    0.7778    0.7421        18

Flair Model
              precision    recall  f1-score   support

         ENV     0.5000    0.2500    0.3333         8
         SVC     0.6000    0.9000    0.7200        10

   micro avg     0.5789    0.6111    0.5946        18
   macro avg     0.5500    0.5750    0.5267        18
weighted avg     0.5556    0.6111    

## Inspect the incorrect predictions

Looking at scores is great, but they don't tell the whole story.

It is ALWAYS a good idea to look at the actual data.

In [12]:
# Looking at some sample incorrect predictions

def print_sample_errors(true, pred, n=5):
    """Print up to ``n`` samples whose predicted spans differ from the gold spans.

    Args:
        true: Gold samples. Each is a dict with "tokens" (list of str) and
            "labels" (spans indexable as (start, end, tag), end exclusive).
        pred: Predictions aligned one-to-one with ``true``. Each is a dict
            with a "labels" entry in the same span format.
        n: Maximum number of mismatching samples to print. ``None`` prints
            every mismatch; zero or a negative value prints nothing.
    """
    # Previously the limit was only tested after a decrement, so n <= 0
    # never hit zero and printed every mismatch instead of none.
    if n is not None and n <= 0:
        return

    def describe(tokens, spans):
        # Render each span as "<covered text> (<tag>)".
        return [f"{' '.join(tokens[s[0]:s[1]])} ({s[2]})" for s in spans]

    shown = 0
    for t, p in zip(true, pred):
        t_spans = describe(t["tokens"], t["labels"])
        # Predicted spans are rendered against the gold tokens.
        p_spans = describe(t["tokens"], p["labels"])

        # Span order does not matter, only the set of rendered spans.
        if sorted(t_spans) == sorted(p_spans):
            continue

        print(f"Text: {' '.join(t['tokens'])}")
        print(f"True: {t_spans}")
        print(f"Pred: {p_spans}")
        print()

        shown += 1
        if n is not None and shown >= n:
            break

# Show sample mismatches on the test split for each model.
for model_name, predictions in (
    ("regex_model", regex_predictions),
    ("flair_model", flair_pred),
    ("gpt_model", gpt_predictions),
):
    print(model_name)
    print_sample_errors(test, predictions)

regex_model
Text: Cortex us-prod full outage
True: ['Cortex (SVC)', 'us-prod (ENV)']
Pred: ['Cortex (SVC)', 'full (ENV)']

Text: loki-prod3 write outage
True: ['loki-prod3 (ENV)']
Pred: ['loki-prod3 (SVC)']

Text: us-central1 prod checkpoint failed
True: ['us-central1 prod (ENV)']
Pred: ['us-central1 (ENV)', 'prod (ENV)']

Text: cortex-dedicated-04 write path
True: ['cortex-dedicated-04 (ENV)']
Pred: ['cortex-dedicated-04 (SVC)']

flair_model
Text: loki-prod3 write outage
True: ['loki-prod3 (ENV)']
Pred: ['loki-prod3 (SVC)']

Text: loki ops ingester panics
True: ['loki (SVC)', 'ops (ENV)', 'ingester (SVC)']
Pred: ['loki (SVC)', 'ops (SVC)']

Text: us-central1 prod checkpoint failed
True: ['us-central1 prod (ENV)']
Pred: ['us-central1 (ENV)', 'prod (ENV)']

Text: Billing Prometheus Down
True: ['Billing (ENV)', 'Prometheus (SVC)']
Pred: ['Billing (SVC)', 'Prometheus (SVC)']

Text: cortex-dedicated-04 write path
True: ['cortex-dedicated-04 (ENV)']
Pred: ['cortex-dedicated-04 (SVC)']

gpt_

## Out of distribution data

Our previous test data is randomly split from the training data.

This data is good for development, but it is not a true measure of performance on data that the model has not seen before.

Let's see what happens if we evaluate our models on a completely different type of data.

In [13]:
# Hand-written out-of-distribution samples, in the same shape the other cells
# read from `test`: "tokens" is the token list and "labels" holds
# (start, end, tag) spans with an exclusive end index.
# NOTE(review): some tokens keep trailing punctuation ("prod-2,", "dev,"),
# and the first sample labels only "Kube API" of "Kube API Server" as the
# service - confirm both are intended.
out_of_dist_labels = [
    {"tokens": ["Kube", "API", "Server", "Down", "in", "preprod"],
     "labels": [(0, 2, "SVC"), (5, 6, "ENV")]},
    {"tokens": ["Metrics", "Server", "Scaling", "Issues", "for", "test-na-4"],
     "labels": [(0, 2, "SVC"), (5, 6, "ENV")]},
    {"tokens": ["Redshift", "me-south-3", "Data", "Corruption"],
     "labels": [(0, 1, "SVC"), (1, 2, "ENV")]},
    {"tokens": ["Elasticsearch", "Cluster", "Overload"],
     "labels": [(0, 1, "SVC")]},
    {"tokens": ["Kafka", "Stream", "Delays", "in", "sandbox", "env"],
     "labels": [(0, 1, "SVC"), (4, 5, "ENV")]},
    {"tokens": ["Redis", "Cache", "Failure", "in", "prod-2,", "dev,", "and", "staging"],
     "labels": [(0, 2, "SVC"), (4, 5, "ENV"), (5, 6, "ENV"), (7, 8, "ENV")]},
    {"tokens": ["MongoDB", "Backup", "Failures"], "labels": [(0, 1, "SVC")]},
    {"tokens": ["Nginx", "Proxy", "Timeout"], "labels": [(0, 2, "SVC")]},
    {"tokens": ["Docker", "Registry", "Unreachable"], "labels": [(0, 2, "SVC")]},
    {"tokens": ["GitLab", "CI", "Pipeline", "Stuck"], "labels": [(0, 2, "SVC")]}
]

In [14]:
# Regex model on the out-of-distribution samples.
regex_ood_bio = regex_model.predict(out_of_dist_labels)

# Element [1] of each per-token prediction is used as the BIO tag.
regex_ood_predictions = []
for sample, tagged_tokens in zip(out_of_dist_labels, regex_ood_bio):
    bio_tags = [pair[1] for pair in tagged_tokens]
    regex_ood_predictions.append(
        {"token": sample['tokens'], "labels": extract_spans(bio_tags)}
    )

ood_regex_score = evaluate_ner(out_of_dist_labels, regex_ood_predictions)

prod
['prod'] 4 4 Redis Cache Failure in prod-2, dev, and staging
              precision    recall  f1-score   support

         SVC     0.0000    0.0000    0.0000        10
         ENV     1.0000    0.1429    0.2500         7

   micro avg     1.0000    0.0588    0.1111        17
   macro avg     0.5000    0.0714    0.1250        17
weighted avg     0.4118    0.0588    0.1029        17



In [15]:
from flair.data import Sentence
# Wrap each out-of-distribution sample's text as a flair Sentence for the tagger.
ood_flair_labels = [
    Sentence(" ".join(sample['tokens']))
    for sample in out_of_dist_labels
]
ood_flair_pred = flair_batch_predict(
    flair_model,
    ood_flair_labels,
    batch_size=4,
)
ood_flair_score = evaluate_ner(out_of_dist_labels, ood_flair_pred)

              precision    recall  f1-score   support

         ENV     1.0000    0.1429    0.2500         7
         SVC     0.2857    0.2000    0.2353        10

   micro avg     0.3750    0.1765    0.2400        17
   macro avg     0.6429    0.1714    0.2426        17
weighted avg     0.5798    0.1765    0.2413        17



In [16]:
# The live GPT call is commented out; its result is pasted below as a literal.
# from src.gpt_ner import gpt_ner
# ood_gpt_bio = gpt_ner(str([" ".join(d["tokens"]) for d in out_of_dist_labels]), model="gpt-4-turbo")

# One BIO tag sequence per out-of-distribution sample, in the same order.
ood_gpt_bio = [
    ['B-SVC', 'I-SVC', 'I-SVC', 'O', 'O', 'B-ENV'],
    ['B-SVC', 'I-SVC', 'O', 'O', 'O', 'B-ENV'],
    ['B-SVC', 'B-ENV', 'O', 'O'],
    ['B-SVC', 'I-SVC', 'O'],
    ['B-SVC', 'I-SVC', 'O', 'O', 'B-ENV', 'I-ENV'],
    ['B-SVC', 'I-SVC', 'O', 'O', 'B-ENV', 'B-ENV', 'B-ENV', 'B-ENV'],
    ['B-SVC', 'O', 'O'],
    ['B-SVC', 'I-SVC', 'O'],
    ['B-SVC', 'I-SVC', 'O'],
    ['B-SVC', 'I-SVC', 'I-SVC', 'O'],
]

# Convert each tag sequence into span predictions for its sample.
ood_gpt_predictions = []
for sample, bio_tags in zip(out_of_dist_labels, ood_gpt_bio):
    ood_gpt_predictions.append(
        {"token": sample['tokens'], "labels": extract_spans(bio_tags)}
    )

ood_gpt_score = evaluate_ner(out_of_dist_labels, ood_gpt_predictions)

              precision    recall  f1-score   support

         ENV     0.7500    0.8571    0.8000         7
         SVC     0.6000    0.6000    0.6000        10

   micro avg     0.6667    0.7059    0.6857        17
   macro avg     0.6750    0.7286    0.7000        17
weighted avg     0.6618    0.7059    0.6824        17



In [17]:
# Show sample mismatches on the out-of-distribution data for each model.
for model_name, predictions in (
    ("regex_model", regex_ood_predictions),
    ("flair_model", ood_flair_pred),
    ("gpt4", ood_gpt_predictions),
):
    print(model_name)
    print_sample_errors(out_of_dist_labels, predictions)

regex_model
Text: Kube API Server Down in preprod
True: ['Kube API (SVC)', 'preprod (ENV)']
Pred: []

Text: Metrics Server Scaling Issues for test-na-4
True: ['Metrics Server (SVC)', 'test-na-4 (ENV)']
Pred: []

Text: Redshift me-south-3 Data Corruption
True: ['Redshift (SVC)', 'me-south-3 (ENV)']
Pred: []

Text: Elasticsearch Cluster Overload
True: ['Elasticsearch (SVC)']
Pred: []

Text: Kafka Stream Delays in sandbox env
True: ['Kafka (SVC)', 'sandbox (ENV)']
Pred: []

flair_model
Text: Kube API Server Down in preprod
True: ['Kube API (SVC)', 'preprod (ENV)']
Pred: ['Kube (SVC)']

Text: Metrics Server Scaling Issues for test-na-4
True: ['Metrics Server (SVC)', 'test-na-4 (ENV)']
Pred: ['Metrics (SVC)']

Text: Redshift me-south-3 Data Corruption
True: ['Redshift (SVC)', 'me-south-3 (ENV)']
Pred: []

Text: Kafka Stream Delays in sandbox env
True: ['Kafka (SVC)', 'sandbox (ENV)']
Pred: ['Kafka (SVC)']

Text: Redis Cache Failure in prod-2, dev, and staging
True: ['Redis Cache (SVC)', 'pr