# Calculating metrics for explorer runs

Span-level precision, recall and F1 are computed using [nervaluate](https://github.com/MantisAI/nervaluate).

In [1]:
import sys

!{sys.executable} -m pip install nervaluate

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import ast
import glob
import os
import hashlib
import itertools
from collections import defaultdict

import argilla as rg
import numpy as np
import pandas as pd
from dotenv import load_dotenv, find_dotenv
import spacy
import math
from functools import partial
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create mapping from old to new IDs
# NOTE(review): this is an absolute path on one machine; the notebook will not run
# elsewhere without editing it.
mapping_path = "/Users/kalyan/Documents/CPR/unfccc-global-stocktake-documents/notebooks/old-to-new-dataset-mapping.csv"
mapping_df = pd.read_csv(mapping_path)

# Pair the two ID columns directly instead of going through DataFrame.iterrows(),
# which builds a Series per row (slow) and does not preserve column dtypes.
# As before, if an old ID appears more than once the last row wins.
ids_mapping = dict(
    zip(mapping_df["document_id_old"], mapping_df["document_id_new"])
)

In [4]:
# Load environment variables from the nearest .env file; ARGILLA_API_KEY is read
# from the environment below.
load_dotenv(find_dotenv())

# Blank English spaCy pipeline.
# NOTE(review): `nlp` is not referenced in the cells visible here - confirm it is still needed.
nlp = spacy.blank("en")

# Name of the Argilla dataset that holds the human annotations
DATASET_NAME = "explorer-quality-testing"

# Connect to the "gst" Argilla workspace and load only records matching "status:Validated"
rg.init(workspace="gst", api_key=os.environ["ARGILLA_API_KEY"])
rg_dataset = rg.load(DATASET_NAME, query="status:Validated")

# List of subdirectories to include
# NOTE(review): `concepts` is not referenced in the cells visible here - confirm it is still needed.
concepts = [
    "financial-flows",
    "deforestation",
    "vulnerable-groups",
    "equity-and-just-transition",
    "barriers-and-challenges",
    "good-practice-and-opportunities",
]

# Label as stored in Argilla -> current concept name, for concepts that were renamed
# after the annotations were loaded into Argilla.
type_mapping = {
    "Good Practices and Opportunities": "Good Practice And Opportunities",
    "Barriers and Challenges": "Barriers And Challenges",
    "Equity And Justice": "Equity And Just Transition",
}


def map_type(old_type: str) -> str:
    """Return the current name for an Argilla label.

    Labels that were never renamed (i.e. are not keys of `type_mapping`) are
    returned unchanged.
    """
    return type_mapping.get(old_type, old_type)


def map_record_id(record, mapping=None):
    """Modify a TokenClassificationRecord in place to use new IDs.

    Args:
        record: object whose `metadata` dict has a "document_id" key holding the
            old document ID.
        mapping: optional dict of old document ID -> new document ID. When omitted,
            the notebook-level `ids_mapping` is used.

    Returns:
        The same record object, with `metadata["document_id"]` replaced by the new
        ID, or None (record left untouched) when the old ID is not in the mapping.

    Note:
        This is not idempotent: after a successful call the record carries the new
        ID, so a second call looks that new ID up as if it were an old one. Call it
        exactly once per record.
    """
    if mapping is None:
        mapping = ids_mapping

    new_id = mapping.get(record.metadata["document_id"])

    if new_id is None:
        # Unknown IDs are skipped silently; callers count successful mappings instead.
        return None

    record.metadata["document_id"] = new_id

    return record


# Rename annotation labels and remap document IDs for every loaded Argilla record.
# NOTE: this cell mutates the records in place, so it must only be run once per
# fresh `rg_dataset` load - re-running it would look up already-mapped IDs.
mapped_documents = 0

for document in rg_dataset:
    if document.annotation:
        # Only the label (first element) changes; start/end offsets are kept as-is.
        document.annotation = [
            (map_type(item[0]), item[1], item[2]) for item in document.annotation
        ]

    # # FIXME: we exclude annotations from Joe here as his labelling style was a bit different
    # if document.annotation_agent == "joe":
    #     document.metadata["document_id"] = "EXCLUDED"

    # map_record_id rewrites the ID in place, so it is called exactly once per record.
    # Calling it a second time would look up the *new* ID in the old->new mapping,
    # which either returns None or maps the record a second time.
    if map_record_id(document) is not None:
        mapped_documents += 1


print(f"{mapped_documents}/{len(rg_dataset)} documents successfully mapped ids")

759/785 documents successfully mapped ids


## Load CPR dataset and add span annotations to token classification records

In [5]:
import sys

sys.path.append("../..")

from pathlib import Path

from src.opensearch.index_data import get_dataset_and_filter_values

In [6]:
# Load the CPR dataset. The positional arguments are, in order: the documents directory
# read from the DOCS_DIR_GST environment variable, a master CSV, and the absolute form
# of the relative "../../concepts" directory.
# The second return value is discarded (presumably the filter values, going by the
# function name - confirm against src.opensearch.index_data).
# NOTE(review): the CSV path is an absolute path on one machine; `limit=None`
# presumably means "load everything" - confirm.
cpr_dataset, _ = get_dataset_and_filter_values(
    Path(os.environ["DOCS_DIR_GST"]),
    Path(
        "/Users/kalyan/Documents/CPR/unfccc-global-stocktake-documents/CPR_UNFCCC_MASTER.csv"
    ),
    Path("../../concepts").absolute(),
    limit=None,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1697/1697 [01:13<00:00, 23.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1679/1679 [02:00<00:00, 13.97it/s]
1558docs [00:57, 27.05docs/s]


In [7]:
# Build a lookup from "<document id>_<md5 of block text>" to the spans found in that block

SPAN_TYPES = {
    "Financial Flows",
    "Deforestation",
    "Vulnerable Groups",
    "Equity And Just Transition",
    "Barriers And Challenges",
    "Good Practice And Opportunities",
}

doc_id_text_hash_span_map = dict()

for doc in tqdm(cpr_dataset.documents):
    for block in doc.text_blocks or ():
        # Hashes aren't stored in Argilla and the text is modified when it is loaded
        # there, so the hash is recomputed here after applying the same transformation
        # (newlines to spaces, then one pass collapsing double spaces).
        text = block.to_string().replace("\n", " ").replace("  ", " ")
        text_hash = hashlib.md5(text.encode("utf-8")).hexdigest()

        for span in block.spans:
            # Keep only the part of the type before the first en dash
            base_type = span.type.split("–")[0].strip()
            if base_type not in SPAN_TYPES:
                continue

            # The span itself is updated so downstream code sees the base type
            span.type = base_type

            doc_span_id = f"{span.document_id}_{text_hash}"
            doc_id_text_hash_span_map.setdefault(doc_span_id, []).append(span)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1679/1679 [00:11<00:00, 143.36it/s]


In [8]:
# add predictions to each Argilla record

# Argilla can't handle overlapping spans, so besides attaching the full prediction list
# to each record we also keep, per span type, copies of the record that carry only that
# type's predictions.
prediction_records = defaultdict(list)

argilla_blocks_found_in_cpr_dataset = 0

for record in tqdm(rg_dataset):
    text_hash = hashlib.md5(record.text.encode("utf-8")).hexdigest()
    doc_span_id = f"{record.metadata['document_id']}_{text_hash}"

    if doc_span_id not in doc_id_text_hash_span_map:
        continue

    argilla_blocks_found_in_cpr_dataset += 1

    # Sorted by span type so that groupby below sees each type as one contiguous run
    predictions = sorted(
        (
            (span.type, span.start_idx, span.end_idx)
            for span in doc_id_text_hash_span_map[doc_span_id]
        ),
        key=lambda prediction: prediction[0],
    )

    # add all predictions to argilla record
    record.prediction = predictions

    # create temp argilla records per span type
    for span_type, span_predictions in itertools.groupby(
        predictions, lambda prediction: prediction[0]
    ):
        record_copy = record.copy()
        record_copy.prediction = list(span_predictions)
        prediction_records[span_type].append(record_copy)

print(f"Argilla blocks found in CPR dataset: {argilla_blocks_found_in_cpr_dataset}")
# Number of records with at least one prediction, per span type
support = {
    type_name: len(type_records) for type_name, type_records in prediction_records.items()
}
support

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 785/785 [00:00<00:00, 17705.76it/s]

Argilla blocks found in CPR dataset: 154





{'Good Practice And Opportunities': 38,
 'Barriers And Challenges': 39,
 'Deforestation': 39,
 'Equity And Just Transition': 21,
 'Financial Flows': 43,
 'Vulnerable Groups': 20}

## Use nervaluate for performance stats measurement

In [9]:
from nervaluate import Evaluator

In [10]:
def argilla_to_nervaluate(record):
    """Convert a list of Argilla span tuples into a list of span dicts.

    Args:
        record: despite the name, this is the list of spans of a record (its
            `annotation` or `prediction`), each a tuple whose first three elements
            are (label, start, end). `None` is treated as "no spans".

    Returns:
        A list of `{"label": ..., "start": ..., "end": ...}` dicts, one per span, in
        the same order as the input.
    """
    return [
        {"label": item[0], "start": item[1], "end": item[2]}
        for item in (record or [])
    ]


# One list of span dicts per record, in `rg_dataset` order, so that index i of
# `ground_truth` and `predictions` refer to the same record.
# Records without an annotation / prediction contribute an empty list; the guard is
# applied to both sides so an unannotated record cannot crash the conversion.
ground_truth = [
    argilla_to_nervaluate(record.annotation or []) for record in rg_dataset
]
predictions = [argilla_to_nervaluate(record.prediction or []) for record in rg_dataset]

In [11]:
# Score the predictions against the annotations with nervaluate, using the labels in
# SPAN_TYPES as the tag set.
# `results_per_tag` is keyed by span type and, per type, by evaluation scheme
# ("ent_type" and "strict" are read from it further down); `results` is presumably the
# same metrics aggregated over all types - confirm against the nervaluate docs.
evaluator = Evaluator(ground_truth, predictions, SPAN_TYPES)
results, results_per_tag = evaluator.evaluate()

### Calculate individual errors (per text block)

In [12]:
def overlaps(a, b) -> bool:
    """Whether spans a and b share at least one character.

    Args:
        a: span dict with integer "start" and "end" character offsets.
        b: span dict with integer "start" and "end" character offsets.

    Returns:
        True if the two spans overlap, regardless of argument order.

    The "end" offset is exclusive (span text is obtained with `text[start:end]`), so
    two spans that merely touch - one ending exactly where the other starts - do not
    share a character and are not counted as overlapping.
    """
    # Order the spans by start offset, then the pair overlaps iff the earlier one
    # ends strictly after the later one begins.
    first, second = (a, b) if a["start"] <= b["start"] else (b, a)
    return first["end"] > second["start"]


# One entry per text block, in the same order as evaluator.true / evaluator.pred.
# Each entry only contains the keys "incorrect" and/or "missed" when there is at least
# one such error, which later cells rely on via `"incorrect" in result`.
instance_level_results = []

for idx in range(len(evaluator.true)):
    # Both are lists of span dicts with the following structure
    # {'label': 'Good Practice And Opportunities', 'start': 16, 'end': 29}
    true_spans = evaluator.true[idx]
    pred_spans = evaluator.pred[idx]

    item_results = defaultdict(list)

    # incorrect: an annotated span overlapped by a prediction with a different label
    mislabelled = [
        {"true": truth, "pred": pred}
        for truth, pred in itertools.product(true_spans, pred_spans)
        if overlaps(truth, pred) and truth["label"] != pred["label"]
    ]
    if mislabelled:
        item_results["incorrect"] = mislabelled

    # missed: an annotated span with no overlapping prediction of the same label
    unmatched = [
        truth
        for truth in true_spans
        if all(
            not overlaps(truth, pred) or truth["label"] != pred["label"]
            for pred in pred_spans
        )
    ]
    if unmatched:
        item_results["missed"] = unmatched

    instance_level_results.append(item_results)

In [13]:
# incorrect results - misclassified.
# note sometimes these could be to do with the fact that we can't label overlapping spans in Argilla
# Each entry pairs the Argilla record with its list of {"true", "pred"} mismatches.
incorrect_results = []
for idx, result in enumerate(instance_level_results):
    if "incorrect" in result:
        incorrect_results.append((rg_dataset[idx], result["incorrect"]))

print(f"{len(incorrect_results)} text passages with 1 or more incorrect results \n")


def format_incorrect_spans(incorrect_spans) -> str:
    """Render mislabelled span pairs as a text list.

    Each item is a dict with "true" and "pred" keys; it is rendered as a
    "- true: ..." line followed by a "- predicted: ..." line, and consecutive
    items are separated by a blank line.
    """
    return "\n".join(
        f"- true: {item['true']}\n- predicted: {item['pred']}\n"
        for item in incorrect_spans
    )


# Print each affected passage followed by its mislabelled spans inside a fenced block
for rg_record, incorrect_spans in incorrect_results:
    spans_listing = format_incorrect_spans(incorrect_spans)

    print(rg_record.text)
    print("```\n" + spans_listing + "\n```")
    print()

12 text passages with 1 or more incorrect results 

We agree with the IPCC's Sixth Assessment report that "Any further delay in concerted anticipatory global action on adaptation and mitigation will miss a brief and rapidly closing window of opportunity to secure a liveable and sustainable future for all." Urgent change is required in how we produce our electricity, heat our homes and travel.
```
- true: {'label': 'Barriers And Challenges', 'start': 55, 'end': 253}
- predicted: {'label': 'Good Practice And Opportunities', 'start': 190, 'end': 201}

```

UNEP adaptation gap report suggested adaptation costs "in developing countries alone" could reach $140-300bn in 2030, which echoes language in the Paris Agreement urging a "balance between adaptation and mitigation" finance. Extreme weather events and unimaginable destructions throughout the world are clear proof there is more we should do other than mitigation and adaptation i.e., recognize and compensate for loss and damage. There hav

In [14]:
# missed results - annotated spans with no overlapping prediction of the same label
# One dict per text block that has at least one missed span.
missed_results = [
    {"text": rg_dataset[idx].text, "spans": result["missed"]}
    for (idx, result) in enumerate(instance_level_results)
    if "missed" in result
]

for result in missed_results:
    # label -> list of the missed span texts carrying that label in this block
    span_label_text_mapping = defaultdict(list)

    for span in result["spans"]:
        # end offset is used as an exclusive slice bound
        span_label_text_mapping[span["label"]].append(
            result["text"][span["start"] : span["end"]]
        )

    # span_text = [result['text'][span['start']:span['end']] for span in result["spans"]]
    # Multiple missed spans with the same label are joined with "|"
    result["span_text"] = {k: "|".join(v) for k, v in span_label_text_mapping.items()}


# NOTE(review): if nothing was missed, `missed_results` is empty and the frame has no
# "spans" column to drop - this presumably raises; confirm if that case matters.
missing_df = pd.DataFrame(missed_results).drop(columns={"spans"})
# Expand the per-block {label: text} dict into one column per label; labels with no
# missed span in a block are filled with an empty string.
missing_df = pd.concat(
    [missing_df, missing_df["span_text"].apply(pd.Series).fillna("")], axis=1
).drop(columns={"span_text"})

# NOTE(review): assumes the ./data directory already exists relative to the working directory.
missing_df.to_csv("./data/missing_explorer_annotations_post_bonn.csv", index=False)

### Calculate metrics per class

In [15]:
# We only want ent_type (any overlap between ground truth and prediction for an entity) or exact (exact boundaries and matching type) for each span type

# The loop variable is deliberately not called `results`: that name already holds the
# overall metrics returned by `evaluator.evaluate()` and must not be overwritten.
ner_results_ent_type = {
    span_type: tag_results["ent_type"]
    for span_type, tag_results in results_per_tag.items()
}
ner_results_strict = {
    span_type: tag_results["strict"]
    for span_type, tag_results in results_per_tag.items()
}

# `support` is the number of Argilla records with at least one prediction of each span
# type; it is aligned to the rows by span type name.
print("Results - any overlap between prediction and actual")
results_df_ent_type = pd.DataFrame(ner_results_ent_type).T
results_df_ent_type["support"] = pd.Series(support)
display(results_df_ent_type.round(2))

print()
print("Results, strict - exact boundary match between prediction and actual")
results_df_strict = pd.DataFrame(ner_results_strict).T
results_df_strict["support"] = pd.Series(support)
display(results_df_strict.round(2))

Results - any overlap between prediction and actual


Unnamed: 0,correct,incorrect,partial,missed,spurious,possible,actual,precision,recall,f1,support
Financial Flows,40.0,1.0,0.0,46.0,102.0,87.0,143.0,0.28,0.46,0.35,43
Deforestation,62.0,1.0,0.0,40.0,37.0,103.0,100.0,0.62,0.6,0.61,39
Barriers And Challenges,27.0,3.0,0.0,25.0,20.0,55.0,50.0,0.54,0.49,0.51,39
Equity And Just Transition,28.0,10.0,0.0,46.0,7.0,84.0,45.0,0.62,0.33,0.43,21
Good Practice And Opportunities,30.0,0.0,0.0,25.0,20.0,55.0,50.0,0.6,0.55,0.57,38
Vulnerable Groups,19.0,0.0,0.0,28.0,5.0,47.0,24.0,0.79,0.4,0.54,20



Results, strict - exact boundary match between prediction and actual


Unnamed: 0,correct,incorrect,partial,missed,spurious,possible,actual,precision,recall,f1,support
Financial Flows,8.0,33.0,0.0,46.0,102.0,87.0,143.0,0.06,0.09,0.07,43
Deforestation,26.0,37.0,0.0,40.0,37.0,103.0,100.0,0.26,0.25,0.26,39
Barriers And Challenges,12.0,18.0,0.0,25.0,20.0,55.0,50.0,0.24,0.22,0.23,39
Equity And Just Transition,14.0,24.0,0.0,46.0,7.0,84.0,45.0,0.31,0.17,0.22,21
Good Practice And Opportunities,20.0,10.0,0.0,25.0,20.0,55.0,50.0,0.4,0.36,0.38,38
Vulnerable Groups,9.0,10.0,0.0,28.0,5.0,47.0,24.0,0.38,0.19,0.25,20


### Print results to tables for methodology

`ent_type`: precision, recall, f1, support

In [16]:
# Print one single-row markdown table per concept, in alphabetical order of concept name
metric_names = ["precision", "recall", "f1", "support"]
rounded_metrics = ["Precision", "Recall", "F1"]

for concept, row in results_df_ent_type.sort_index().iterrows():
    # Title-case the metric names for the table header ("f1" -> "F1")
    concept_table = row[metric_names].rename(str.title)
    concept_table[rounded_metrics] = concept_table[rounded_metrics].round(2)

    print(concept)
    print(concept_table.to_frame().T.to_markdown(index=False))

Barriers And Challenges
|   Precision |   Recall |   F1 |   Support |
|------------:|---------:|-----:|----------:|
|        0.54 |     0.49 | 0.51 |        39 |
Deforestation
|   Precision |   Recall |   F1 |   Support |
|------------:|---------:|-----:|----------:|
|        0.62 |      0.6 | 0.61 |        39 |
Equity And Just Transition
|   Precision |   Recall |   F1 |   Support |
|------------:|---------:|-----:|----------:|
|        0.62 |     0.33 | 0.43 |        21 |
Financial Flows
|   Precision |   Recall |   F1 |   Support |
|------------:|---------:|-----:|----------:|
|        0.28 |     0.46 | 0.35 |        43 |
Good Practice And Opportunities
|   Precision |   Recall |   F1 |   Support |
|------------:|---------:|-----:|----------:|
|         0.6 |     0.55 | 0.57 |        38 |
Vulnerable Groups
|   Precision |   Recall |   F1 |   Support |
|------------:|---------:|-----:|----------:|
|        0.79 |      0.4 | 0.54 |        20 |
