# Getting labelled terms from Argilla, to improve explorer inputs

Unfortunately, `explorer_metrics.ipynb` stopped working, so here we just extract a unique list of the labelled terms for each concept to cross-reference instead.



In [1]:
import os
from collections import defaultdict
from pathlib import Path

import argilla as rg
from dotenv import load_dotenv, find_dotenv
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables from the .env file located by find_dotenv();
# override=True lets values from the file replace ones already set.
load_dotenv(find_dotenv(), override=True)

# Blank English spaCy pipeline. NOTE(review): not referenced again in the
# visible cells -- confirm whether it is still needed.
nlp = spacy.blank("en")

# Concepts to export; each one maps to a single Argilla dataset name below.
concepts = ["climate-related-hazards", "renewables"]

DATASET_NAMES = [f"explorer-quality-testing-{concept}" for concept in concepts]

# The API key must be present in the environment (a KeyError is raised otherwise).
rg.init(workspace="gst", api_key=os.environ["ARGILLA_API_KEY"])

# rg_datasets: concept -> dataset loaded for that concept.
# all_records: the records of every loaded dataset, concatenated in concept order.
rg_datasets = dict()
all_records = list()

for concept, _dataset in zip(concepts, DATASET_NAMES):
    # The query presumably restricts the load to records whose status is
    # "Validated" -- confirm against the Argilla query syntax.
    rg_datasets[concept] = rg.load(_dataset, query="status:Validated")
    # NOTE(review): this reads the private `_records` attribute of the dataset.
    all_records.extend(rg_datasets[concept]._records)

# Combined dataset across all concepts; the trailing bare expression makes the
# notebook display the record count as the cell's output.
rg_dataset = rg.DatasetForTokenClassification(all_records)
len(rg_dataset)

374

In [3]:
def get_annotation_text(rg_dataset) -> dict:
    """
    Get unique annotations (not case-sensitive) for each label in the dataset.

    Each annotated span is sliced out of its record's text, lower-cased and
    stripped of any trailing ``.,;:-?`` characters before de-duplication.
    Records whose ``annotation`` is None or empty contribute nothing.

    Expects an iterable of records, each exposing ``text`` (a string) and
    ``annotation`` (an iterable of sequences whose first three items are
    label, start offset and end offset, or None).

    The returned dict is keyed by label; each value is a sorted list of the
    unique terms found for that label.
    """
    # Collect into sets so duplicates are dropped as they are seen.
    terms_by_label = defaultdict(set)

    for record in rg_dataset:
        record_text = record.text
        # `annotation` may be None for a record that was never annotated;
        # treat that as "no spans" instead of failing on iteration.
        for annotation in record.annotation or ():
            label, start, end = annotation[0], annotation[1], annotation[2]
            term = record_text[start:end].lower().rstrip(".,;:-?")
            terms_by_label[label].add(term)

    return {label: sorted(terms) for label, terms in terms_by_label.items()}


# Write each concept's labelled terms to its own text file, one term per line.
for concept in concepts:
    # The per-label result is indexed by the concept name, i.e. the label of
    # interest is expected to match the concept; a KeyError here means no
    # annotation with that label was found in the concept's dataset.
    terms_list = get_annotation_text(rg_datasets[concept])[concept]
    # Pin the encoding so the output does not depend on the platform's locale
    # default, which can fail on terms containing non-ASCII characters.
    Path(f"../../concepts/{concept}/terms_from_labelling_{20231125}.txt").write_text(
        "\n".join(terms_list), encoding="utf-8"
    )