# Loader for concept annotation task

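This notebook samples text blocks from the loaded documents and logs them to Argilla for annotation. The task is token classification: tokens are labelled with one or more of the following categories: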

* Financial Flows
* Deforestation
* Vulnerable Groups
* Equity And Justice
* Challenges And Opportunities


In [2]:
import sys

!{sys.executable} -m pip install argilla

Collecting argilla
  Downloading argilla-1.6.0-py3-none-any.whl (2.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting monotonic
  Using cached monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Collecting deprecated~=1.2.0
  Using cached Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting backoff
  Using cached backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting wrapt<1.15,>=1.13
  Using cached wrapt-1.14.1-cp39-cp39-macosx_10_9_x86_64.whl (35 kB)
Collecting rich<=13.0.1
  Downloading rich-13.0.1-py3-none-any.whl (238 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.1/238.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting commonmark<0.10.0,>=0.9.0
  Using cached commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
Installing collected packages: monotonic, commonmark, wrapt, rich

In [1]:
import os
import random

from cpr_data_access.models import Dataset, BaseDocument
from dotenv import load_dotenv, find_dotenv
import argilla as rg
from tqdm.auto import tqdm
import spacy

# Locate the nearest .env file and load it into the process environment, so
# the variables read later in this notebook (ARGILLA_API_KEY, DOCS_DIR_GST)
# can be supplied from there.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# spaCy pipeline used further down to split each text block into tokens.
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the Argilla dataset that receives the sampled text blocks.
# User management is done at a workspace level, so nothing user-specific is
# configured here.
DATASET_NAME = "explorer-quality-testing"

# Upper bound on the number of text blocks sampled from each document.
TEXT_BLOCKS_PER_DOCUMENT = 2

# Seed the global RNG so that "Restart & Run All" picks the same text blocks
# (and the same shuffle order) every time. Without it each run samples
# different blocks and logs them under new record ids.
# NOTE(review): this is only reproducible if the documents are loaded in the
# same order on each run - confirm for the loader in use.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Labels offered to annotators for the token classification task.
CONCEPT_LABELS = [
    "Financial Flows",
    "Deforestation",
    "Vulnerable Groups",
    "Equity And Justice",
    "Challenges And Opportunities",
]
settings = rg.TokenClassificationSettings(label_schema=CONCEPT_LABELS)

In [3]:
# Connect to the "gst" workspace; the API key comes from the environment so
# that it never appears in the notebook itself.
rg.init(
    workspace="gst",
    api_key=os.environ["ARGILLA_API_KEY"],
)

# Attach the label schema to the target dataset before any records are logged.
rg.configure_dataset(
    name=DATASET_NAME,
    settings=settings,
)

In [4]:
# Directory containing the documents, taken from the environment.
docs_dir = os.environ["DOCS_DIR_GST"]

# NOTE(review): limit=None presumably means "load every document found" -
# confirm against the cpr_data_access documentation.
loader = Dataset(document_model=BaseDocument)
dataset = loader.load_from_local(docs_dir, limit=None)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1142/1142 [01:08<00:00, 16.74it/s]


In [5]:
# One Argilla record per sampled text block, accumulated across all documents.
records = []

for document in tqdm(dataset.documents):
    if document.text_blocks is None:
        print(f"Skipping {document.document_id} as no text blocks")
        continue

    # Document-level fields copied onto every record; the bulky per-block and
    # per-page fields are left out.
    doc_metadata = document.dict(exclude={"text_blocks", "page_metadata"})

    # Randomly sample a fixed number of text blocks per document; documents
    # with no more blocks than the quota contribute all of them.
    if len(document.text_blocks) <= TEXT_BLOCKS_PER_DOCUMENT:
        blocks = document.text_blocks
    else:
        blocks = random.sample(document.text_blocks, TEXT_BLOCKS_PER_DOCUMENT)

    for block in blocks:
        block_metadata = block.dict(exclude={"text"})

        # Collapse every run of whitespace (newlines included) to one space and
        # trim the ends. A single `.replace("  ", " ")` pass leaves double
        # spaces behind whenever three or more spaces occur in a row.
        block_text = " ".join(block.to_string().split())

        # A blank block yields no tokens, so there is nothing to annotate.
        if not block_text:
            continue

        records.append(
            rg.TokenClassificationRecord(
                text=block_text,
                # Only token boundaries are needed, so run the tokenizer alone
                # instead of the full tagging/parsing/NER pipeline.
                tokens=[tok.text for tok in nlp.make_doc(block_text)],
                # On a key clash the block-level value wins.
                metadata=doc_metadata | block_metadata,
                id=f"{block.text_block_id}_{document.document_id}",
            )
        )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1142/1142 [00:16<00:00, 69.00it/s]


In [6]:
# Records were appended document by document; shuffle them in place so they
# are logged in random order.
random.shuffle(records)

# Dataset-level metadata: the id of every loaded document, including any that
# contributed no records.
document_ids = [doc.document_id for doc in dataset.documents]
dataset_metadata = {"documents": document_ids}

# Kept as the cell's last expression so the response summary is displayed.
rg.log(records, name=DATASET_NAME, metadata=dataset_metadata)

BulkResponse(dataset='explorer-quality-testing', processed=1916, failed=0)