# Loading a sample of data into Argilla 

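This notebook loads a sample of unlabelled text blocks into Argilla for an entity annotation task. It is intended for demo purposes only. It reads `ARGILLA_API_KEY` and `DOCS_DIR_GST` from the environment (a `.env` file is loaded if one is found).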

In [2]:
import sys

!{sys.executable} -m pip install argilla

Collecting argilla
  Downloading argilla-1.6.0-py3-none-any.whl (2.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting monotonic
  Using cached monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Collecting deprecated~=1.2.0
  Using cached Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting backoff
  Using cached backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting wrapt<1.15,>=1.13
  Using cached wrapt-1.14.1-cp39-cp39-macosx_10_9_x86_64.whl (35 kB)
Collecting rich<=13.0.1
  Downloading rich-13.0.1-py3-none-any.whl (238 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.1/238.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting commonmark<0.10.0,>=0.9.0
  Using cached commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
Installing collected packages: monotonic, commonmark, wrapt, rich

In [8]:
import os

from cpr_data_access.models import Dataset, BaseDocument
from dotenv import load_dotenv, find_dotenv
import argilla as rg
from tqdm.auto import tqdm
import spacy

# Load variables from a .env file (if one is found) into the process
# environment; later cells read their settings from os.environ.
load_dotenv(dotenv_path=find_dotenv())

# spaCy pipeline used further down to split each text block into tokens.
nlp = spacy.load(name="en_core_web_sm")

In [6]:
# Argilla handles user management per workspace, so the client is bound to
# the "gst" workspace. The API key is read straight from the environment and
# never stored in a notebook variable.
rg.init(api_key=os.environ["ARGILLA_API_KEY"], workspace="gst")

# Load only a small sample of documents (limit=10) from the local directory
# named by DOCS_DIR_GST.
empty_dataset = Dataset(document_model=BaseDocument)
dataset = empty_dataset.load_from_local(os.environ["DOCS_DIR_GST"], limit=10)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 24.71it/s]


In [9]:
# Build one Argilla token-classification record per text block.
# `records` is reset here so re-running the cell does not duplicate entries.
records = []

for document in tqdm(dataset.documents):
    if document.text_blocks is None:
        print(f"Skipping {document.document_id} as no text blocks")
        continue

    # Document-level fields (everything except the text blocks and page
    # metadata) are attached to every record created from this document.
    doc_metadata = document.dict(exclude={"text_blocks", "page_metadata"})

    for block in document.text_blocks:
        block_metadata = block.dict(exclude={"text"})
        block_text = block.to_string()

        # Only the token texts are needed, so run just the tokenizer via
        # `make_doc` rather than the full pipeline that `nlp(...)` would
        # execute for every block. The resulting tokens are the same.
        block_tokens = [tok.text for tok in nlp.make_doc(block_text)]

        records.append(
            rg.TokenClassificationRecord(
                text=block_text,
                tokens=block_tokens,
                # On a key clash the block-level value wins (right operand of `|`).
                metadata=doc_metadata | block_metadata,
                id=f"{block.text_block_id}_{document.document_id}",
            )
        )

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:44<00:00,  4.49s/it]


In [11]:
# Dataset-level metadata: the IDs of every document in the loaded sample.
dataset_metadata = dict(
    documents=[document.document_id for document in dataset.documents]
)

# Upload all records to the "demo_unlabelled" Argilla dataset. The call is
# the last expression in the cell, so its response is displayed as output.
rg.log(records, name="demo_unlabelled", metadata=dataset_metadata)

BulkResponse(dataset='demo_unlabelled', processed=5489, failed=0)