In [None]:
from datetime import datetime
import hashlib
import os

from datasets import load_dataset
from dotenv import load_dotenv
import requests


load_dotenv()

True

In [130]:
from typing import Union, List


def get_label_studio_data(
    task_id: Union[int, List[int], None] = None,
    host: str = None,
    token: str = None,
    project_id: int = 6,
    timeout: float = 300.0,
) -> List[dict]:
    """
    Request annotated data from Label Studio via the project export API.

    The export can be narrowed down in two ways:
    by passing an integer `task_id` (only tasks whose id is strictly greater than it are returned),
    or by passing a list of task ids (only those tasks are requested).

    Parameters
    ----------
    task_id: Union[int, List[int], None] = None
        Either an integer task id (only tasks with a greater id are returned),
        a list of task ids (only those tasks are requested from the server),
        or None to export every task of the project.
    host: str = None
        Hostname for the Label Studio endpoint. If None, the value is read from the
        LABEL_STUDIO_HOST environment variable.
    token: str = None
        API token to connect to the Label Studio endpoint. If None, the value is read from the
        LABEL_STUDIO_TOKEN environment variable.
    project_id: int = 6
        Project id as defined in Label Studio.
    timeout: float = 300.0
        Number of seconds to wait for the server before giving up on the request.

    Returns
    -------
    List[dict]
        The decoded JSON export, one dict per task.

    Raises
    ------
    AssertionError
        If `token` or `host` is neither passed as an argument nor set in the environment.
    requests.HTTPError
        If the server answers with an error status code.
    """
    if token is None:
        token = os.getenv("LABEL_STUDIO_TOKEN")
    if host is None:
        host = os.getenv("LABEL_STUDIO_HOST")

    # The messages are single strings (implicit concatenation) so the error reads as text
    # rather than as the repr of a tuple.
    assert token is not None, (
        "`token` needs to be set as function argument or as the environment variable `LABEL_STUDIO_TOKEN`. "
        "In case both are set, the function argument will override the environment variable."
    )
    assert host is not None, (
        "`host` needs to be set as function argument or as the environment variable `LABEL_STUDIO_HOST`. "
        "In case both are set, the function argument will override the environment variable."
    )

    # The export endpoint selects tasks through repeated `ids[]=<id>` query parameters.
    # A list of tuples is used so that the key can be repeated once per requested id.
    params = []
    if isinstance(task_id, list):
        params.extend(("ids[]", _task_id) for _task_id in task_id)
    params.append(("exportType", "JSON"))

    url = f"https://{host}/api/projects/{project_id}/export"
    headers = {"Authorization": f"Token {token}"}
    response = requests.get(url=url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    tasks = response.json()

    # The query parameters only express an explicit list of ids, so the "every task after
    # `task_id`" case is applied on the client side once the export has been downloaded.
    if isinstance(task_id, int) and not isinstance(task_id, bool):
        tasks = [task for task in tasks if task["id"] > task_id]
    return tasks


In [None]:
# Export the annotated tasks of the default project; no host or token is passed,
# so both are read from the LABEL_STUDIO_HOST / LABEL_STUDIO_TOKEN environment variables.
data = get_label_studio_data()

In [112]:
# Number of records returned by the export.
len(data)

666

In [None]:

def get_week_number(record):
    """
    Compute the ISO week number of a record's start timestamp.

    Parameters
    ----------
    record: dict
        A record whose `record["data"]["item"]["start"]` entry is a timestamp string
        formatted as `YYYY-MM-DDTHH:MM:SS`, optionally followed by a UTC offset.

    Returns
    -------
    dict
        A single-entry dict `{"week_number": <ISO week of the start timestamp>}`.
    """
    start = record["data"]["item"]["start"]
    # "YYYY-MM-DDTHH:MM:SS" is exactly 19 characters long; anything longer is
    # parsed with a trailing UTC offset.
    pattern = "%Y-%m-%dT%H:%M:%S%z" if len(start) > 19 else "%Y-%m-%dT%H:%M:%S"
    parsed_start = datetime.strptime(start, pattern)
    return {"week_number": parsed_start.isocalendar().week}


# Quick check of the helper on the first exported record.
get_week_number(data[0])

{'week_number': 2}

In [114]:
def get_annotations_from_record(record: dict) -> dict:
    """
    Extract the annotation results of a Label Studio record.

    Only the first annotation session that was not cancelled is used; later
    sessions are ignored. When the record has no such session, the same set of
    keys is returned with empty values, so every record yields the same schema.

    Parameters
    ----------
    record: dict
        A Label Studio record with an `annotations` list. Each session has a
        `was_cancelled` flag and a `result` list of entries identified by `from_name`.

    Returns
    -------
    dict
        - `misinformation`: True when the `choice` result contains "Correct", else False.
        - `cards_claims`: list of `{"text", "labels"}` dicts from `cards` results.
        - `misinformation_claims`: list of `{"text", "labels"}` dicts from `misinformation` results.
        - `comments`: list of strings, one per `comments` result (its text lines joined by newlines).
    """
    annotations = {
        "misinformation": False,
        "cards_claims": [],
        "misinformation_claims": [],
        "comments": [],
    }

    session = next(
        (
            candidate
            for candidate in record["annotations"]
            if not candidate["was_cancelled"]
        ),
        None,
    )
    if session is None:
        return annotations

    for annotation_record in session["result"]:
        source = annotation_record["from_name"]
        # `value` is only read inside the matching branches so that results with an
        # unrelated `from_name` are skipped without being inspected.
        if source == "choice":
            annotations["misinformation"] = (
                "Correct" in annotation_record["value"]["choices"]
            )
        elif source == "cards":
            annotations["cards_claims"].append(
                {
                    "text": annotation_record["value"]["text"],
                    "labels": annotation_record["value"]["labels"],
                }
            )
        elif source == "misinformation":
            annotations["misinformation_claims"].append(
                {
                    "text": annotation_record["value"]["text"],
                    "labels": annotation_record["value"]["labels"],
                }
            )
        elif source == "comments":
            annotations["comments"].append(
                "\n".join(annotation_record["value"]["text"])
            )
    return annotations

In [115]:
# Quick check of the annotation extraction on the first exported record.
get_annotations_from_record(data[0])

{'misinformation': False,
 'cards_claims': [],
 'misinformation_claims': [],
 'comments': ['Gros faux-positif']}

In [116]:
def process_record(record):
    """
    Flatten a Label Studio record into a single dict.

    The result holds the record id under `label_studio_id`, followed by every
    field of `record["data"]["item"]`, the week number and the extracted
    annotations. On key collisions the later source overrides the earlier one.
    """
    return {
        "label_studio_id": record["id"],
        **record["data"]["item"],
        **get_week_number(record=record),
        **get_annotations_from_record(record=record),
    }


# Quick check of the full record flattening on the first exported record.
process_record(data[0])

{'label_studio_id': 1810,
 'id': '382b3d21738c3d0e33d751bd3a74953d648f93490b37554641484c77c611729a',
 'day': 10,
 'year': 2025,
 'month': 1,
 'start': '2025-01-10T06:06:00+00:00',
 'channel': 'itele',
 'plaintext': "pour l' instant on ne sait pas vraiment si c' est un un incendiaire qui s' est confirmée en fait que ce soit un incendiaire ou pas voilà l' enquête dira si ce sont des incendies volontaires mais il y a des y a des suspicions c' est ce que vous nous dites merci beaucoup ramzy marzouki joe biden de x et le changement climatique m qui explique ces incendie michel chevalet c' est le changement climatique ces incendies un je vous rappelle tout de même pas ils sont dix fois où il y en a conscience que j' ai un sécheresse de vent violent trois maquis airs maquis secs qui et aide maquis les troupes qui aide sujets les donc taupes dû être celle que vous allez voir c' est un cocktail d' état alors le changement climatique le réchauffement climatique oubliez les un degré cinq en calif

In [117]:
# Split the records into train / test by week, so that all records of a given
# (year, week) pair land in the same split: roughly one week out of four goes to test.
#
# The bucket is derived from a SHA-256 digest rather than the built-in `hash()`:
# `hash()` of a string is salted per interpreter process, so it would assign weeks
# to different splits after every kernel restart and make the split irreproducible.
parsed_data = []
parsed_data_test = []
for record in data:
    parsed_record = process_record(record)
    week_key = f"{parsed_record['year']}-W{parsed_record['week_number']:02d}"
    week_digest = hashlib.sha256(week_key.encode("utf-8")).digest()
    # The first 8 bytes of the digest are plenty to pick one of four buckets.
    if int.from_bytes(week_digest[:8], "big") % 4:
        parsed_data.append(parsed_record)
    else:
        parsed_data_test.append(parsed_record)
print(len(parsed_data_test), len(parsed_data))

137 529


In [118]:
from datasets import Dataset, DatasetDict

In [None]:
# Wrap the two lists of parsed records into a DatasetDict with "train" and "test" splits.
dataset = DatasetDict(
    {
        split_name: Dataset.from_list(split_records)
        for split_name, split_records in (
            ("train", parsed_data),
            ("test", parsed_data_test),
        )
    }
)

In [None]:
# Display the splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label_studio_id', 'id', 'day', 'year', 'month', 'start', 'channel', 'plaintext', 'model_name', 'channel_name', 'model_reason', 'model_result', 'channel_title', 'url_mediatree', 'channel_program', 'plaintext_whisper', 'channel_program_type', 'week_number', 'misinformation', 'cards_claims', 'misinformation_claims', 'comments'],
        num_rows: 529
    })
    test: Dataset({
        features: ['label_studio_id', 'id', 'day', 'year', 'month', 'start', 'channel', 'plaintext', 'model_name', 'channel_name', 'model_reason', 'model_result', 'channel_title', 'url_mediatree', 'channel_program', 'plaintext_whisper', 'channel_program_type', 'week_number', 'misinformation', 'cards_claims', 'misinformation_claims', 'comments'],
        num_rows: 137
    })
})

In [129]:
# TODO: Add a section that loads the dataset already published on the Hub,
# appends only the records that are not yet present in it,
# and pushes the merged result instead of overwriting the existing data.

In [None]:
# Publish both splits to the `DataForGood/climateguard` dataset repository (private=False).
# NOTE(review): `token=True` presumably picks up the locally stored Hugging Face token — confirm.
dataset.push_to_hub("DataForGood/climateguard", private=False, token=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DataForGood/climateguard/commit/4904c03d4283a89a69fa998ae6d97a970bb1574f', commit_message='Upload dataset', commit_description='', oid='4904c03d4283a89a69fa998ae6d97a970bb1574f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DataForGood/climateguard', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DataForGood/climateguard'), pr_revision=None, pr_num=None)