In [1]:
from datetime import datetime
import hashlib
import os

from datasets import load_dataset
from dotenv import load_dotenv
import requests


load_dotenv()

True

In [2]:
from typing import Union, List


def get_label_studio_data(
    task_id: Union[int, List[int], None] = None,
    host: Union[str, None] = None,
    token: Union[str, None] = None,
    project_id: int = 6,
    timeout: Union[float, None] = None,
) -> List[dict]:
    """
    Export the annotated tasks of a Label Studio project through its REST API.

    The export can be restricted in two ways:
    either by passing a single integer `task_id` (only tasks whose id is strictly
    greater than it are returned), or by passing a list of task ids (only those
    tasks are requested from the server).

    Parameters
    ----------
    task_id: Union[int, List[int], None] = None
        Either an integer task id (only tasks with a greater id are returned),
        a list of task ids (only those tasks are exported),
        or None to export every task of the project.
    host: str = None
        Hostname for the Label Studio endpoint. If None, the value is read from the
        LABEL_STUDIO_HOST environment variable.
    token: str = None
        API token to connect to the Label Studio endpoint. If None, the value is read
        from the LABEL_STUDIO_TOKEN environment variable.
    project_id: int = 6
        Project id as defined in Label Studio.
    timeout: float = None
        Timeout in seconds forwarded to `requests.get`. None (the default) waits
        indefinitely.

    Returns
    -------
    List[dict]
        The decoded JSON export, one dict per task.

    Raises
    ------
    AssertionError
        If `token` or `host` is given neither as argument nor as environment variable.
    requests.HTTPError
        If the server answers with an error status.
    """
    if token is None:
        token = os.getenv("LABEL_STUDIO_TOKEN")
    if host is None:
        host = os.getenv("LABEL_STUDIO_HOST")

    # Adjacent string literals (no comma) so the message is one string, not a tuple.
    assert token is not None, (
        "`token` needs to be set as function argument or as the environment variable `LABEL_STUDIO_TOKEN`. "
        "In case both are set, the function argument will override the environment variable."
    )
    assert host is not None, (
        "`host` needs to be set as function argument or as the environment variable `LABEL_STUDIO_HOST`. "
        "In case both are set, the function argument will override the environment variable."
    )

    # The export endpoint selects tasks with repeated `ids[]=<id>` parameters.
    params = []
    if isinstance(task_id, list):
        params.extend(("ids[]", _task_id) for _task_id in task_id)
    params.append(("exportType", "JSON"))

    url = f"https://{host}/api/projects/{project_id}/export"
    headers = {"Authorization": f"Token {token}"}
    response = requests.get(url=url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    records = response.json()

    # The endpoint has no "greater than" filter, so the integer case is applied
    # on the client after the full export has been downloaded.
    if isinstance(task_id, int):
        records = [record for record in records if record["id"] > task_id]
    return records


In [3]:
# Export every task of the default project; with no arguments the host and token
# are read from the LABEL_STUDIO_HOST / LABEL_STUDIO_TOKEN environment variables.
data = get_label_studio_data()

In [4]:
len(data)

741

In [5]:

def get_week_number(record):
    """
    Compute the ISO calendar week of a record's start timestamp.

    The timestamp is read from `record["data"]["item"]["start"]`. Values longer
    than 19 characters are parsed with a trailing UTC offset (`%z`); shorter
    values are parsed as naive `YYYY-MM-DDTHH:MM:SS`.

    Returns a dict with the single key `week_number`.
    """
    base_format = "%Y-%m-%dT%H:%M:%S"
    start = record["data"]["item"]["start"]
    # 19 characters is exactly the length of a timestamp without an offset.
    timestamp_format = base_format + "%z" if len(start) > 19 else base_format
    parsed_start = datetime.strptime(start, timestamp_format)
    return {"week_number": parsed_start.isocalendar().week}


# Sanity check: week number computed for the first exported record.
get_week_number(data[0])

{'week_number': 40}

In [6]:
def get_annotations_from_record(record: dict) -> dict:
    """
    Extract the annotation payload of one exported Label Studio task.

    Only the first annotation session that was not cancelled is read; later
    sessions are ignored. When every session was cancelled (or there is none),
    an empty payload with the same keys is returned so that all records share
    one schema.

    Parameters
    ----------
    record: dict
        One task of the export; must contain an `annotations` list whose items
        have `was_cancelled` and `result` entries.

    Returns
    -------
    dict
        `misinformation` (bool, True when the `choice` result contains "Correct"),
        `cards_claims` and `misinformation_claims` (lists of `{"text", "labels"}`
        dicts), and `comments` (list of strings, one per `comments` result).
    """
    misinformation_bool = False
    cards_claims = []
    misinformation_claims = []
    comments = []

    first_valid_session = next(
        (
            annotation_session
            for annotation_session in record["annotations"]
            if not annotation_session["was_cancelled"]
        ),
        None,
    )
    results = [] if first_valid_session is None else first_valid_session["result"]

    for annotation_record in results:
        # `.get` so that result entries without a `from_name` are skipped
        # instead of raising KeyError.
        from_name = annotation_record.get("from_name")
        if from_name == "choice":
            misinformation_bool = "Correct" in annotation_record["value"]["choices"]
        elif from_name == "cards":
            cards_claims.append(
                {
                    "text": annotation_record["value"]["text"],
                    "labels": annotation_record["value"]["labels"],
                }
            )
        elif from_name == "misinformation":
            misinformation_claims.append(
                {
                    "text": annotation_record["value"]["text"],
                    "labels": annotation_record["value"]["labels"],
                }
            )
        elif from_name == "comments":
            # The comment text is a list of strings; store it as one string.
            comments.append("\n".join(annotation_record["value"]["text"]))

    return {
        "misinformation": misinformation_bool,
        "cards_claims": cards_claims,
        "misinformation_claims": misinformation_claims,
        "comments": comments,
    }

In [7]:
# Sanity check: annotation payload extracted from the first exported record.
get_annotations_from_record(data[0])

{'misinformation': True,
 'cards_claims': [{'text': '', 'labels': ['2. Humans are not the cause']}],
 'misinformation_claims': [],
 'comments': []}

In [8]:
def process_record(record):
    """
    Flatten one exported Label Studio task into a single dataset row.

    The row starts with the task id (as `label_studio_id`), followed by every
    field of `record["data"]["item"]`, the computed `week_number`, and the
    extracted annotation fields. On a key clash the later source wins.
    """
    return {
        "label_studio_id": record["id"],
        **record["data"]["item"],
        **get_week_number(record=record),
        **get_annotations_from_record(record=record),
    }


# Sanity check: flattened row built from the first exported record.
process_record(data[0])

{'label_studio_id': 1604,
 'id': '712ded15b00734e052d1e2f5dd23a73a2e6d4ff2cbd31703d5efe29730a5eeee',
 'day': 2,
 'year': 2023,
 'month': 10,
 'start': '2023-10-02T16:46:00',
 'channel': 'itele',
 'plaintext': "des anomalies l'europe centrale la france mais aussi au niveau global les plus jeunes un chien les hausses les anomalies et les matches à trois degrés de plus cette année il y a aussi une qui ligne a aussi qui un accélère maintenant ces prochains mois c'est juste un petit à petit les prévisions saisonnières pour le mois d'octobre ma vie ce chantier vingt-six plus chaudes jamais cessé surtout la france aussi l'italie l'allemagne norvège plouguenast accès c'est d'accord on va juste prendre de réaction de nos invités puissent vous demanderez peut-être si on est là vraiment dans les conséquences du réchauffement climatique tels qu'on le le phage gèrent des conséquences de de nos actions sur le réchauffement climatique tels qu'on les imagine on les on envisage les ivan envisage rioufo

In [9]:
# Train/test split grouped by (week_number, year): every record of a given week
# lands in the same split, and roughly one group in four goes to the test set.
parsed_data = []
parsed_data_test = []
for record in data:
    parsed_record = process_record(record)
    split_key = str(parsed_record["week_number"]) + str(parsed_record["year"])
    # Built-in hash() on str is salted per interpreter process, which would
    # reshuffle the split on every kernel restart; a SHA-256 digest is stable.
    split_bucket = (
        int.from_bytes(hashlib.sha256(split_key.encode("utf-8")).digest()[:8], "big")
        % 4
    )
    if split_bucket:
        parsed_data.append(parsed_record)
    else:
        parsed_data_test.append(parsed_record)
print(len(parsed_data_test), len(parsed_data))

173 568


In [10]:
from datasets import Dataset, DatasetDict

In [11]:
# Wrap the two record lists into a DatasetDict with a "train" and a "test" split.
dataset = DatasetDict(
    {
        split_name: Dataset.from_list(split_rows)
        for split_name, split_rows in (
            ("train", parsed_data),
            ("test", parsed_data_test),
        )
    }
)

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label_studio_id', 'id', 'day', 'year', 'month', 'start', 'channel', 'plaintext', 'model_name', 'channel_name', 'model_reason', 'model_result', 'channel_title', 'url_mediatree', 'channel_program', 'plaintext_whisper', 'channel_program_type', 'week_number', 'misinformation', 'cards_claims', 'misinformation_claims', 'comments'],
        num_rows: 568
    })
    test: Dataset({
        features: ['label_studio_id', 'id', 'day', 'year', 'month', 'start', 'channel', 'plaintext', 'model_name', 'channel_name', 'model_reason', 'model_result', 'channel_title', 'url_mediatree', 'channel_program', 'plaintext_whisper', 'channel_program_type', 'week_number', 'misinformation', 'cards_claims', 'misinformation_claims', 'comments'],
        num_rows: 173
    })
})

In [13]:
# TODO: Add section where you get the old dataset and the data contained in it, 
# append the data that is not already present in the dataset 
# before pushing to hub

In [14]:
# Upload both splits to the "DataForGood/climateguard" dataset repository as a
# non-private dataset.
# NOTE(review): token=True presumably reuses the locally saved Hugging Face
# login — confirm credentials are configured before running this cell.
dataset.push_to_hub("DataForGood/climateguard", private=False, token=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DataForGood/climateguard/commit/c81c932a5236b348971f5ec5d48c2939ff058a8d', commit_message='Upload dataset', commit_description='', oid='c81c932a5236b348971f5ec5d48c2939ff058a8d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DataForGood/climateguard', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DataForGood/climateguard'), pr_revision=None, pr_num=None)