[![Run this Code In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/damieh1/ISCA_Instagram_Project/blob/main/parse_label_studio.ipynb)

"""
----------------------------------------------------------------------
| Script generates a pd.df based on the annotations from LabelStudio |
----------------------------------------------------------------------
Parse a Label Studio JSON export (for image classification project) into a tidy pandas DataFrame.

Works with the labeling config that includes the following controls:
- emotional_content (single choice)
- text_present (single choice)
- scene_types (multiple choice)
- support_for_terror (single choice)
- stance_target (multiple choice, optional)
- emotion_impact (rating, 1..7)
- dominant_emotion (single choice)
- transcribed_text (textarea)
- notes (textarea)

USAGE
-----
1. Import your data into Colab - Colab asks for direct file import
2. Run parsing functions
3. Generate dataframe
4. Backup CSV for the parse
5. Combine and match the parse with the raw data # no code written yet
-------
"""

In [None]:
# Importing the data
# Opens Colab's file-upload prompt. Per the cell output ("Saving ... to ..."),
# the selected export is also written to the runtime's working directory.
from google.colab import files
# `uploaded` is keyed by file name: the first key is used later as the path
# handed to the parser.
uploaded = files.upload()

Saving project-43-at-2025-08-21-10-02-06ae6b7c.json to project-43-at-2025-08-21-10-02-06ae6b7c.json


In [None]:
# Rename upload
# NOTE: the uploads were previously bound to the name `json`, which collides
# with the stdlib `json` module imported by the parsing cell. Re-running this
# cell after that import would replace the module with a dict and break
# `json.load`. A distinct name avoids the clash.
uploaded_files = uploaded

In [None]:
# Run Parsing functions
from __future__ import annotations
import argparse
import json
from typing import Any, Dict, List, Optional, Tuple, Iterable
import pandas as pd
from pathlib import Path

# Controls expected in config, grouped by how their results are parsed.
SINGLE_CHOICE_FIELDS = {"emotional_content", "text_present", "support_for_terror", "dominant_emotion"}
MULTI_CHOICE_FIELDS  = {"scene_types", "stance_target"}
RATING_FIELDS        = {"emotion_impact"}
TEXTAREA_FIELDS      = {"transcribed_text", "notes"}

# Sorted so the field order (and therefore the DataFrame column order) is
# stable: iterating a set of strings depends on hash randomisation and can
# differ from one interpreter run to the next.
ALL_FIELDS = sorted(SINGLE_CHOICE_FIELDS | MULTI_CHOICE_FIELDS | RATING_FIELDS | TEXTAREA_FIELDS)

def _safe_get(d: dict, *keys, default=None):
    cur = d
    for k in keys:
        if not isinstance(cur, dict) or k not in cur:
            return default
        cur = cur[k]
    return cur

def _result_to_values(res: dict) -> Tuple[str, Any]:
    field = res.get("from_name")
    rtype = res.get("type")
    value = res.get("value", {})

    if rtype == "choices":
        choices = value.get("choices", [])
        return field, list(choices)  # keep list even for single-choice
    elif rtype == "rating":
        return field, value.get("rating")
    elif rtype == "textarea":
        texts = value.get("text", [])
        joined = "\n".join([t for t in texts if isinstance(t, str)])
        return field, joined
    else:
        # Unknown/unsupported type — keep raw
        return field, value

def _annotation_to_record(task: dict, ann: dict) -> dict:
    """Flatten a single annotation of ``task`` into one flat record dict.

    The record holds task/annotation metadata followed by one key per entry of
    ``ALL_FIELDS`` (``None`` until a result fills it). Results for controls
    that are not in any of the known field groups are stored under their own
    ``from_name`` as-is.
    """
    completed_by = ann.get("completed_by")
    record = {
        "task_id": task.get("id"),
        "annotation_id": ann.get("id"),
        "project": task.get("project"),
        "image": _safe_get(task, "data", "image"),
        # `completed_by` may be a nested dict or a bare value; fall back to the
        # bare value when no nested id is found.
        "annotator_id": _safe_get(ann, "completed_by", "id") or completed_by,
        "annotator_username": (
            _safe_get(ann, "completed_by", "email")
            or _safe_get(ann, "completed_by", "username")
        ),
        "created_at": ann.get("created_at"),
        "updated_at": ann.get("updated_at"),
        "lead_time": ann.get("lead_time"),
    }
    # Defaults for all expected fields, appended after the metadata keys.
    record.update(dict.fromkeys(ALL_FIELDS))

    items = ann.get("result") or []
    # not yet tested: alternative "results" key
    if not items and "results" in ann:
        items = ann["results"]

    for item in items:
        name, parsed = _result_to_values(item)
        if not name:
            continue

        if name in MULTI_CHOICE_FIELDS:
            # Multi-choice values are always stored as a list.
            if parsed is None:
                parsed = []
            elif not isinstance(parsed, list):
                parsed = [parsed]
        elif name in SINGLE_CHOICE_FIELDS and isinstance(parsed, list):
            # Single-choice values arrive as a list; keep the first entry only.
            parsed = parsed[0] if parsed else None

        # Rating, textarea and unknown controls are stored unchanged.
        record[name] = parsed

    return record

def _iter_records(tasks: Iterable[dict]) -> Iterable[dict]:
    """Yield one flat record per annotation of every task.

    Annotations are read from ``"annotations"``, falling back to the legacy
    ``"completions"`` key. A task with no annotations still yields a single
    placeholder record that carries only the task metadata and ``None`` for
    every expected field.
    """
    for task in tasks:
        anns = task.get("annotations") or []
        # Some exports may use 'completions' (legacy)
        if not anns and "completions" in task:
            anns = task["completions"]

        if anns:
            for ann in anns:
                yield _annotation_to_record(task, ann)
            continue

        # No annotations at all: emit task metadata only.
        placeholder = {
            "task_id": task.get("id"),
            "annotation_id": None,
            "project": task.get("project"),
            "image": _safe_get(task, "data", "image"),
        }
        placeholder.update(dict.fromkeys(ALL_FIELDS))
        yield placeholder

def _one_hot_multi(df: pd.DataFrame, fields: List[str]) -> pd.DataFrame:
    df = df.copy()
    for f in fields:
        # Gather unique values
        unique_vals = sorted({v for lst in df[f].dropna().tolist() for v in (lst if isinstance(lst, list) else [])})
        for u in unique_vals:
            col = f"{f}__{u}".replace(" ", "_").replace("/", "_").replace("-", "_").replace("(", "").replace(")", "")
            df[col] = df[f].apply(lambda x: int(isinstance(x, list) and (u in x)))
    return df

def _explode_multi(df: pd.DataFrame, fields: List[str]) -> pd.DataFrame:
    df = df.copy()
    for f in fields:
        df[f] = df[f].apply(lambda v: v if isinstance(v, list) and v else [None])
        df = df.explode(f, ignore_index=True)
    return df

def parse_labelstudio_to_df(
    export_json_path: str | Path,
    explode_multi: bool = False,
    one_hot: bool = False
) -> pd.DataFrame:
    """Load a Label Studio JSON export and convert it to a tidy DataFrame.

    Args:
        export_json_path: Path of the JSON export (read as UTF-8).
        explode_multi: If true, emit one row per selected value of each
            multi-choice field.
        one_hot: If true, add 0/1 indicator columns for every value of each
            multi-choice field.

    Returns:
        One row per annotation (or one placeholder row per task without
        annotations). Multi-choice columns hold lists unless exploded.

    Raises:
        ValueError: If the export is a dict without a ``"tasks"`` entry.
    """
    with Path(export_json_path).open("r", encoding="utf-8") as f:
        data = json.load(f)

    # Exports can be dict {"tasks": [...]} or list of tasks
    tasks = data.get("tasks") if isinstance(data, dict) else data
    if tasks is None:
        raise ValueError("Could not find tasks in the export JSON. Expected a list or a dict with key 'tasks'.")

    df = pd.DataFrame.from_records(list(_iter_records(tasks)))

    # Normalize types
    if "emotion_impact" in df.columns:
        df["emotion_impact"] = pd.to_numeric(df["emotion_impact"], errors="coerce")

    # Only touch multi-choice columns that exist: an export with no tasks
    # yields a frame without columns, and indexing a missing one raises KeyError.
    multi_fields = [f for f in sorted(MULTI_CHOICE_FIELDS) if f in df.columns]

    # Ensure lists for multi fields (for empty annotations)
    for f in multi_fields:
        df[f] = df[f].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x]))

    # One-hot must run before exploding: it reads the list cells, and exploding
    # first would turn them into scalars so no indicator column would be built.
    if one_hot:
        df = _one_hot_multi(df, multi_fields)

    if explode_multi:
        df = _explode_multi(df, multi_fields)

    return df

In [None]:
# Generate pd.df
# Fail with a clear message when nothing was uploaded (presumably a cancelled
# upload dialog) instead of an IndexError from indexing an empty key list.
if not uploaded:
    raise ValueError("No file was uploaded - re-run the upload cell and select the Label Studio JSON export.")
# Only the first uploaded file is parsed; its name doubles as the path on disk.
file_name = next(iter(uploaded))
df = parse_labelstudio_to_df(file_name)
display(df.head())

Unnamed: 0,task_id,annotation_id,project,image,annotator_id,annotator_username,created_at,updated_at,lead_time,scene_types,emotion_impact,transcribed_text,text_present,emotional_content,dominant_emotion,stance_target,support_for_terror,notes
0,50745,4777,43,/data/upload/43/633a2ba7-00b293c5940bca885af4c...,1,,2025-08-21T13:59:31.570615Z,2025-08-21T13:59:31.570615Z,86.898,[Children present],2.0,,No text,No (neutral/informational),Empathy / Compassion,[Other / N/A],No,
1,50746,4778,43,/data/upload/43/9e1f3a74-0ae9e0896a5284d0-C4CB...,1,,2025-08-21T14:00:47.965448Z,2025-08-21T14:00:47.965448Z,54.783,[Other / not listed],3.0,,Text visible,Yes (clearly emotional),Anger,[Anti-Israel],No,
2,50747,4779,43,/data/upload/43/9c637d29-0-EB806.png,1,,2025-08-21T14:01:06.893513Z,2025-08-21T14:01:06.893513Z,17.033,[Other / not listed],,,No text,No (neutral/informational),Neutral,[Other / N/A],No,
3,50748,4780,43,/data/upload/43/30bfc2f9-0fa81a183df314c9443a1...,1,,2025-08-21T14:01:20.738830Z,2025-08-21T14:01:20.738830Z,12.425,[],2.0,,No text,Unclear,Fear,[Other / N/A],No,
4,50749,4781,43,/data/upload/43/7e06ea9a-1b7fae36776a21d746df2...,1,,2025-08-21T14:02:13.345014Z,2025-08-21T14:02:13.345014Z,50.606,"[Interview / talk show / speaker portrait, Oth...",,,No text,No (neutral/informational),Other,[Other / N/A],No,


In [None]:
# Write the parsed annotations to a CSV backup (row index omitted).
output_csv = "labelstudio_output.csv"
df.to_csv(output_csv, index=False)
print(f"DataFrame saved to {output_csv}")

DataFrame saved to labelstudio_output.csv
