In [39]:
from pathlib import Path
import re
import pandas as pd
import sys

In [40]:
DATA_DIR = "./dset"
DERIVATIVES_DIR = "./derivatives"
ANNOT_DIR = Path("dset/annotations")
OUT_DIR = Path("derivatives/annotations")
OUT_DIR.mkdir(parents=True, exist_ok=True)


In [41]:
def find_annotation_files(annotation_dir: Path):
    pattern = re.compile(r"^(S\d+E\d+R\d+)_([A-Za-z]{2})\.csv$")
    groups = {}
    for p in sorted(annotation_dir.iterdir()):
        if not p.is_file():
            continue
        m = pattern.match(p.name)
        if not m:
            continue
        prefix = m.group(1)
        annot = m.group(2)
        groups.setdefault(prefix, []).append((annot, p))
    return groups

In [42]:
def detect_columns(df: pd.DataFrame):
    # detect index column
    if "index" in df.columns:
        idx_col = "index"
    else:
        # fallback: first column
        idx_col = df.columns[0]

    # detect valence and arousal columns (case-insensitive)
    val_cols = [c for c in df.columns if "valence" in c.lower()]
    aro_cols = [c for c in df.columns if "arousal" in c.lower()]

    val_col = val_cols[0] if val_cols else None
    aro_col = aro_cols[0] if aro_cols else None
    return idx_col, val_col, aro_col

In [43]:
def combine_group(prefix, files):
    # files: list of (annotator_code, Path)
    
    # Skip if less than 2 files
    if len(files) < 2:
        print(f"Skipping {prefix}: need at least 2 files, found {len(files)}")
        return None
    
    dfs = []
    renamed = []
    valid_count = 0  # Track valid files separately from loop index
    
    for annot, path in sorted(files):
        df = pd.read_csv(path)
        idx_col, val_col, aro_col = detect_columns(df)
        if val_col is None or aro_col is None:
            print(f"Skipping {path.name}: could not find valence/arousal columns", file=sys.stderr)
            continue

        # create minimal df using valid_count for consistent numbering
        valid_count += 1
        out = pd.DataFrame()
        out["index"] = df[idx_col]
        out[f"valence_{valid_count}"] = df[val_col].values
        out[f"arousal_{valid_count}"] = df[aro_col].values
        dfs.append(out)
        renamed.append((annot, f"valence_{valid_count}", f"arousal_{valid_count}"))

    if len(dfs) < 2:
        print(f"Skipping {prefix}: need at least 2 valid files after filtering, found {len(dfs)}")
        return None

    # merge on index
    merged = dfs[0]
    for d in dfs[1:]:
        merged = pd.merge(merged, d, on="index", how="outer")

    # sort by index if numeric
    try:
        merged["index"] = pd.to_numeric(merged["index"])
        merged = merged.sort_values("index").reset_index(drop=True)
    except Exception:
        merged = merged.reset_index(drop=True)

    # Reorder columns: index, all valence_*, then all arousal_*
    val_cols = [c for c in merged.columns if c.startswith("valence_")]
    aro_cols = [c for c in merged.columns if c.startswith("arousal_")]

    def _num_suffix(colname):
        parts = colname.split("_")
        try:
            return int(parts[-1])
        except Exception:
            return 0

    val_cols = sorted(val_cols, key=_num_suffix)
    aro_cols = sorted(aro_cols, key=_num_suffix)

    new_cols = ["index"] + val_cols + aro_cols
    # keep any unexpected columns at the end (shouldn't typically occur)
    tail = [c for c in merged.columns if c not in new_cols]
    merged = merged[new_cols + tail]

    return merged, renamed

In [44]:
groups = find_annotation_files(ANNOT_DIR)

summary = []
for prefix, files in groups.items():
    out = combine_group(prefix, files)
    if out is None:
        print(f"No valid files for {prefix}")
        continue
    merged, renamed = out
    out_path = OUT_DIR / f"{prefix}.csv"
    merged.to_csv(out_path, index=False)
    print(f"Wrote {out_path} ({len(files)} files -> {len(merged)} rows)")
    summary.append((prefix, len(files), out_path))

print("\nFinished. Combined groups:")
for s in summary:
    print(f"  {s[0]}: {s[1]} files -> {s[2]}")

Wrote derivatives/annotations/S01E01R01.csv (2 files -> 334 rows)
Wrote derivatives/annotations/S01E01R02.csv (2 files -> 276 rows)
Wrote derivatives/annotations/S01E01R03.csv (2 files -> 275 rows)
Skipping S01E01R04: need at least 2 files, found 1
No valid files for S01E01R04
Skipping S01E01R05: need at least 2 files, found 1
No valid files for S01E01R05
Skipping S01E01R06: need at least 2 files, found 1
No valid files for S01E01R06
Wrote derivatives/annotations/S01E02R01.csv (2 files -> 313 rows)
Wrote derivatives/annotations/S01E02R02.csv (8 files -> 250 rows)
Wrote derivatives/annotations/S01E02R03.csv (2 files -> 362 rows)
Wrote derivatives/annotations/S01E02R04.csv (2 files -> 354 rows)
Wrote derivatives/annotations/S01E02R05.csv (2 files -> 295 rows)
Wrote derivatives/annotations/S01E02R06.csv (2 files -> 218 rows)
Wrote derivatives/annotations/S01E02R07.csv (2 files -> 294 rows)
Wrote derivatives/annotations/S01E03R01.csv (2 files -> 272 rows)
Wrote derivatives/annotations/S01E