In [1]:
from pathlib import Path
import re
import pandas as pd
import sys

In [2]:
# Root folders for the raw dataset and for generated outputs (kept as plain
# strings so any code that treats them as strings keeps working).
DATA_DIR = "./dset"
DERIVATIVES_DIR = "./derivatives"

# Annotation folders are derived from the roots above so the locations are
# defined in exactly one place; the resulting paths are "dset/annotations"
# and "derivatives/annotations".
ANNOT_DIR = Path(DATA_DIR) / "annotations"
OUT_DIR = Path(DERIVATIVES_DIR) / "annotations"

# Make sure the output folder exists before any cell tries to write into it.
OUT_DIR.mkdir(parents=True, exist_ok=True)


In [3]:
def find_annotation_files(annotation_dir: Path):
    """Group annotation CSVs found directly inside ``annotation_dir`` by run.

    Only regular files whose name looks like ``S<digits>E<digits>R<digits>_<XX>.csv``
    (``XX`` being exactly two ASCII letters) are considered; everything else
    is ignored.

    Returns a dict mapping the run prefix (the ``S..E..R..`` part) to a list of
    ``(two_letter_code, path)`` tuples, in sorted path order.
    """
    name_re = re.compile(r"^(S\d+E\d+R\d+)_([A-Za-z]{2})\.csv$")
    by_run = {}

    regular_files = sorted(entry for entry in annotation_dir.iterdir() if entry.is_file())
    for entry in regular_files:
        hit = name_re.match(entry.name)
        if hit is None:
            continue
        run_id, code = hit.groups()
        if run_id not in by_run:
            by_run[run_id] = []
        by_run[run_id].append((code, entry))

    return by_run

In [4]:
def detect_columns(df: pd.DataFrame):
    """Pick the index, valence and arousal column names out of ``df``.

    The index column is ``"index"`` when such a column exists, otherwise the
    first column of the frame. The valence / arousal columns are the first
    columns whose lower-cased name contains ``"valence"`` / ``"arousal"``.

    Returns ``(idx_col, val_col, aro_col)``; ``val_col`` and ``aro_col`` are
    ``None`` when no matching column exists.
    """
    idx_col = "index" if "index" in df.columns else df.columns[0]

    def _first_containing(token):
        # Case-insensitive substring match over every column name.
        hits = [name for name in df.columns if token in name.lower()]
        return hits[0] if hits else None

    val_col = _first_containing("valence")
    aro_col = _first_containing("arousal")
    return idx_col, val_col, aro_col

In [5]:
def combine_group(prefix, files, min_ratings=2):
    """Merge the per-annotator rating files of one run into a single wide table.

    Parameters
    ----------
    prefix : str
        Run identifier; only used in log messages.
    files : list of (annotator_code, Path)
        CSV files to combine. They are processed in sorted order, and the
        n-th usable file contributes the columns ``valence_n`` / ``arousal_n``.
    min_ratings : int, default 2
        A row is kept only if it has at least this many non-null valence
        ratings AND at least this many non-null arousal ratings.

    Returns
    -------
    None
        If fewer than two files were given, or fewer than two of them had
        detectable valence/arousal columns.
    (merged_filtered, renamed, removed_clips)
        ``merged_filtered`` is the merged frame (columns ``index``, all
        ``valence_*``, then all ``arousal_*``) without the rejected rows.
        ``renamed`` is a list of ``(annotator_code, valence_col, arousal_col)``
        for every file that was merged. ``removed_clips`` is a list of dicts
        (keys ``index``, ``valence_ratings``, ``arousal_ratings``, ``reason``)
        describing each rejected row.

    Notes
    -----
    Files are joined with an outer merge on ``index``; a file that repeats an
    index value will therefore produce repeated rows (not deduplicated here).
    """
    if len(files) < 2:
        print(f"Skipping {prefix}: need at least 2 files, found {len(files)}")
        return None

    dfs = []
    renamed = []

    for annot, path in sorted(files):
        df = pd.read_csv(path)
        idx_col, val_col, aro_col = detect_columns(df)
        if val_col is None or aro_col is None:
            print(f"Skipping {path.name}: could not find valence/arousal columns", file=sys.stderr)
            continue

        # Number columns by position among the *usable* files so that the
        # suffixes stay contiguous even when a file is skipped.
        slot = len(dfs) + 1
        val_name = f"valence_{slot}"
        aro_name = f"arousal_{slot}"

        out = pd.DataFrame()
        out["index"] = df[idx_col]
        out[val_name] = df[val_col].values
        out[aro_name] = df[aro_col].values
        dfs.append(out)
        renamed.append((annot, val_name, aro_name))

    if len(dfs) < 2:
        print(f"Skipping {prefix}: need at least 2 valid files after filtering, found {len(dfs)}")
        return None

    # Outer-join all annotators on the clip index.
    merged = dfs[0]
    for d in dfs[1:]:
        merged = pd.merge(merged, d, on="index", how="outer")

    # Sort numerically when the index is numeric; otherwise keep the order
    # produced by the merge. Only conversion failures are tolerated here.
    try:
        merged["index"] = pd.to_numeric(merged["index"])
    except (ValueError, TypeError):
        merged = merged.reset_index(drop=True)
    else:
        merged = merged.sort_values("index").reset_index(drop=True)

    # Column order: index, all valence_*, then all arousal_*. The names were
    # generated above in increasing numeric order, so no re-sorting is needed.
    val_cols = [val_name for _, val_name, _ in renamed]
    aro_cols = [aro_name for _, _, aro_name in renamed]
    merged = merged[["index"] + val_cols + aro_cols]

    # Count available ratings per row in one vectorized pass. (Iterating with
    # iterrows() would upcast each row to a single dtype, turning a numeric
    # clip index into a float in the removal records.)
    initial_rows = len(merged)
    val_counts = merged[val_cols].notna().sum(axis=1)
    aro_counts = merged[aro_cols].notna().sum(axis=1)
    keep_mask = (val_counts >= min_ratings) & (aro_counts >= min_ratings)

    merged_filtered = merged.loc[keep_mask].reset_index(drop=True)

    dropped = ~keep_mask
    removed_clips = [
        {
            'index': clip_index,
            'valence_ratings': n_val,
            'arousal_ratings': n_aro,
            'reason': f"Insufficient ratings (val: {n_val}, aro: {n_aro})"
        }
        for clip_index, n_val, n_aro in zip(
            merged.loc[dropped, "index"].tolist(),
            val_counts[dropped].tolist(),
            aro_counts[dropped].tolist(),
        )
    ]

    # Report removals (first five only, to keep the cell output short).
    if removed_clips:
        print(f"  Removed {len(removed_clips)}/{initial_rows} clips with insufficient ratings:")
        for clip in removed_clips[:5]:
            print(f"    - {clip['index']}: {clip['reason']}")
        if len(removed_clips) > 5:
            print(f"    ... and {len(removed_clips) - 5} more")

    return merged_filtered, renamed, removed_clips

In [6]:
# Combine the annotator files of every run, write one CSV per run into
# OUT_DIR, and document every clip dropped for having too few ratings.
groups = find_annotation_files(ANNOT_DIR)

summary = []       # one (prefix, files_combined, out_path, clips_removed) tuple per written run
all_removals = []  # removal records of all runs, each tagged with its run prefix

for prefix, files in groups.items():
    out = combine_group(prefix, files)
    if out is None:
        print(f"No valid files for {prefix}")
        continue

    merged, renamed, removed_clips = out

    # `renamed` has one entry per file that was actually merged, so it is the
    # accurate "files combined" figure; `files` may also contain files that
    # were skipped because no valence/arousal columns were found.
    n_combined = len(renamed)

    # Save the combined file
    out_path = OUT_DIR / f"{prefix}.csv"
    merged.to_csv(out_path, index=False)
    print(f"Wrote {out_path} ({n_combined} files -> {len(merged)} rows)")

    # Track removals for documentation, tagging each record with its run
    all_removals.extend({**clip, 'run': prefix} for clip in removed_clips)

    summary.append((prefix, n_combined, out_path, len(removed_clips)))

print("\nFinished. Combined groups:")
for run_id, n_files, written_path, n_removed in summary:
    print(f"  {run_id}: {n_files} files -> {written_path} ({n_removed} clips removed)")

# Save removal documentation
if all_removals:
    removal_df = pd.DataFrame(all_removals)
    removal_path = OUT_DIR / "removed_clips_log.csv"
    removal_df.to_csv(removal_path, index=False)
    print(f"\nRemoval log saved to: {removal_path}")
    print(f"Total clips removed across all runs: {len(all_removals)}")

    # Summary by reason (groupby keeps the reasons in sorted order)
    removal_summary = removal_df.groupby('reason').size().reset_index(name='count')
    print("\nRemoval reasons:")
    for reason, count in zip(removal_summary['reason'], removal_summary['count']):
        print(f"  {reason}: {count} clips")
else:
    print("\nNo clips were removed - all had sufficient ratings.")

Wrote derivatives/annotations/S01E01R01.csv (2 files -> 334 rows)
Wrote derivatives/annotations/S01E01R02.csv (2 files -> 276 rows)
Wrote derivatives/annotations/S01E01R03.csv (2 files -> 275 rows)
Skipping S01E01R04: need at least 2 files, found 1
No valid files for S01E01R04
Skipping S01E01R05: need at least 2 files, found 1
No valid files for S01E01R05
Skipping S01E01R06: need at least 2 files, found 1
No valid files for S01E01R06
  Removed 8/313 clips with insufficient ratings:
    - S01E02R01_clip0000: Insufficient ratings (val: 1, aro: 1)
    - S01E02R01_clip0178: Insufficient ratings (val: 1, aro: 1)
    - S01E02R01_clip0179: Insufficient ratings (val: 1, aro: 1)
    - S01E02R01_clip0180: Insufficient ratings (val: 1, aro: 1)
    - S01E02R01_clip0181: Insufficient ratings (val: 1, aro: 1)
    ... and 3 more
Wrote derivatives/annotations/S01E02R01.csv (2 files -> 305 rows)
Wrote derivatives/annotations/S01E02R02.csv (8 files -> 250 rows)
Wrote derivatives/annotations/S01E02R03.cs

In [8]:
# Build one CSV per episode by stacking that episode's per-run CSVs.
# Uses `summary` (one tuple per written run) from the run-level cell.
episode_groups = {}

# Bucket the run-level files by their episode prefix (S01E01R01 -> S01E01)
run_id_re = re.compile(r"(S\d+E\d+)R\d+")
for prefix, file_count, file_path, removed_count in summary:
    episode_match = run_id_re.match(prefix)
    if episode_match is None:
        continue
    episode = episode_match.group(1)
    if episode not in episode_groups:
        episode_groups[episode] = []
    episode_groups[episode].append((prefix, file_path))

print(f"Found {len(episode_groups)} episodes:")
for episode, runs in episode_groups.items():
    print(f"  {episode}: {len(runs)} runs")

# Write one stacked file per episode
episode_summary = []
for episode, runs in episode_groups.items():
    print(f"\nCombining runs for {episode}:")

    episode_dfs = []
    for run_prefix, run_path in sorted(runs):
        # Re-read the run file and tag every row with the run it came from
        run_df = pd.read_csv(run_path).assign(run=run_prefix)
        episode_dfs.append(run_df)
        print(f"  Added {run_prefix}: {len(run_df)} clips")

    if not episode_dfs:
        continue

    # Stack all runs; columns missing from a run are filled with NaN
    episode_combined = pd.concat(episode_dfs, ignore_index=True)

    # Column order: run, index, valence_*, arousal_*, then anything else
    leading = ['run', 'index']
    val_cols = [c for c in episode_combined.columns if c.startswith("valence_")]
    aro_cols = [c for c in episode_combined.columns if c.startswith("arousal_")]
    placed = leading + val_cols + aro_cols
    other_cols = [c for c in episode_combined.columns if c not in placed]
    episode_combined = episode_combined.loc[:, placed + other_cols]

    # Save episode-level file
    episode_path = OUT_DIR / f"{episode}.csv"
    episode_combined.to_csv(episode_path, index=False)

    total_clips = len(episode_combined)
    episode_summary.append((episode, len(runs), episode_path, total_clips))
    print(f"  Saved {episode_path}: {total_clips} total clips across {len(runs)} runs")

print("\nEpisode-level files created:")
for episode, run_count, file_path, clip_count in episode_summary:
    print(f"  {episode}: {run_count} runs, {clip_count} clips -> {file_path}")

Found 4 episodes:
  S01E01: 3 runs
  S01E02: 7 runs
  S01E03: 6 runs
  S01E04: 5 runs

Combining runs for S01E01:
  Added S01E01R01: 334 clips
  Added S01E01R02: 276 clips
  Added S01E01R03: 275 clips
  Saved derivatives/annotations/S01E01.csv: 885 total clips across 3 runs

Combining runs for S01E02:
  Added S01E02R01: 305 clips
  Added S01E02R02: 250 clips
  Added S01E02R03: 362 clips
  Added S01E02R04: 354 clips
  Added S01E02R05: 295 clips
  Added S01E02R06: 192 clips
  Added S01E02R07: 294 clips
  Saved derivatives/annotations/S01E02.csv: 2052 total clips across 7 runs

Combining runs for S01E03:
  Added S01E03R01: 272 clips
  Added S01E03R02: 309 clips
  Added S01E03R03: 317 clips
  Added S01E03R04: 359 clips
  Added S01E03R05: 268 clips
  Added S01E03R06: 415 clips
  Saved derivatives/annotations/S01E03.csv: 1940 total clips across 6 runs

Combining runs for S01E04:
  Added S01E04R01: 307 clips
  Added S01E04R02: 296 clips
  Added S01E04R03: 293 clips
  Added S01E04R04: 334 clip