In [27]:
# Quick verification of Episode 2 clip counts
import pandas as pd
from pathlib import Path

ANNOT_DIR = Path("dset/annotations")
s01e02_files = [f for f in ANNOT_DIR.iterdir() if f.name.startswith("S01E02") and f.name.endswith("_FC.csv")]
s01e02_files.sort()

print("Episode 2 actual clip counts:")
total = 0
for file_path in s01e02_files:
    df = pd.read_csv(file_path)
    run_num = int(file_path.name.split('R')[1].split('_')[0])
    
    # Check first and last clip indices
    first_clip = df.iloc[0]['index']
    last_clip = df.iloc[-1]['index']
    clip_count = len(df)
    
    print(f"Run {run_num}: {clip_count} clips ({first_clip} to {last_clip})")
    total += clip_count

print(f"Total: {total} clips")
print(f"Expected: 2079")
print(f"Matches expectation: {total == 2079}")

Episode 2 actual clip counts:
Run 1: 313 clips (S01E02R01_clip0000 to S01E02R01_clip0312)
Run 2: 250 clips (S01E02R02_clip0000 to S01E02R02_clip0249)
Run 3: 362 clips (S01E02R03_clip0000 to S01E02R03_clip0361)
Run 4: 354 clips (S01E02R04_clip0000 to S01E02R04_clip0353)
Run 5: 295 clips (S01E02R05_clip0000 to S01E02R05_clip0294)
Run 6: 218 clips (S01E02R06_clip0000 to S01E02R06_clip0217)
Run 7: 294 clips (S01E02R07_clip0000 to S01E02R07_clip0293)
Total: 2086 clips
Expected: 2079
Matches expectation: False


In [28]:
# Check for missing clip indices that might explain the discrepancy
print("Checking for gaps in clip sequences:")
for file_path in s01e02_files:
    df = pd.read_csv(file_path)
    run_num = int(file_path.name.split('R')[1].split('_')[0])
    
    # Extract clip numbers
    clip_numbers = []
    for index_val in df['index']:
        if '_clip' in str(index_val):
            clip_num = int(str(index_val).split('_clip')[-1])
            clip_numbers.append(clip_num)
    
    clip_numbers.sort()
    expected_range = list(range(0, len(clip_numbers)))
    
    print(f"Run {run_num}:")
    print(f"  Actual clip numbers: {clip_numbers[0]}-{clip_numbers[-1]} (count: {len(clip_numbers)})")
    print(f"  Expected range: 0-{len(clip_numbers)-1} (count: {len(expected_range)})")
    
    # Check for gaps
    missing = set(expected_range) - set(clip_numbers)
    extra = set(clip_numbers) - set(expected_range)
    
    if missing:
        print(f"  Missing clips: {sorted(missing)}")
    if extra:
        print(f"  Extra clips: {sorted(extra)}")
    if not missing and not extra and clip_numbers == expected_range:
        print(f"  ✅ Sequence is complete and correct")
    
    # If expecting 312 clips but have 313, maybe clip should end at 0311?
    if run_num == 1:
        print(f"  If run 1 should have 312 clips (0000-0311), then clip 0312 is extra")
        
print(f"\nIf each run should have one fewer clip:")
corrected_total = 0
for file_path in s01e02_files:
    df = pd.read_csv(file_path)
    corrected_count = len(df) - 1  # Remove one clip
    corrected_total += corrected_count

print(f"Corrected total: {corrected_total}")
print(f"Target: 2079")
print(f"Matches target: {corrected_total == 2079}")

Checking for gaps in clip sequences:
Run 1:
  Actual clip numbers: 0-312 (count: 313)
  Expected range: 0-312 (count: 313)
  ✅ Sequence is complete and correct
  If run 1 should have 312 clips (0000-0311), then clip 0312 is extra
Run 2:
  Actual clip numbers: 0-249 (count: 250)
  Expected range: 0-249 (count: 250)
  ✅ Sequence is complete and correct
Run 3:
  Actual clip numbers: 0-361 (count: 362)
  Expected range: 0-361 (count: 362)
  ✅ Sequence is complete and correct
Run 4:
  Actual clip numbers: 0-353 (count: 354)
  Expected range: 0-353 (count: 354)
  ✅ Sequence is complete and correct
Run 5:
  Actual clip numbers: 0-294 (count: 295)
  Expected range: 0-294 (count: 295)
  ✅ Sequence is complete and correct
Run 6:
  Actual clip numbers: 0-217 (count: 218)
  Expected range: 0-217 (count: 218)
  ✅ Sequence is complete and correct
Run 7:
  Actual clip numbers: 0-293 (count: 294)
  Expected range: 0-293 (count: 294)
  ✅ Sequence is complete and correct

If each run should have one few

In [29]:
from pathlib import Path
import re
import pandas as pd
import sys
import os.path as op

In [30]:

ANNOT_DIR = Path("dset/annotations")
OUT_DIR = Path("dset/derivatives/annotations")
OUT_DIR.mkdir(parents=True, exist_ok=True)


In [31]:
def find_annotation_files(annotation_dir: Path):
    pattern = re.compile(r"^(S\d+E\d+R\d+)_([A-Za-z]{2})\.csv$")
    groups = {}
    for p in sorted(annotation_dir.iterdir()):
        if not p.is_file():
            continue
        m = pattern.match(p.name)
        if not m:
            continue
        prefix = m.group(1)
        annot = m.group(2)
        groups.setdefault(prefix, []).append((annot, p))
    return groups

In [32]:
def detect_columns(df: pd.DataFrame):
    # detect index column
    if "index" in df.columns:
        idx_col = "index"
    else:
        # fallback: first column
        idx_col = df.columns[0]

    # detect valence and arousal columns (case-insensitive)
    val_cols = [c for c in df.columns if "valence" in c.lower()]
    aro_cols = [c for c in df.columns if "arousal" in c.lower()]

    val_col = val_cols[0] if val_cols else None
    aro_col = aro_cols[0] if aro_cols else None
    return idx_col, val_col, aro_col

In [33]:
def combine_group(prefix, files):
    # files: list of (annotator_code, Path)
    
    # Skip if less than 2 files
    if len(files) < 2:
        print(f"Skipping {prefix}: need at least 2 files, found {len(files)}")
        return None
    
    dfs = []
    renamed = []
    valid_count = 0  # Track valid files separately from loop index
    
    for annot, path in sorted(files):
        df = pd.read_csv(path)
        idx_col, val_col, aro_col = detect_columns(df)
        if val_col is None or aro_col is None:
            print(f"Skipping {path.name}: could not find valence/arousal columns", file=sys.stderr)
            continue

        # create minimal df using valid_count for consistent numbering
        valid_count += 1
        out = pd.DataFrame()
        out["index"] = df[idx_col]
        out[f"valence_{valid_count}"] = df[val_col].values
        out[f"arousal_{valid_count}"] = df[aro_col].values
        dfs.append(out)
        renamed.append((annot, f"valence_{valid_count}", f"arousal_{valid_count}"))

    if len(dfs) < 2:
        print(f"Skipping {prefix}: need at least 2 valid files after filtering, found {len(dfs)}")
        return None

    # merge on index
    merged = dfs[0]
    for d in dfs[1:]:
        merged = pd.merge(merged, d, on="index", how="outer")

    # sort by index if numeric
    try:
        merged["index"] = pd.to_numeric(merged["index"])
        merged = merged.sort_values("index").reset_index(drop=True)
    except Exception:
        merged = merged.reset_index(drop=True)

    # Reorder columns: index, all valence_*, then all arousal_*
    val_cols = [c for c in merged.columns if c.startswith("valence_")]
    aro_cols = [c for c in merged.columns if c.startswith("arousal_")]

    def _num_suffix(colname):
        parts = colname.split("_")
        try:
            return int(parts[-1])
        except Exception:
            return 0

    val_cols = sorted(val_cols, key=_num_suffix)
    aro_cols = sorted(aro_cols, key=_num_suffix)

    new_cols = ["index"] + val_cols + aro_cols
    # keep any unexpected columns at the end (shouldn't typically occur)
    tail = [c for c in merged.columns if c not in new_cols]
    merged = merged[new_cols + tail]

    # Remove rows with insufficient ratings and document removals
    initial_rows = len(merged)
    removed_clips = []
    
    # Extract episode and run info from prefix for sequential numbering
    episode_match = re.match(r"(S\d+E\d+)R(\d+)", prefix)
    episode_id = episode_match.group(1) if episode_match else "Unknown"
    run_number = int(episode_match.group(2)) if episode_match else 0
    
    # Check each row for sufficient ratings and valid scores
    rows_to_keep = []
    for i, row in merged.iterrows():
        # Extract clip number from index (assuming format like "S01E01R01_clip0001")
        try:
            if isinstance(row['index'], str) and '_clip' in row['index']:
                clip_num_str = row['index'].split('_clip')[-1]
                clip_number = int(clip_num_str)
            else:
                # Fallback: use row position
                clip_number = i + 1
        except (ValueError, IndexError):
            clip_number = i + 1
        
        # Count non-null valence ratings and check for scores > 7
        val_ratings = []
        val_invalid_scores = []
        for col in val_cols:
            if pd.notna(row[col]):
                try:
                    score = float(row[col])  # Convert to numeric
                    if score > 7:
                        val_invalid_scores.append(score)
                    else:
                        val_ratings.append(score)
                except (ValueError, TypeError):
                    # Skip non-numeric values
                    print(f"Warning: Non-numeric valence value {row[col]} in {col} for index {row['index']}")
                    continue
        
        # Count non-null arousal ratings and check for scores > 7
        aro_ratings = []
        aro_invalid_scores = []
        for col in aro_cols:
            if pd.notna(row[col]):
                try:
                    score = float(row[col])  # Convert to numeric
                    if score > 7:
                        aro_invalid_scores.append(score)
                    else:
                        aro_ratings.append(score)
                except (ValueError, TypeError):
                    # Skip non-numeric values
                    print(f"Warning: Non-numeric arousal value {row[col]} in {col} for index {row['index']}")
                    continue
        
        # Determine removal reason
        removal_reasons = []
        if len(val_ratings) < 2:
            removal_reasons.append(f"insufficient valence ratings ({len(val_ratings)})")
        if len(aro_ratings) < 2:
            removal_reasons.append(f"insufficient arousal ratings ({len(aro_ratings)})")
        if val_invalid_scores:
            removal_reasons.append(f"valence scores > 7: {val_invalid_scores}")
        if aro_invalid_scores:
            removal_reasons.append(f"arousal scores > 7: {aro_invalid_scores}")
        
        # Keep row only if it has at least 2 valid ratings for both valence and arousal
        if len(val_ratings) >= 2 and len(aro_ratings) >= 2 and not val_invalid_scores and not aro_invalid_scores:
            rows_to_keep.append(i)
        else:
            removed_clips.append({
                'episode': episode_id,
                'run': run_number,
                'run_prefix': prefix,
                'index': row['index'],
                'clip_number': clip_number,
                'valence_ratings': len(val_ratings),
                'arousal_ratings': len(aro_ratings),
                'valence_invalid_scores': len(val_invalid_scores),
                'arousal_invalid_scores': len(aro_invalid_scores),
                'reason': "; ".join(removal_reasons)
            })
    
    # Filter the dataframe
    if rows_to_keep:
        merged_filtered = merged.iloc[rows_to_keep].reset_index(drop=True)
    else:
        merged_filtered = pd.DataFrame(columns=merged.columns)
    
    final_rows = len(merged_filtered)
    
    # Report removals
    if removed_clips:
        print(f"  Removed {len(removed_clips)}/{initial_rows} clips with insufficient ratings:")
        for clip in removed_clips[:5]:  # Show first 5 removals
            print(f"    - {clip['index']}: {clip['reason']}")
        if len(removed_clips) > 5:
            print(f"    ... and {len(removed_clips) - 5} more")
    
    return merged_filtered, renamed, removed_clips

In [34]:
groups = find_annotation_files(ANNOT_DIR)

# Process all files but organize by episode instead of saving individual run files
episode_data = {}
all_removals = []

for prefix, files in groups.items():
    out = combine_group(prefix, files)
    if out is None:
        print(f"No valid files for {prefix}")
        continue
    
    merged, renamed, removed_clips = out
    
    # Extract episode from prefix (e.g., S01E01R01 -> S01E01)
    episode_match = re.match(r"(S\d+E\d+)R\d+", prefix)
    if episode_match:
        episode = episode_match.group(1)
        
        # Add run identifier column
        merged['run'] = prefix
        
        # Store in episode data structure
        if episode not in episode_data:
            episode_data[episode] = []
        episode_data[episode].append((prefix, merged, len(files), len(removed_clips)))
    
    # Track removals for documentation
    if removed_clips:
        all_removals.extend(removed_clips)
    
    print(f"Processed {prefix} ({len(files)} files -> {len(merged)} rows, {len(removed_clips) if removed_clips else 0} clips removed)")

# Create episode-level CSV files directly
print(f"\nCreating episode-level files for {len(episode_data)} episodes...")
episode_summary = []

for episode, run_data_list in episode_data.items():
    print(f"\nCombining runs for {episode}:")
    
    episode_dfs = []
    for run_prefix, run_df, file_count, removed_count in sorted(run_data_list):
        episode_dfs.append(run_df)
        print(f"  Added {run_prefix}: {len(run_df)} clips ({file_count} files, {removed_count} clips removed)")
    
    if episode_dfs:
        # Concatenate all runs for this episode
        episode_combined = pd.concat(episode_dfs, ignore_index=True)
        
        # Reorder columns to put run first, then index, then emotions
        val_cols = [c for c in episode_combined.columns if c.startswith("valence_")]
        aro_cols = [c for c in episode_combined.columns if c.startswith("arousal_")]
        other_cols = [c for c in episode_combined.columns if c not in ['run', 'index'] + val_cols + aro_cols]
        
        new_order = ['run', 'index'] + val_cols + aro_cols + other_cols
        episode_combined = episode_combined[new_order]
        
        # Save episode-level file
        episode_path = OUT_DIR / f"{episode}.csv"
        episode_combined.to_csv(episode_path, index=False)
        
        total_clips = len(episode_combined)
        episode_summary.append((episode, len(run_data_list), episode_path, total_clips))
        print(f"  Saved {episode_path}: {total_clips} total clips across {len(run_data_list)} runs")

print(f"\nEpisode-level files created:")
for episode, run_count, file_path, clip_count in episode_summary:
    print(f"  {episode}: {run_count} runs, {clip_count} clips -> {file_path}")

# Save removal documentation
if all_removals:
    removal_df = pd.DataFrame(all_removals)
    
    # Debug: Check what columns we actually have
    print(f"\nDEBUG: Removal dataframe columns: {list(removal_df.columns)}")
    print(f"DEBUG: First few rows of removal data:")
    print(removal_df.head())
    
    # Calculate sequential position within each episode
    removal_df_with_positions = []
    
    for episode in removal_df['episode'].unique():
        episode_removals = removal_df[removal_df['episode'] == episode].copy()
        
        # Build complete episode clip list from ORIGINAL annotation files
        # This ensures we capture ALL clips, including those that were removed
        episode_all_clips = []
        
        # Find all annotation groups for this episode and use FC files specifically
        episode_prefixes = [prefix for prefix in groups.keys() if prefix.startswith(episode)]
        
        for prefix in sorted(episode_prefixes):
            files = groups[prefix]
            # Use FC files specifically (most complete annotation set)
            fc_files = [f for f in files if f[1].name.endswith('_FC.csv')]
            
            if fc_files:
                fc_file = fc_files[0][1]  # Get the first FC file for this run
                df = pd.read_csv(fc_file)
                # Extract clip numbers and add to episode list
                run_clips = df['index'].tolist()
                episode_all_clips.extend(run_clips)
        
        # Create position mapping
        clip_to_position = {clip: pos for pos, clip in enumerate(sorted(episode_all_clips))}
        
        # Add positions to removals - use the correct column name
        if 'index' in episode_removals.columns:
            episode_removals['episode_position'] = episode_removals['index'].map(clip_to_position)
        else:
            print(f"WARNING: 'index' column not found in episode_removals for {episode}")
            print(f"Available columns: {list(episode_removals.columns)}")
            episode_removals['episode_position'] = None
        
        removal_df_with_positions.append(episode_removals)
    
    # Combine all episodes
    final_removal_df = pd.concat(removal_df_with_positions, ignore_index=True)
    
    # Save removal log
    removal_path = OUT_DIR / "removed_clips_log.csv"
    final_removal_df.to_csv(removal_path, index=False)
    print(f"\nSaved removal log: {removal_path}")
    
    # Print summary by episode
    print(f"\nRemovals by episode:")
    for episode in sorted(removal_df['episode'].unique()):
        episode_removals = removal_df[removal_df['episode'] == episode]
        episode_total = len([x for x in episode_all_clips if x.startswith(episode)])  # Rough estimate
        pct = (len(episode_removals) / episode_total * 100) if episode_total > 0 else 0
        print(f"  {episode}: {len(episode_removals)}/{episode_total} clips ({pct:.1f}%)")
        
    unmapped = final_removal_df[final_removal_df['episode_position'].isna()]
    if len(unmapped) > 0:
        print(f"\nWARNING: {len(unmapped)} clips could not be mapped to episode positions")
else:
    print("No clips were removed during processing.")

  Removed 36/334 clips with insufficient ratings:
    - S01E01R01_clip0029: insufficient arousal ratings (1); arousal scores > 7: [10.0]
    - S01E01R01_clip0030: insufficient arousal ratings (1); arousal scores > 7: [10.0]
    - S01E01R01_clip0031: insufficient arousal ratings (1); arousal scores > 7: [9.0]
    - S01E01R01_clip0032: insufficient arousal ratings (1); arousal scores > 7: [9.0]
    - S01E01R01_clip0033: insufficient arousal ratings (1); arousal scores > 7: [9.0]
    ... and 31 more
Processed S01E01R01 (2 files -> 298 rows, 36 clips removed)
Processed S01E01R02 (2 files -> 276 rows, 0 clips removed)
  Removed 48/275 clips with insufficient ratings:
    - S01E01R03_clip0147: insufficient arousal ratings (1); arousal scores > 7: [8.0]
    - S01E01R03_clip0148: insufficient arousal ratings (1); arousal scores > 7: [8.0]
    - S01E01R03_clip0149: insufficient arousal ratings (1); arousal scores > 7: [8.0]
    - S01E01R03_clip0150: insufficient arousal ratings (1); arousal sco

In [35]:
episode3_df = pd.read_csv(op.join(OUT_DIR, "S01E03.csv"))

In [36]:
# Display the enhanced removal log with episode positions
import os
import pandas as pd

if os.path.exists(OUT_DIR / "removed_clips_log.csv"):
    removal_log = pd.read_csv(OUT_DIR / "removed_clips_log.csv")
    
    print("="*80)
    print("ENHANCED REMOVAL LOG WITH EPISODE POSITIONS")
    print("="*80)
    print(f"Total removed clips: {len(removal_log)}")
    print(f"Columns: {list(removal_log.columns)}")
    
    print("\n" + "="*60)
    print("SAMPLE OF REMOVAL LOG (first 10 entries):")
    print("="*60)
    print(removal_log.head(10).to_string(index=False))
    
    print("\n" + "="*60)
    print("EPISODE POSITION EXAMPLES:")
    print("="*60)
    for episode in sorted(removal_log['episode'].unique())[:2]:  # Show first 2 episodes
        episode_data = removal_log[removal_log['episode'] == episode].head(5)
        print(f"\n{episode} (showing first 5 removals):")
        for _, row in episode_data.iterrows():
            pos = row['episode_position']
            pos_str = f"{pos:.0f}" if pd.notna(pos) else "?"
            print(f"  Position {pos_str:>3}: {row['index']} - {row['reason']}")
    
    print("\n" + "="*60)
    print("EPISODE SUMMARY:")
    print("="*60)
    episode_stats = removal_log.groupby('episode').size().reset_index(name='removed_count')
    
    # Calculate total clips per episode from the episode CSV files  
    for _, row in episode_stats.iterrows():
        episode_name = row['episode']
        removed_count = row['removed_count']
        
        # Get total clips from the episode file
        episode_file = OUT_DIR / f"{episode_name}.csv"
        if episode_file.exists():
            episode_df = pd.read_csv(episode_file)
            # Total clips = kept clips + removed clips
            kept_clips = len(episode_df)
            total_clips = kept_clips + removed_count
            percentage = (removed_count / total_clips * 100) if total_clips > 0 else 0
            print(f"{episode_name}: {removed_count:3d}/{total_clips:3d} removed ({percentage:5.1f}%)")
        else:
            print(f"{episode_name}: {removed_count:3d}/??? removed (episode file not found)")
    
else:
    print("No removal log found. Run the annotation processing first.")

ENHANCED REMOVAL LOG WITH EPISODE POSITIONS
Total removed clips: 208
Columns: ['episode', 'run', 'run_prefix', 'index', 'clip_number', 'valence_ratings', 'arousal_ratings', 'valence_invalid_scores', 'arousal_invalid_scores', 'reason', 'episode_position']

SAMPLE OF REMOVAL LOG (first 10 entries):
episode  run run_prefix              index  clip_number  valence_ratings  arousal_ratings  valence_invalid_scores  arousal_invalid_scores                                                       reason  episode_position
 S01E01    1  S01E01R01 S01E01R01_clip0029           29                2                1                       0                       1 insufficient arousal ratings (1); arousal scores > 7: [10.0]               NaN
 S01E01    1  S01E01R01 S01E01R01_clip0030           30                2                1                       0                       1 insufficient arousal ratings (1); arousal scores > 7: [10.0]               NaN
 S01E01    1  S01E01R01 S01E01R01_clip0031         