In [10]:
# Quick verification of Episode 2 clip counts
import pandas as pd
from pathlib import Path

ANNOT_DIR = Path("dset/annotations")

# Keep only the FC annotation CSVs that belong to S01E02, ordered by path.
s01e02_files = sorted(
    candidate
    for candidate in ANNOT_DIR.iterdir()
    if candidate.name.startswith("S01E02") and candidate.name.endswith("_FC.csv")
)

print("Episode 2 actual clip counts:")
run_clip_counts = []
for file_path in s01e02_files:
    df = pd.read_csv(file_path)

    # "S01E02R03_FC.csv" -> text after the first "R", up to the next "_" -> 3
    run_token = file_path.name.split('R')[1]
    run_num = int(run_token.split('_')[0])

    # The labels of the first and last rows bracket the run's clip range
    first_clip, last_clip = (df.iloc[pos]['index'] for pos in (0, -1))
    clip_count = len(df)
    run_clip_counts.append(clip_count)

    print(f"Run {run_num}: {clip_count} clips ({first_clip} to {last_clip})")

# Episode-wide clip count, compared below against the expected figure
total = sum(run_clip_counts)

print(f"Total: {total} clips")
print(f"Expected: 2079")
print(f"Matches expectation: {total == 2079}")

Episode 2 actual clip counts:
Run 1: 313 clips (S01E02R01_clip0000 to S01E02R01_clip0312)
Run 2: 250 clips (S01E02R02_clip0000 to S01E02R02_clip0249)
Run 3: 362 clips (S01E02R03_clip0000 to S01E02R03_clip0361)
Run 4: 354 clips (S01E02R04_clip0000 to S01E02R04_clip0353)
Run 5: 295 clips (S01E02R05_clip0000 to S01E02R05_clip0294)
Run 6: 218 clips (S01E02R06_clip0000 to S01E02R06_clip0217)
Run 7: 294 clips (S01E02R07_clip0000 to S01E02R07_clip0293)
Total: 2086 clips
Expected: 2079
Matches expectation: False


In [11]:
# Look for gaps in each run's clip numbering that might explain the discrepancy
print("Checking for gaps in clip sequences:")
for file_path in s01e02_files:
    df = pd.read_csv(file_path)
    run_num = int(file_path.name.split('R')[1].split('_')[0])

    # Integer suffix of every "..._clipNNNN" label, in ascending order
    clip_numbers = sorted(
        int(label.split('_clip')[-1])
        for label in map(str, df['index'])
        if '_clip' in label
    )
    clip_total = len(clip_numbers)
    # A complete run is numbered 0 .. clip_total - 1 with no holes
    expected_range = list(range(clip_total))

    print(f"Run {run_num}:")
    print(f"  Actual clip numbers: {clip_numbers[0]}-{clip_numbers[-1]} (count: {clip_total})")
    print(f"  Expected range: 0-{clip_total - 1} (count: {len(expected_range)})")

    # Compare the observed numbers with the gap-free range in both directions
    observed_set = set(clip_numbers)
    expected_set = set(expected_range)
    missing = expected_set - observed_set
    extra = observed_set - expected_set

    if missing:
        print(f"  Missing clips: {sorted(missing)}")
    if extra:
        print(f"  Extra clips: {sorted(extra)}")
    if not (missing or extra) and clip_numbers == expected_range:
        print(f"  ✅ Sequence is complete and correct")

    # Hypothesis: run 1 was expected to stop at clip 0311 (312 clips), not 0312
    if run_num == 1:
        print(f"  If run 1 should have 312 clips (0000-0311), then clip 0312 is extra")

# Hypothesis: every run carries exactly one clip more than expected
print(f"\nIf each run should have one fewer clip:")
corrected_total = sum(len(pd.read_csv(run_file)) - 1 for run_file in s01e02_files)

print(f"Corrected total: {corrected_total}")
print(f"Target: 2079")
print(f"Matches target: {corrected_total == 2079}")

Checking for gaps in clip sequences:
Run 1:
  Actual clip numbers: 0-312 (count: 313)
  Expected range: 0-312 (count: 313)
  ✅ Sequence is complete and correct
  If run 1 should have 312 clips (0000-0311), then clip 0312 is extra
Run 2:
  Actual clip numbers: 0-249 (count: 250)
  Expected range: 0-249 (count: 250)
  ✅ Sequence is complete and correct
Run 3:
  Actual clip numbers: 0-361 (count: 362)
  Expected range: 0-361 (count: 362)
  ✅ Sequence is complete and correct
Run 4:
  Actual clip numbers: 0-353 (count: 354)
  Expected range: 0-353 (count: 354)
  ✅ Sequence is complete and correct
Run 5:
  Actual clip numbers: 0-294 (count: 295)
  Expected range: 0-294 (count: 295)
  ✅ Sequence is complete and correct
Run 6:
  Actual clip numbers: 0-217 (count: 218)
  Expected range: 0-217 (count: 218)
  ✅ Sequence is complete and correct
Run 7:
  Actual clip numbers: 0-293 (count: 294)
  Expected range: 0-293 (count: 294)
  ✅ Sequence is complete and correct

If each run should have one few

In [1]:
from pathlib import Path
import re
import pandas as pd
import sys
import os.path as op

In [3]:

# Folder holding the raw per-annotator CSVs, and the derivatives folder that
# receives every combined file written by the cells below.
ANNOT_DIR = Path("dset", "annotations")
OUT_DIR = ANNOT_DIR.parent / "derivatives" / "annotations"
# Create the output folder (and any missing parents); no error if it exists.
OUT_DIR.mkdir(exist_ok=True, parents=True)


In [4]:
def find_annotation_files(annotation_dir: Path):
    """Group the annotation CSVs found in ``annotation_dir`` by run prefix.

    Only regular files whose name looks like ``<S#E#R#>_<two letters>.csv``
    are considered; everything else in the directory is ignored.

    Returns:
        dict mapping each run prefix (e.g. ``"S01E02R03"``) to a list of
        ``(annotator_code, path)`` tuples, in sorted directory order.
    """
    name_re = re.compile(r"^(S\d+E\d+R\d+)_([A-Za-z]{2})\.csv$")
    by_prefix = {}
    for entry in sorted(annotation_dir.iterdir()):
        # Directories and other non-files never reach the name check
        matched = name_re.match(entry.name) if entry.is_file() else None
        if matched is None:
            continue
        run_prefix, annotator = matched.groups()
        by_prefix.setdefault(run_prefix, []).append((annotator, entry))
    return by_prefix

In [5]:
def detect_columns(df: pd.DataFrame):
    """Pick the clip-label, valence and arousal columns of an annotation frame.

    Returns:
        ``(idx_col, val_col, aro_col)``. ``idx_col`` is ``"index"`` when that
        column exists, otherwise the first column. ``val_col`` / ``aro_col``
        are the first columns whose lower-cased name contains ``"valence"`` /
        ``"arousal"``, or ``None`` when there is no such column.
    """
    idx_col = "index" if "index" in df.columns else df.columns[0]

    val_col = None
    aro_col = None
    for name in df.columns:
        lowered = name.lower()  # matching is case-insensitive
        if val_col is None and "valence" in lowered:
            val_col = name
        if aro_col is None and "arousal" in lowered:
            aro_col = name

    return idx_col, val_col, aro_col

In [6]:
def _clip_number_from_index(index_value, fallback):
    """Return the integer that follows ``_clip`` in ``index_value``.

    ``fallback`` is returned when ``index_value`` is not a string, contains no
    ``_clip`` marker, or the text after the marker is not an integer.
    """
    if not (isinstance(index_value, str) and '_clip' in index_value):
        return fallback
    try:
        return int(index_value.split('_clip')[-1])
    except (ValueError, IndexError):
        return fallback


def _split_ratings(row, columns, label, max_score):
    """Partition one row's ratings into ``(valid, too_high)`` lists of floats.

    Null cells are ignored. Cells that cannot be converted to ``float`` are
    reported with a warning and ignored. ``label`` ("valence" / "arousal") is
    only used in that warning text.
    """
    valid, too_high = [], []
    for col in columns:
        value = row[col]
        if pd.isna(value):
            continue
        try:
            score = float(value)
        except (ValueError, TypeError):
            print(f"Warning: Non-numeric {label} value {value} in {col} for index {row['index']}")
            continue
        if score > max_score:
            too_high.append(score)
        else:
            valid.append(score)
    return valid, too_high


def combine_group(prefix, files, min_ratings=2, max_score=7):
    """Merge the annotator CSVs of one run and drop insufficiently rated clips.

    Each usable file contributes a ``valence_<n>`` / ``arousal_<n>`` column
    pair (``n`` counts usable files in sorted order, starting at 1). The
    per-file frames are outer-merged on the clip label column ``index``.
    A clip is kept only when it has at least ``min_ratings`` valid valence
    ratings, at least ``min_ratings`` valid arousal ratings, and no rating
    above ``max_score``.

    Args:
        prefix: Run prefix such as ``"S01E01R01"``; the episode id and run
            number recorded for removed clips are parsed from it.
        files: List of ``(annotator_code, path)`` tuples for this run.
        min_ratings: Minimum number of valid ratings per dimension for a clip
            to be kept. A run also needs at least this many usable files
            (never fewer than one). Defaults to 2.
        max_score: Highest rating accepted as valid. Defaults to 7.

    Returns:
        ``None`` when the run has too few (usable) files; otherwise a tuple
        ``(merged_filtered, renamed, removed_clips)`` where ``merged_filtered``
        is the filtered DataFrame, ``renamed`` is a list of
        ``(annotator_code, valence_column, arousal_column)`` tuples and
        ``removed_clips`` is a list of dicts describing every dropped clip.
    """
    # A clip can never collect min_ratings ratings from fewer files than that;
    # at least one frame is needed to have anything to merge.
    required_files = max(min_ratings, 1)

    if len(files) < required_files:
        print(f"Skipping {prefix}: need at least {required_files} files, found {len(files)}")
        return None

    dfs = []
    renamed = []
    for annot, path in sorted(files):
        df = pd.read_csv(path)
        idx_col, val_col, aro_col = detect_columns(df)
        if val_col is None or aro_col is None:
            print(f"Skipping {path.name}: could not find valence/arousal columns", file=sys.stderr)
            continue

        # Number columns by usable-file count so skipped files leave no gaps
        slot = len(dfs) + 1
        val_name = f"valence_{slot}"
        aro_name = f"arousal_{slot}"
        out = pd.DataFrame({"index": df[idx_col]})
        out[val_name] = df[val_col].values
        out[aro_name] = df[aro_col].values
        dfs.append(out)
        renamed.append((annot, val_name, aro_name))

    if len(dfs) < required_files:
        print(f"Skipping {prefix}: need at least {required_files} valid files after filtering, found {len(dfs)}")
        return None

    # Outer merge keeps clips that only some annotators rated
    merged = dfs[0]
    for d in dfs[1:]:
        merged = pd.merge(merged, d, on="index", how="outer")

    # Sort by index when it is numeric; otherwise keep the merge order
    try:
        merged["index"] = pd.to_numeric(merged["index"])
        merged = merged.sort_values("index").reset_index(drop=True)
    except Exception:
        merged = merged.reset_index(drop=True)

    def _num_suffix(colname):
        # "valence_3" -> 3; anything without an integer suffix sorts as 0
        try:
            return int(colname.split("_")[-1])
        except Exception:
            return 0

    # Column order: index, all valence_*, all arousal_*, then anything else
    val_cols = sorted((c for c in merged.columns if c.startswith("valence_")), key=_num_suffix)
    aro_cols = sorted((c for c in merged.columns if c.startswith("arousal_")), key=_num_suffix)
    new_cols = ["index"] + val_cols + aro_cols
    tail = [c for c in merged.columns if c not in new_cols]
    merged = merged[new_cols + tail]

    initial_rows = len(merged)

    # Episode id and run number recorded with every removed clip
    episode_match = re.match(r"(S\d+E\d+)R(\d+)", prefix)
    episode_id = episode_match.group(1) if episode_match else "Unknown"
    run_number = int(episode_match.group(2)) if episode_match else 0

    rows_to_keep = []
    removed_clips = []
    # The index was reset above, so iterrows labels are also row positions
    for i, row in merged.iterrows():
        clip_number = _clip_number_from_index(row['index'], fallback=i + 1)

        val_ratings, val_invalid_scores = _split_ratings(row, val_cols, "valence", max_score)
        aro_ratings, aro_invalid_scores = _split_ratings(row, aro_cols, "arousal", max_score)

        removal_reasons = []
        if len(val_ratings) < min_ratings:
            removal_reasons.append(f"insufficient valence ratings ({len(val_ratings)})")
        if len(aro_ratings) < min_ratings:
            removal_reasons.append(f"insufficient arousal ratings ({len(aro_ratings)})")
        if val_invalid_scores:
            removal_reasons.append(f"valence scores > {max_score}: {val_invalid_scores}")
        if aro_invalid_scores:
            removal_reasons.append(f"arousal scores > {max_score}: {aro_invalid_scores}")

        # A row with no removal reason satisfies every keep condition
        if not removal_reasons:
            rows_to_keep.append(i)
            continue

        removed_clips.append({
            'episode': episode_id,
            'run': run_number,
            'run_prefix': prefix,
            'index': row['index'],
            'clip_number': clip_number,
            'valence_ratings': len(val_ratings),
            'arousal_ratings': len(aro_ratings),
            'valence_invalid_scores': len(val_invalid_scores),
            'arousal_invalid_scores': len(aro_invalid_scores),
            'reason': "; ".join(removal_reasons)
        })

    if rows_to_keep:
        merged_filtered = merged.iloc[rows_to_keep].reset_index(drop=True)
    else:
        # Nothing survived: return an empty frame with the same columns
        merged_filtered = pd.DataFrame(columns=merged.columns)

    # Report removals (first five only, to keep the cell output short)
    if removed_clips:
        print(f"  Removed {len(removed_clips)}/{initial_rows} clips with insufficient ratings:")
        for clip in removed_clips[:5]:
            print(f"    - {clip['index']}: {clip['reason']}")
        if len(removed_clips) > 5:
            print(f"    ... and {len(removed_clips) - 5} more")

    return merged_filtered, renamed, removed_clips

In [None]:
def _read_run_clips(run_prefix, csv_path):
    """Return one record per clip row found in the annotation file ``csv_path``.

    Each record is a dict with the keys ``run_prefix``, ``run_num``, ``index``
    (the clip label) and ``clip_number``. ``clip_number`` is the integer that
    follows ``_clip`` in the clip label, or the row label when the clip label
    has no usable ``_clip`` suffix.
    """
    original_df = pd.read_csv(csv_path)
    idx_col, _val_col, _aro_col = detect_columns(original_df)

    # Run number is used to order runs within the episode
    run_match = re.search(r'R(\d+)', run_prefix)
    run_num = int(run_match.group(1)) if run_match else 0

    records = []
    for row_label, clip_index in original_df[idx_col].items():
        clip_number = row_label
        if isinstance(clip_index, str) and '_clip' in clip_index:
            try:
                clip_number = int(clip_index.split('_clip')[-1])
            except ValueError:
                pass  # malformed suffix: keep the row label as the clip number
        records.append({
            'run_prefix': run_prefix,
            'run_num': run_num,
            'index': clip_index,
            'clip_number': clip_number
        })
    return records


groups = find_annotation_files(ANNOT_DIR)

# summary rows: (run prefix, number of input files, output path, clips removed)
summary = []
all_removals = []

for prefix, files in groups.items():
    out = combine_group(prefix, files)
    if out is None:
        print(f"No valid files for {prefix}")
        continue

    merged, renamed, removed_clips = out

    # Save the combined per-run file
    out_path = OUT_DIR / f"{prefix}.csv"
    merged.to_csv(out_path, index=False)
    print(f"Wrote {out_path} ({len(files)} files -> {len(merged)} rows)")

    # Track removals for documentation
    all_removals.extend(removed_clips)

    summary.append((prefix, len(files), out_path, len(removed_clips)))

print("\nFinished. Combined groups:")
for s in summary:
    print(f"  {s[0]}: {s[1]} files -> {s[2]} ({s[3]} clips removed)")

# Save removal documentation
if all_removals:
    removal_df = pd.DataFrame(all_removals)

    # Removal records per episode, each annotated with its sequential position
    removal_df_with_positions = []

    for episode in removal_df['episode'].unique():
        episode_removals = removal_df[removal_df['episode'] == episode].copy()

        # Build the complete episode clip list from the ORIGINAL annotation
        # files so that removed clips are included in the numbering.
        episode_all_clips = []

        # Match on "<episode>R" so that e.g. S01E01 does not also pick up
        # runs of an episode whose id merely starts with the same characters.
        episode_prefixes = [p for p in groups if p.startswith(f"{episode}R")]

        for run_prefix in sorted(episode_prefixes):
            run_files = groups[run_prefix]

            # Prefer the FC annotator's file; otherwise the first file in sorted order
            fc_paths = [path for _, path in run_files if path.name.endswith('_FC.csv')]
            if fc_paths:
                clip_source = fc_paths[0]
            elif run_files:
                clip_source = sorted(run_files)[0][1]
            else:
                continue

            # Best effort: an unreadable file only loses that run's positions
            try:
                episode_all_clips.extend(_read_run_clips(run_prefix, clip_source))
            except Exception as e:
                print(f"  Warning: Could not read {clip_source}: {e}")

        # Sort all clips by run number, then by clip number to get sequential order
        if episode_all_clips:
            episode_all_clips_df = (
                pd.DataFrame(episode_all_clips)
                .sort_values(['run_num', 'clip_number'])
                .reset_index(drop=True)
            )

            # Assign 1-based positions across the whole episode
            episode_all_clips_df['episode_position'] = range(1, len(episode_all_clips_df) + 1)

            # Clip label -> position within the episode
            position_lookup = dict(zip(episode_all_clips_df['index'], episode_all_clips_df['episode_position']))
            total_clips = len(episode_all_clips_df)

            # Debug: Print run-by-run breakdown
            print(f"  {episode} clip breakdown by run:")
            for run_num in sorted(episode_all_clips_df['run_num'].unique()):
                run_clips = episode_all_clips_df[episode_all_clips_df['run_num'] == run_num]
                min_pos = run_clips['episode_position'].min()
                max_pos = run_clips['episode_position'].max()
                count = len(run_clips)
                print(f"    Run {run_num}: {count} clips (positions {min_pos}-{max_pos})")

            # Map episode positions to removed clips (NaN when a label is unknown)
            episode_removals['episode_position'] = episode_removals['index'].map(position_lookup)
            episode_removals['total_clips_in_episode'] = total_clips

            # Debug: Show how many positions were successfully mapped
            mapped_count = episode_removals['episode_position'].notna().sum()
            total_removals = len(episode_removals)
            print(f"  {episode}: Mapped {mapped_count}/{total_removals} removal positions ({total_clips} total clips)")

            # Show unmapped clips for debugging
            unmapped = episode_removals[episode_removals['episode_position'].isna()]
            if len(unmapped) > 0:
                print(f"    Unmapped clips: {unmapped['index'].tolist()[:5]}...")  # Show first 5
        else:
            print(f"  {episode}: No clips found in original annotation files")
            episode_removals['episode_position'] = None
            episode_removals['total_clips_in_episode'] = 0

        removal_df_with_positions.append(episode_removals)

    # Combine all episodes
    final_removal_df = pd.concat(removal_df_with_positions, ignore_index=True)

    # Reorder columns for better readability
    column_order = ['episode', 'run', 'run_prefix', 'index', 'clip_number',
                    'episode_position', 'total_clips_in_episode',
                    'valence_ratings', 'arousal_ratings',
                    'valence_invalid_scores', 'arousal_invalid_scores', 'reason']
    final_removal_df = final_removal_df[column_order]

    # Sort by episode, then by episode position (putting NaN positions at end)
    final_removal_df = final_removal_df.sort_values(['episode', 'episode_position'], na_position='last')

    removal_path = OUT_DIR / "removed_clips_log.csv"
    final_removal_df.to_csv(removal_path, index=False)
    print(f"\nRemoval log saved to: {removal_path}")
    print(f"Total clips removed across all runs: {len(final_removal_df)}")

    # Summary by reason
    removal_summary = final_removal_df.groupby('reason').size().reset_index(name='count')
    print(f"\nRemoval reasons:")
    for _, row in removal_summary.iterrows():
        print(f"  {row['reason']}: {row['count']} clips")

    # Summary by episode. 'size' counts every removed clip; counting non-null
    # episode_position values would silently skip clips that were not mapped.
    episode_summary = final_removal_df.groupby('episode').agg({
        'index': 'size',
        'total_clips_in_episode': 'first'
    }).rename(columns={'index': 'removed_count'}).reset_index()

    print(f"\nRemovals by episode:")
    for _, row in episode_summary.iterrows():
        total_clips = row['total_clips_in_episode']
        removed_count = row['removed_count']
        percentage = (removed_count / total_clips * 100) if total_clips > 0 else 0
        print(f"  {row['episode']}: {removed_count}/{total_clips} clips ({percentage:.1f}%)")

    # Check for unmapped positions
    unmapped_total = final_removal_df['episode_position'].isna().sum()
    if unmapped_total > 0:
        print(f"\nWARNING: {unmapped_total} clips could not be mapped to episode positions")
else:
    print("\nNo clips were removed - all had sufficient ratings.")

  Removed 36/334 clips with insufficient ratings:
    - S01E01R01_clip0029: insufficient arousal ratings (1); arousal scores > 7: [10.0]
    - S01E01R01_clip0030: insufficient arousal ratings (1); arousal scores > 7: [10.0]
    - S01E01R01_clip0031: insufficient arousal ratings (1); arousal scores > 7: [9.0]
    - S01E01R01_clip0032: insufficient arousal ratings (1); arousal scores > 7: [9.0]
    - S01E01R01_clip0033: insufficient arousal ratings (1); arousal scores > 7: [9.0]
    ... and 31 more
Wrote dset/derivatives/annotations/S01E01R01.csv (2 files -> 298 rows)
Wrote dset/derivatives/annotations/S01E01R02.csv (2 files -> 276 rows)
  Removed 48/275 clips with insufficient ratings:
    - S01E01R03_clip0147: insufficient arousal ratings (1); arousal scores > 7: [8.0]
    - S01E01R03_clip0148: insufficient arousal ratings (1); arousal scores > 7: [8.0]
    - S01E01R03_clip0149: insufficient arousal ratings (1); arousal scores > 7: [8.0]
    - S01E01R03_clip0150: insufficient arousal r

In [7]:
# Build one CSV per episode by stacking that episode's per-run combined CSVs
episode_groups = {}

# Bucket the per-run outputs recorded in `summary` by episode id
for prefix, file_count, file_path, removed_count in summary:
    # e.g. S01E01R01 -> S01E01
    episode_match = re.match(r"(S\d+E\d+)R\d+", prefix)
    if episode_match is None:
        continue
    episode_groups.setdefault(episode_match.group(1), []).append((prefix, file_path))

print(f"Found {len(episode_groups)} episodes:")
for episode, runs in episode_groups.items():
    print(f"  {episode}: {len(runs)} runs")

# Write the episode-level files; rows are (episode, run count, path, clip count)
episode_summary = []
for episode, runs in episode_groups.items():
    print(f"\nCombining runs for {episode}:")

    episode_dfs = []
    for run_prefix, run_path in sorted(runs):
        # Load the run's combined CSV and tag every row with its run prefix
        run_df = pd.read_csv(run_path).assign(run=run_prefix)
        episode_dfs.append(run_df)
        print(f"  Added {run_prefix}: {len(run_df)} clips")

    if not episode_dfs:
        continue

    # Stack all runs of the episode into one frame
    episode_combined = pd.concat(episode_dfs, ignore_index=True)

    # Column order: run, index, valence_*, arousal_*, then anything left over
    all_columns = list(episode_combined.columns)
    val_cols = [c for c in all_columns if c.startswith("valence_")]
    aro_cols = [c for c in all_columns if c.startswith("arousal_")]
    leading_cols = ['run', 'index'] + val_cols + aro_cols
    other_cols = [c for c in all_columns if c not in leading_cols]
    episode_combined = episode_combined[leading_cols + other_cols]

    # Save episode-level file
    episode_path = OUT_DIR / f"{episode}.csv"
    episode_combined.to_csv(episode_path, index=False)

    total_clips = len(episode_combined)
    episode_summary.append((episode, len(runs), episode_path, total_clips))
    print(f"  Saved {episode_path}: {total_clips} total clips across {len(runs)} runs")

print(f"\nEpisode-level files created:")
for episode, run_count, file_path, clip_count in episode_summary:
    print(f"  {episode}: {run_count} runs, {clip_count} clips -> {file_path}")

Found 4 episodes:
  S01E01: 3 runs
  S01E02: 7 runs
  S01E03: 6 runs
  S01E04: 5 runs

Combining runs for S01E01:
  Added S01E01R01: 298 clips
  Added S01E01R02: 276 clips
  Added S01E01R03: 227 clips
  Saved dset/derivatives/annotations/S01E01.csv: 801 total clips across 3 runs

Combining runs for S01E02:
  Added S01E02R01: 303 clips
  Added S01E02R02: 230 clips
  Added S01E02R03: 356 clips
  Added S01E02R04: 350 clips
  Added S01E02R05: 292 clips
  Added S01E02R06: 192 clips
  Added S01E02R07: 281 clips
  Saved dset/derivatives/annotations/S01E02.csv: 2004 total clips across 7 runs

Combining runs for S01E03:
  Added S01E03R01: 261 clips
  Added S01E03R02: 309 clips
  Added S01E03R03: 316 clips
  Added S01E03R04: 359 clips
  Added S01E03R05: 268 clips
  Added S01E03R06: 408 clips
  Saved dset/derivatives/annotations/S01E03.csv: 1921 total clips across 6 runs

Combining runs for S01E04:
  Added S01E04R01: 307 clips
  Added S01E04R02: 296 clips
  Added S01E04R03: 293 clips
  Added S01E

In [8]:
# Load the episode-level CSV for S01E03 from the derivatives folder
episode3_df = pd.read_csv(OUT_DIR / "S01E03.csv")

In [8]:
# Display the enhanced removal log with episode positions
#
# Reads OUT_DIR / "removed_clips_log.csv" (if present) and prints:
#   1. the overall size and column list,
#   2. the first 10 rows,
#   3. the first 5 removals for the first 2 episodes with their episode position,
#   4. a per-episode count and percentage of removed clips.
import os
import pandas as pd

if os.path.exists(OUT_DIR / "removed_clips_log.csv"):
    removal_log = pd.read_csv(OUT_DIR / "removed_clips_log.csv")

    print("="*80)
    print("ENHANCED REMOVAL LOG WITH EPISODE POSITIONS")
    print("="*80)
    print(f"Total removed clips: {len(removal_log)}")
    print(f"Columns: {list(removal_log.columns)}")

    print("\n" + "="*60)
    print("SAMPLE OF REMOVAL LOG (first 10 entries):")
    print("="*60)
    print(removal_log.head(10).to_string(index=False))

    print("\n" + "="*60)
    print("EPISODE POSITION EXAMPLES:")
    print("="*60)
    for episode in sorted(removal_log['episode'].unique())[:2]:  # Show first 2 episodes
        episode_data = removal_log[removal_log['episode'] == episode].head(5)
        print(f"\n{episode} (showing first 5 removals):")
        for _, row in episode_data.iterrows():
            pos = row['episode_position']
            total = row['total_clips_in_episode']
            # Either value may be missing in the log; show "?" rather than "nan".
            pos_str = f"{pos:.0f}" if pd.notna(pos) else "?"
            total_str = f"{total:.0f}" if pd.notna(total) else "?"
            print(f"  Position {pos_str:>3}/{total_str:<3}: "
                  f"{row['index']} - {row['reason']}")

    print("\n" + "="*60)
    print("EPISODE SUMMARY:")
    print("="*60)
    # groupby drops rows whose total_clips_in_episode is NaN, so every total
    # that reaches the loop below is a real number.
    episode_stats = removal_log.groupby(['episode', 'total_clips_in_episode']).size().reset_index(name='removed_count')
    for _, row in episode_stats.iterrows():
        # The column is float when the log contains any missing totals, and the
        # `d` format code rejects floats, so normalise to int before formatting.
        removed_count = int(row['removed_count'])
        episode_total = int(row['total_clips_in_episode'])
        percentage = (removed_count / episode_total * 100) if episode_total > 0 else 0
        print(f"{row['episode']}: {removed_count:3d}/{episode_total:3d} removed ({percentage:5.1f}%)")

else:
    print("No removal log found. Run the annotation processing first.")

ENHANCED REMOVAL LOG WITH EPISODE POSITIONS
Total removed clips: 208
Columns: ['episode', 'run', 'run_prefix', 'index', 'clip_number', 'episode_position', 'total_clips_in_episode', 'valence_ratings', 'arousal_ratings', 'valence_invalid_scores', 'arousal_invalid_scores', 'reason']

SAMPLE OF REMOVAL LOG (first 10 entries):
episode  run run_prefix              index  clip_number  episode_position  total_clips_in_episode  valence_ratings  arousal_ratings  valence_invalid_scores  arousal_invalid_scores                                                       reason
 S01E01    1  S01E01R01 S01E01R01_clip0029           29              30.0                    1816                2                1                       0                       1 insufficient arousal ratings (1); arousal scores > 7: [10.0]
 S01E01    1  S01E01R01 S01E01R01_clip0030           30              31.0                    1816                2                1                       0                       1 insufficient a

In [9]:
# Debug Episode 2 specifically
#
# Three independent checks of the S01E02 clip count against the expected value:
#   1. sum of row counts of each run's *_FC.csv file,
#   2. the total recorded in the removal log (if the log exists),
#   3. a re-run of the per-clip enumeration logic over the FC files.
# Relies on `groups`, `detect_columns`, `re` and `OUT_DIR` being defined by
# other cells of this notebook.

# Expected number of clips in S01E02; every check below compares against it.
EXPECTED_S01E02_CLIPS = 2079

print("="*60)
print("DEBUGGING EPISODE 2 CLIP COUNT")
print("="*60)

# Check what groups were found for S01E02
s01e02_groups = {k: v for k, v in groups.items() if k.startswith("S01E02")}
print(f"S01E02 groups found: {len(s01e02_groups)}")

total_clips_manual = 0
for prefix in sorted(s01e02_groups.keys()):
    files = s01e02_groups[prefix]
    # Each entry is an (annotator, path) pair; keep only the FC annotation file.
    fc_files = [f for f in files if f[1].name.endswith('_FC.csv')]

    if fc_files:
        fc_file = fc_files[0][1]
        df = pd.read_csv(fc_file)
        clip_count = len(df)
        total_clips_manual += clip_count
        print(f"{prefix}: {clip_count} clips (using {fc_file.name})")
    else:
        print(f"{prefix}: No FC files found")

print(f"\nManual count total: {total_clips_manual}")
print(f"Expected total: {EXPECTED_S01E02_CLIPS}")
print(f"Discrepancy: {total_clips_manual - EXPECTED_S01E02_CLIPS}")

# Check what the removal log currently shows.
# Guarded so a missing log does not abort the remaining checks in this cell.
removal_log_path = OUT_DIR / "removed_clips_log.csv"
if removal_log_path.exists():
    removal_log = pd.read_csv(removal_log_path)
    s01e02_log = removal_log[removal_log['episode'] == 'S01E02']
    if len(s01e02_log) > 0:
        print(f"\nRemoval log shows: {s01e02_log['total_clips_in_episode'].iloc[0]} total clips")
    else:
        print("\nNo S01E02 entries in removal log")
else:
    print("\nNo removal log found. Run the annotation processing first.")

# Let's check the actual structure created by the algorithm
print(f"\n" + "="*40)
print("CHECKING ACTUAL ALGORITHM OUTPUT")
print("="*40)

# Re-run the specific logic for S01E02
episode = "S01E02"
episode_all_clips = []

# Find all annotation groups for this episode and use FC files specifically
episode_prefixes = [prefix for prefix in groups.keys() if prefix.startswith(episode)]

for prefix in sorted(episode_prefixes):
    files = groups[prefix]
    # Use FC files specifically (most complete annotation set)
    fc_files = [f for f in files if f[1].name.endswith('_FC.csv')]

    if fc_files:
        fc_file = fc_files[0][1]  # Get the first FC file for this run
        try:
            original_df = pd.read_csv(fc_file)
            idx_col, val_col, aro_col = detect_columns(original_df)

            # Extract run number for sorting
            run_match = re.search(r'R(\d+)', prefix)
            run_num = int(run_match.group(1)) if run_match else 0

            print(f"{prefix}: Processing {len(original_df)} clips from {fc_file.name}")

            # Process each clip index from original file
            for idx, row in original_df.iterrows():
                clip_index = row[idx_col]
                try:
                    if isinstance(clip_index, str) and '_clip' in clip_index:
                        # Numeric suffix after "_clip", e.g. "..._clip0012" -> 12
                        clip_num_str = clip_index.split('_clip')[-1]
                        clip_number = int(clip_num_str)
                    else:
                        # No recognisable clip label: fall back to the row label
                        clip_number = idx
                except (ValueError, IndexError, AttributeError):
                    clip_number = idx

                episode_all_clips.append({
                    'run_prefix': prefix,
                    'run_num': run_num,
                    'index': clip_index,
                    'clip_number': clip_number
                })
        except Exception as e:
            # Best-effort debug pass: report the failing file and keep going.
            print(f"  Error reading {fc_file}: {e}")

print(f"\nAlgorithm produced: {len(episode_all_clips)} clips")
print(f"Expected: {EXPECTED_S01E02_CLIPS}")
print(f"Discrepancy: {len(episode_all_clips) - EXPECTED_S01E02_CLIPS}")

if len(episode_all_clips) != EXPECTED_S01E02_CLIPS:
    print(f"\n❌ Found the issue! The algorithm is producing {len(episode_all_clips)} clips instead of {EXPECTED_S01E02_CLIPS}")
else:
    print(f"\n✅ Algorithm is correct - issue must be elsewhere")

DEBUGGING EPISODE 2 CLIP COUNT
S01E02 groups found: 7
S01E02R01: 313 clips (using S01E02R01_FC.csv)
S01E02R02: 250 clips (using S01E02R02_FC.csv)
S01E02R03: 362 clips (using S01E02R03_FC.csv)
S01E02R04: 354 clips (using S01E02R04_FC.csv)
S01E02R05: 295 clips (using S01E02R05_FC.csv)
S01E02R06: 218 clips (using S01E02R06_FC.csv)
S01E02R07: 294 clips (using S01E02R07_FC.csv)

Manual count total: 2086
Expected total: 2079
Discrepancy: 7

Removal log shows: 2086 total clips

CHECKING ACTUAL ALGORITHM OUTPUT
S01E02R01: Processing 313 clips from S01E02R01_FC.csv
S01E02R02: Processing 250 clips from S01E02R02_FC.csv
S01E02R03: Processing 362 clips from S01E02R03_FC.csv
S01E02R04: Processing 354 clips from S01E02R04_FC.csv
S01E02R05: Processing 295 clips from S01E02R05_FC.csv
S01E02R06: Processing 218 clips from S01E02R06_FC.csv
S01E02R07: Processing 294 clips from S01E02R07_FC.csv

Algorithm produced: 2086 clips
Expected: 2079
Discrepancy: 7

❌ Found the issue! The algorithm is producing 2086

In [None]:
# Comprehensive analysis of episode 3 data
#
# Prints a diagnostic report for `episode3_df` (loaded by an earlier cell):
# shape/dtypes, a sample, checks on the 'index' column, per-column summaries of
# the valence/arousal columns, the 'run' column, and basic data-quality checks.
print("="*60)
print("EPISODE 3 DATA ANALYSIS")
print("="*60)

print(f"DataFrame shape: {episode3_df.shape}")
print(f"Columns: {list(episode3_df.columns)}")
print(f"Data types:\n{episode3_df.dtypes}")

print("\n" + "="*40)
print("SAMPLE DATA (first 10 rows):")
print("="*40)
print(episode3_df.head(10))

print("\n" + "="*40)
print("INDEX COLUMN ANALYSIS:")
print("="*40)
# iloc[0] raises IndexError on an empty frame; report None instead of crashing.
first_index_type = type(episode3_df['index'].iloc[0]) if len(episode3_df) > 0 else None
print(f"Index column type: {first_index_type}")
print(f"Index unique values (first 20): {episode3_df['index'].unique()[:20]}")
print(f"Any null values in index: {episode3_df['index'].isnull().sum()}")

# Try the index processing that caused issues
print("\n" + "="*40)
print("INDEX PROCESSING TEST:")
print("="*40)
try:
    index_as_str = episode3_df['index'].astype(str)
    print(f"Index as string (first 10): {index_as_str.head(10).tolist()}")

    # The trailing four characters are the clip number in labels like
    # "S01E03R01_clip0007" (see the sample output of this cell).
    last_4_chars = index_as_str.str[-4:]
    print(f"Last 4 characters (first 10): {last_4_chars.head(10).tolist()}")

    last_4_as_int = last_4_chars.astype(int)
    print(f"Last 4 as int (first 10): {last_4_as_int.head(10).tolist()}")
    print("✅ Index processing successful")
except Exception as e:
    print(f"❌ Index processing failed: {e}")
    print(f"Error type: {type(e).__name__}")

print("\n" + "="*40)
print("EMOTION COLUMNS ANALYSIS:")
print("="*40)
val_cols = [col for col in episode3_df.columns if 'valence' in col.lower()]
aro_cols = [col for col in episode3_df.columns if 'arousal' in col.lower()]

print(f"Valence columns: {val_cols}")
print(f"Arousal columns: {aro_cols}")

for col in val_cols + aro_cols:
    print(f"\n{col}:")
    print(f"  Type: {episode3_df[col].dtype}")
    # Missing values are excluded here (they are reported by "Null count"
    # below) because NaN does not order consistently under sorted().
    unique_values = episode3_df[col].dropna().unique().tolist()
    try:
        unique_values = sorted(unique_values)
    except TypeError:
        # Mixed str/number values cannot be compared; order by text so the
        # report continues and the mixed-type check further down still runs.
        unique_values = sorted(unique_values, key=str)
    print(f"  Unique values: {unique_values}")
    print(f"  Null count: {episode3_df[col].isnull().sum()}")
    print(f"  Sample values: {episode3_df[col].head(10).tolist()}")

print("\n" + "="*40)
print("RUN COLUMN ANALYSIS:")
print("="*40)
if 'run' in episode3_df.columns:
    print(f"Run column unique values: {episode3_df['run'].unique()}")
    print(f"Run value counts:\n{episode3_df['run'].value_counts()}")
else:
    print("No 'run' column found")

print("\n" + "="*40)
print("DATA QUALITY CHECKS:")
print("="*40)
print(f"Any completely empty rows: {episode3_df.isnull().all(axis=1).sum()}")
print(f"Any duplicate rows: {episode3_df.duplicated().sum()}")

# Check for problematic characters or mixed types
print(f"\nChecking for mixed data types in emotion columns:")
for col in val_cols + aro_cols:
    try:
        numeric_version = pd.to_numeric(episode3_df[col], errors='coerce')
        nan_count = numeric_version.isnull().sum()
        original_nan_count = episode3_df[col].isnull().sum()
        # Values that became NaN only through coercion are the non-numeric ones.
        conversion_issues = nan_count - original_nan_count
        if conversion_issues > 0:
            print(f"  {col}: {conversion_issues} values couldn't convert to numeric")
            # Show the problematic values
            mask = numeric_version.isnull() & episode3_df[col].notnull()
            problematic_values = episode3_df.loc[mask, col].unique()
            print(f"    Problematic values: {problematic_values}")
        else:
            print(f"  {col}: ✅ All values convert to numeric properly")
    except Exception as e:
        print(f"  {col}: ❌ Error checking numeric conversion: {e}")

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)

EPISODE 3 DATA ANALYSIS
DataFrame shape: (1921, 6)
Columns: ['run', 'index', 'valence_1', 'valence_2', 'arousal_1', 'arousal_2']
Data types:
run           object
index         object
valence_1    float64
valence_2      int64
arousal_1    float64
arousal_2      int64
dtype: object

SAMPLE DATA (first 10 rows):
         run               index  valence_1  valence_2  arousal_1  arousal_2
0  S01E03R01  S01E03R01_clip0000        3.0          5        3.0          4
1  S01E03R01  S01E03R01_clip0001        3.0          5        3.0          4
2  S01E03R01  S01E03R01_clip0002        3.0          5        3.0          4
3  S01E03R01  S01E03R01_clip0003        1.0          5        7.0          5
4  S01E03R01  S01E03R01_clip0004        1.0          5        7.0          5
5  S01E03R01  S01E03R01_clip0005        1.0          5        7.0          5
6  S01E03R01  S01E03R01_clip0006        1.0          5        5.0          5
7  S01E03R01  S01E03R01_clip0007        1.0          5        5.0        

In [None]:
# Debug the episode clip counting issue
#
# Part 1 counts the data rows of every S01E02 *_FC.csv file in ANNOT_DIR.
# Part 2 rebuilds the run groups from the file names, enumerates the clips of
# the first file of each S01E02 run, and compares that total with part 1.
import re
from pathlib import Path
import pandas as pd

ANNOT_DIR = Path("dset/annotations")

print("="*60)
print("DEBUGGING EPISODE 2 CLIP COUNT DISCREPANCY")
print("="*60)

# Find all S01E02 files
s01e02_files = [f for f in ANNOT_DIR.iterdir() if f.name.startswith("S01E02") and f.name.endswith("_FC.csv")]
s01e02_files.sort()

print(f"Found {len(s01e02_files)} S01E02 FC files:")
total_clips_actual = 0

for file_path in s01e02_files:
    df = pd.read_csv(file_path)
    # read_csv consumes the header row, so len(df) is already the clip count.
    clip_count = len(df)

    # Extract run number
    run_match = re.search(r'R(\d+)', file_path.name)
    run_num = int(run_match.group(1)) if run_match else 0

    print(f"Run {run_num:2d} ({file_path.name}): {clip_count:3d} clips")
    total_clips_actual += clip_count

print(f"\nACTUAL TOTAL: {total_clips_actual} clips")

# Now check what the current processing code would produce
print(f"\n" + "="*40)
print("CHECKING CURRENT PROCESSING LOGIC")
print("="*40)

# Replicate the logic from combine_group function:
# map each run prefix (e.g. "S01E02R03") to its (annotator, path) pairs.
groups = {}
pattern = re.compile(r"^(S\d+E\d+R\d+)_([A-Za-z]{2})\.csv$")

for p in sorted(ANNOT_DIR.iterdir()):
    if not p.is_file():
        continue
    m = pattern.match(p.name)
    if not m:
        continue
    prefix = m.group(1)
    annot = m.group(2)
    groups.setdefault(prefix, []).append((annot, p))

# Focus on S01E02 groups
s01e02_groups = {k: v for k, v in groups.items() if k.startswith("S01E02")}

print(f"Found {len(s01e02_groups)} S01E02 run groups:")
for prefix, files in sorted(s01e02_groups.items()):
    print(f"  {prefix}: {len(files)} files")

# Simulate the episode position calculation logic
episode_all_clips = []
episode = "S01E02"

for prefix in sorted(s01e02_groups.keys()):
    files = s01e02_groups[prefix]
    if files:
        # Use the first file from each run (as in current code).
        # Sorting the (annotator, path) pairs orders them by annotator code.
        first_file = sorted(files)[0][1]  # Get the first file path for this run
        run_match = re.search(r'R(\d+)', prefix)
        run_num = int(run_match.group(1)) if run_match else 0

        original_df = pd.read_csv(first_file)
        print(f"  {prefix} (using {first_file.name}): {len(original_df)} clips")

        # Process each clip index from original file.
        # Iterate the first column directly instead of via iterrows(): iterrows
        # builds one Series per row and upcasts its values to a common dtype,
        # which would turn a non-string index value into a float.
        for idx, clip_index in original_df.iloc[:, 0].items():  # First column should be index
            try:
                if isinstance(clip_index, str) and '_clip' in clip_index:
                    # Numeric suffix after "_clip", e.g. "..._clip0012" -> 12
                    clip_num_str = clip_index.split('_clip')[-1]
                    clip_number = int(clip_num_str)
                else:
                    # No recognisable clip label: fall back to the row label
                    clip_number = idx
            except (ValueError, IndexError, AttributeError):
                clip_number = idx

            episode_all_clips.append({
                'run_prefix': prefix,
                'run_num': run_num,
                'index': clip_index,
                'clip_number': clip_number
            })

print(f"\nPROCESSED TOTAL: {len(episode_all_clips)} clips")
print(f"DISCREPANCY: {len(episode_all_clips) - total_clips_actual}")

if len(episode_all_clips) != total_clips_actual:
    print(f"\n❌ FOUND THE ISSUE: Processed count doesn't match actual count")
    print("This suggests the processing logic is incorrect.")
else:
    print(f"\n✅ Processing logic is correct")

# Show first few clips from each run to debug
print(f"\n" + "="*40)
print("SAMPLE CLIPS BY RUN")
print("="*40)
episode_df = pd.DataFrame(episode_all_clips)
for run_num in sorted(episode_df['run_num'].unique()):
    run_clips = episode_df[episode_df['run_num'] == run_num]
    print(f"Run {run_num}: {len(run_clips)} clips")
    print(f"  First: {run_clips.iloc[0]['index']}")
    print(f"  Last:  {run_clips.iloc[-1]['index']}")