In [None]:
# Write the key diagnostic tables to CSV for documentation and follow-up analysis

export_dir = Path("diagnostic_exports")
export_dir.mkdir(exist_ok=True)

# 1. Easy vs Hard exemplars: ten best-scoring Easy images, ten worst-scoring Hard images
export_cols = ['mean_mIoU', 'std_mIoU', 'min_mIoU', 'max_mIoU', 'city', 'difficulty_global']
is_easy = pivot_models['difficulty_global'] == 'Easy'
is_hard = pivot_models['difficulty_global'] == 'Hard'
easy_exemplars = pivot_models.loc[is_easy].nlargest(10, 'mean_mIoU')
hard_exemplars = pivot_models.loc[is_hard].nsmallest(10, 'mean_mIoU')

exemplar_sets = (('easy_exemplars', easy_exemplars), ('hard_exemplars', hard_exemplars))
for stem, exemplars in exemplar_sets:
    # The image id index is kept in the file (no index=False here).
    exemplars.loc[:, export_cols].to_csv(export_dir / f'{stem}.csv')

print("✓ Exported exemplars:")
for stem, exemplars in exemplar_sets:
    print(f"  - {stem}.csv ({len(exemplars)} images)")

# 2-5. City statistics, class difficulty analysis, fingerprints, correlation summary.
# Each entry is (table, file name, keyword arguments forwarded to to_csv).
remaining_exports = [
    (city_df, 'city_statistics.csv', {'index': False}),
    (class_difficulty_wide, 'class_difficulty_analysis.csv', {}),
    (fingerprints_df, 'fingerprints.csv', {'index': False}),
    (correlations, 'feature_correlations.csv', {'header': ['Correlation']}),
]
for table, filename, csv_options in remaining_exports:
    table.to_csv(export_dir / filename, **csv_options)
    print(f"✓ Exported {filename}")

print(f"\nAll exports saved to: {export_dir.absolute()}")

## Section 9: Export Diagnostic Results

In [None]:
# Generate comprehensive diagnostic report.
# Reads pivot_models, fingerprints_df, city_df, pval, class_difficulty_wide and
# correlations produced by the analysis cells; only prints, modifies nothing.
print("=" * 80)
print("DIAGNOSTIC REPORT: Easy vs Hard Images Analysis")
print("=" * 80)

easy_rows = pivot_models[pivot_models['difficulty_global'] == 'Easy']
hard_rows = pivot_models[pivot_models['difficulty_global'] == 'Hard']
total_images = len(pivot_models)

print("\n1. GLOBAL PATTERNS")
print("-" * 80)
print(f"Total Images: {total_images}")
for cohort_label, cohort in (('Easy', easy_rows), ('Hard', hard_rows)):
    # Guard the share so an empty frame reports nan instead of dividing by zero.
    share = len(cohort) / total_images * 100 if total_images else float('nan')
    print(f"{cohort_label} Images: {len(cohort)} ({share:.1f}%)")
print(f"\nPerformance Gap (Easy vs Hard):")
easy_perf = easy_rows['mean_mIoU'].mean()
hard_perf = hard_rows['mean_mIoU'].mean()
perf_gap = easy_perf - hard_perf
# A zero Hard mean would make the relative gap undefined; report nan in that case.
relative_gap = perf_gap / hard_perf * 100 if hard_perf else float('nan')
print(f"  Easy: {easy_perf:.4f}")
print(f"  Hard: {hard_perf:.4f}")
print(f"  Gap: {perf_gap:.4f} ({relative_gap:.1f}% difference)")

print("\n2. STATISTICAL SIGNATURES")
print("-" * 80)
print("Key Differentiators (based on effect size and statistical significance):")
# Look the cohorts up by label; a missing cohort prints nan rather than raising IndexError.
consensus_by_label = fingerprints_df.set_index('Label')['Consensus']
hard_consensus = consensus_by_label.get('Hard', np.nan)
easy_consensus = consensus_by_label.get('Easy', np.nan)
print(f"  - Model Agreement: Hard images show {hard_consensus:.4f} vs Easy {easy_consensus:.4f}")
print(f"  - Uncertainty (Model Variance): Hard images show higher disagreement")
print(f"  - Class Performance Variance: Hard images have uneven per-class performance")

print("\n3. CITY-BASED PATTERNS")
print("-" * 80)
# Rank explicitly instead of relying on whatever order city_df happens to be in.
city_ranked = city_df.sort_values('Mean mIoU', ascending=False)
best_city = city_ranked.iloc[0]
worst_city = city_ranked.iloc[-1]
print(f"Best performing city: {best_city['City']} (mIoU: {best_city['Mean mIoU']:.4f})")
print(f"Worst performing city: {worst_city['City']} (mIoU: {worst_city['Mean mIoU']:.4f})")
print(f"Easy/Hard Ratio by City (p-value={pval:.2e}):")
for _, row in city_ranked.head(5).iterrows():
    print(f"  {row['City']}: {row['Easy %']:.1f}% easy, {row['Hard %']:.1f}% hard")

print("\n4. PER-CLASS INSIGHTS")
print("-" * 80)
if 'Delta' in class_difficulty_wide.columns:
    class_delta = class_difficulty_wide['Delta']
    most_sensitive = class_delta.nlargest(5)
    # "Least sensitive" means the smallest gap in magnitude: a large negative gap is
    # still a large gap, so rank by |Delta| and print the signed value.
    least_sensitive = class_delta.loc[class_delta.abs().nsmallest(5).index]

    print("Classes most sensitive to image difficulty (largest Easy-Hard gap):")
    for cls, delta in most_sensitive.items():
        print(f"  {cls}: Δ={delta:.4f}")

    print("\nClasses least sensitive (small Easy-Hard gap):")
    for cls, delta in least_sensitive.items():
        print(f"  {cls}: Δ={delta:.4f}")

print("\n5. FEATURE CORRELATIONS")
print("-" * 80)
# The targets correlate with themselves by construction, so leave them out of the ranking.
feature_corr = correlations.drop(['target_mIoU', 'target_easy'], errors='ignore')
top_positive = feature_corr.nlargest(3)
top_negative = feature_corr.nsmallest(3)

print("Features most correlated with HIGH performance:")
for feat, corr in top_positive.items():
    print(f"  {feat}: r={corr:.4f}")

print("\nFeatures most correlated with LOW performance (difficulty):")
for feat, corr in top_negative.items():
    print(f"  {feat}: r={corr:.4f}")

print("\n6. ACTIONABLE INSIGHTS")
print("-" * 80)
# Back the disagreement bullet with a disagreement measure (mean across-model std per
# cohort) rather than the mIoU gap, which is already reported in section 1.
disagreement_gap = hard_rows['std_mIoU'].mean() - easy_rows['std_mIoU'].mean()
print("• Hard images benefit from: targeted data augmentation, class-specific focus")
print("• Cities with low performance should be prioritized for model improvement")
print(f"• Model disagreement is a key signal of image difficulty (Δ model std, Hard - Easy = {disagreement_gap:.4f})")
print("• Some classes show high sensitivity to image difficulty - could be augmentation targets")
print("• Class imbalance and spatial complexity appear correlated with difficulty")

print("\n" + "=" * 80)

## Section 8: Diagnostic Report - Key Findings

In [None]:
try:
    import ipywidgets as widgets
    from IPython.display import display, clear_output

    def diagnostic_dashboard():
        """Interactive dashboard for easy/hard image exploration.

        Builds three controls (difficulty cohort, number of images, sort metric),
        and on every change re-renders a summary table plus a per-image breakdown
        of the three lowest and three highest scoring classes. Reads the notebook
        globals pivot_models, df_raw and class_cols; returns nothing.
        """

        style = {'description_width': 'initial'}

        # Per-class scores averaged over all models, one row per image. Computed once
        # here so every widget callback is a cheap lookup.
        class_means_by_image = df_raw.groupby('image_id')[class_cols].mean()

        # Difficulty selector
        difficulty_selector = widgets.RadioButtons(
            options=['Easy', 'Medium', 'Hard'],
            value='Easy',
            description='Image Difficulty:',
            style=style
        )

        # Number of images to show
        num_images_slider = widgets.IntSlider(
            value=5, min=1, max=15, step=1,
            description='Number to Show:',
            style=style
        )

        # Sort metric
        sort_metric = widgets.Dropdown(
            options=[('Mean mIoU', 'mean_mIoU'),
                     ('Model Std Dev', 'std_mIoU'),
                     ('Min mIoU', 'min_mIoU'),
                     ('Uncertainty', 'uncertainty')],
            value='mean_mIoU',
            description='Sort By:',
            style=style
        )

        output = widgets.Output()

        # Source column -> heading shown in the summary table.
        column_labels = {
            'mean_mIoU': 'Mean IoU',
            'std_mIoU': 'Model Std',
            'min_mIoU': 'Min IoU',
            'max_mIoU': 'Max IoU',
            'num_easy': '#Easy Classes',
            'num_hard': '#Hard Classes',
            'city': 'City',
        }

        def update_dashboard(*args):
            """Re-render the output area from the current widget values."""
            diff = difficulty_selector.value
            n = num_images_slider.value
            metric = sort_metric.value

            # Filter
            subset = pivot_models[pivot_models['difficulty_global'] == diff].copy()

            # Create uncertainty column
            subset['uncertainty'] = subset['max_mIoU'] - subset['min_mIoU']

            # Sort: std ascending, every other metric (including uncertainty) descending
            if metric == 'uncertainty':
                sorted_subset = subset.sort_values('uncertainty', ascending=False).head(n)
            else:
                sorted_subset = subset.sort_values(metric, ascending=(metric == 'std_mIoU')).head(n)

            with output:
                # wait=True swaps the old output only when the new one is ready (no flicker).
                clear_output(wait=True)

                print(f"Top {n} {diff} Images (sorted by {sort_metric.label}):\n")

                # Display stats
                display_data = (
                    sorted_subset[list(column_labels)]
                    .rename(columns=column_labels)
                    .round(4)
                )
                display(display_data)

                # Per-image detailed stats
                print("\n\nDetailed Per-Image Analysis:")
                for idx, (img_id, row) in enumerate(sorted_subset.iterrows(), 1):
                    print(f"\n{idx}. {img_id} (City: {row['city']})")
                    print(f"   Global: mean={row['mean_mIoU']:.4f}, std={row['std_mIoU']:.4f}, range=[{row['min_mIoU']:.4f}, {row['max_mIoU']:.4f}]")
                    print(f"   Classes: {int(row['num_easy'])} easy, {int(row['num_hard'])} hard")

                    # Show per-class performance for this image
                    if img_id not in class_means_by_image.index:
                        continue
                    # Mean across models; classes with no score for this image are
                    # dropped so NaN never shows up as a "best performer".
                    mean_by_class = class_means_by_image.loc[img_id].dropna().sort_values(ascending=True)
                    worst_classes = mean_by_class.head(3)
                    best_classes = mean_by_class.tail(3)

                    if not worst_classes.empty:
                        print(f"   Worst performers: {', '.join([f'{c}={v:.3f}' for c, v in worst_classes.items()])}")
                    if not best_classes.empty:
                        print(f"   Best performers: {', '.join([f'{c}={v:.3f}' for c, v in best_classes.items()])}")

        difficulty_selector.observe(update_dashboard, names='value')
        num_images_slider.observe(update_dashboard, names='value')
        sort_metric.observe(update_dashboard, names='value')

        controls = widgets.VBox([
            widgets.HBox([difficulty_selector, sort_metric]),
            num_images_slider
        ])

        dashboard = widgets.VBox([controls, output])
        display(dashboard)
        # Initial render so the dashboard is not empty before the first interaction.
        update_dashboard()

    print("Initializing Diagnostic Dashboard...")
    diagnostic_dashboard()

except ImportError:
    # ipywidgets is optional; the rest of the notebook does not depend on the dashboard.
    print("ipywidgets not available. Skipping interactive dashboard.")

## Section 7: Interactive Diagnosis Dashboard

Explore easy vs hard images with detailed metrics and per-class breakdowns.

In [None]:
# Create a per-class difficulty heatmap
# Rows: classes, Columns: easy/medium/hard image difficulty, Values: mean IoU

difficulty_levels = ['Easy', 'Medium', 'Hard']

# Group by the image-level difficulty. The class-level 'difficulty' label stored in
# per_class_analysis is a quantile of the very mean_iou being averaged here, so
# grouping by it would make Easy > Hard true by construction for every class.
image_difficulty = pivot_models['difficulty_global']

class_difficulty_stats = []

for class_name in class_cols[:20]:  # First 20 classes
    if class_name not in per_class_analysis:
        continue

    class_iou = per_class_analysis[class_name]['mean_iou']
    # Align image difficulty to the images that actually have a score for this class.
    level_per_image = image_difficulty.reindex(class_iou.index)

    for diff_level in difficulty_levels:
        mask = level_per_image == diff_level
        mean_iou = class_iou[mask].mean() if mask.any() else np.nan

        class_difficulty_stats.append({
            'Class': class_name,
            'Difficulty': diff_level,
            'Mean IoU': mean_iou,
            'Count': int(mask.sum())
        })

# Naming the columns up front keeps the pivot well-defined even when no class was
# analysed; reindex guarantees all three difficulty columns in display order.
class_difficulty_wide = pd.DataFrame(
    class_difficulty_stats, columns=['Class', 'Difficulty', 'Mean IoU', 'Count']
).pivot(
    index='Class', columns='Difficulty', values='Mean IoU'
).reindex(columns=difficulty_levels)

print("Per-Class Performance by Image Difficulty:\n")
print(class_difficulty_wide)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 8))

# Heatmap of mean IoU by class and difficulty (drawn before the Delta column is added)
sns.heatmap(class_difficulty_wide, annot=True, fmt='.3f', cmap='RdYlGn',
            cbar_kws={'label': 'Mean IoU'}, ax=axes[0], vmin=0, vmax=1)
axes[0].set_title('Mean IoU: Class x Image Difficulty')
axes[0].set_ylabel('Class')
axes[0].set_xlabel('Image Difficulty Level')

# Difficulty delta: (Easy - Hard) showing which classes benefit most from easy images.
# Negative values (class scores better on hard images) are possible and drawn in red.
class_difficulty_wide['Delta'] = class_difficulty_wide['Easy'] - class_difficulty_wide['Hard']
class_difficulty_wide_sorted = class_difficulty_wide.sort_values('Delta', ascending=True)

colors = ['red' if x < 0 else 'green' for x in class_difficulty_wide_sorted['Delta']]
bar_positions = range(len(class_difficulty_wide_sorted))
axes[1].barh(bar_positions, class_difficulty_wide_sorted['Delta'], color=colors, alpha=0.7)
axes[1].set_yticks(bar_positions)
axes[1].set_yticklabels(class_difficulty_wide_sorted.index)
axes[1].set_xlabel('IoU Difference (Easy - Hard)')
axes[1].set_title('Class Sensitivity to Image Difficulty')
axes[1].axvline(0, color='black', linestyle='--', linewidth=1)
axes[1].grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("✓ Per-class difficulty analysis complete")

## Section 6: Per-Class Difficulty Heatmap

In [None]:
# Build a feature matrix and correlate with performance
# Features: class-level statistics and complexity indicators

feature_matrix = pd.DataFrame(index=pivot_models.index)

# Class-based features
for class_name in class_cols[:15]:  # First 15 classes
    if class_name in per_class_analysis:
        class_data = per_class_analysis[class_name]
        # per_class_analysis frames are built from rows with a non-NaN score for the
        # class, so an image is "present" exactly when its id appears in that index.
        feature_matrix[f'{class_name}_present'] = feature_matrix.index.isin(class_data.index).astype(float)

# Class-wise IoU statistics.
# pivot_models is pivoted on image_mIoU and is not guaranteed to carry per-class
# columns, so the per-image class scores (mean across models) come from df_raw.
class_scores = df_raw.groupby('image_id')[class_cols].mean().reindex(pivot_models.index)
feature_matrix['class_iou_std'] = class_scores.std(axis=1)  # Spread across classes
feature_matrix['class_iou_min'] = class_scores.min(axis=1)  # Worst class performance
feature_matrix['class_iou_mean'] = class_scores.mean(axis=1)  # Average class perf

# Model consensus features (mean clipped at 0.01 so near-zero means cannot blow up the ratio)
feature_matrix['model_agreement'] = 1 - (pivot_models['std_mIoU'] / pivot_models['mean_mIoU'].clip(lower=0.01))
feature_matrix['model_std'] = pivot_models['std_mIoU']

# Target variable
feature_matrix['target_mIoU'] = pivot_models['mean_mIoU']
feature_matrix['target_easy'] = (pivot_models['difficulty_global'] == 'Easy').astype(int)

print("Feature Matrix Summary:")
print(feature_matrix.describe())

# Correlation with mIoU
correlations = feature_matrix.corr()['target_mIoU'].sort_values(ascending=False)
print("\n\nTop Correlations with Performance (mIoU):")
print(correlations.head(15))
print("\nNegative Correlations (indicating difficulty):")
print(correlations.tail(10))

# Visualize correlation heatmap
fig, ax = plt.subplots(figsize=(10, 8))
# Constant features (e.g. a class present in every image) have NaN correlation and
# sort to the end; skip them for the plot. dict.fromkeys removes names that appear
# in both head and tail when there are fewer than 15 features.
ranked = correlations.dropna()
important_features = list(dict.fromkeys(
    ranked.head(12).index.tolist() + ranked.tail(3).index.tolist()
))
corr_matrix = feature_matrix[important_features].corr()

sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            cbar_kws={'label': 'Correlation'}, ax=ax, vmin=-1, vmax=1)
ax.set_title('Correlation Matrix: Key Features and Performance')
plt.tight_layout()
plt.show()

print("✓ Correlation analysis complete")

## Section 5: Correlation Analysis - Image Properties and Performance

In [None]:
# Per-city breakdown of performance and of the Easy/Hard split
print("City-wise Analysis:\n")


def _summarise_city(city_name):
    """Return one summary record (count, mean mIoU, Easy/Hard share, mean std) for a city."""
    rows = pivot_models[pivot_models['city'] == city_name]
    n_rows = len(rows)
    easy_share = (rows['difficulty_global'] == 'Easy').sum() / n_rows
    hard_share = (rows['difficulty_global'] == 'Hard').sum() / n_rows
    return {
        'City': city_name,
        'Count': n_rows,
        'Mean mIoU': rows['mean_mIoU'].mean(),
        'Easy %': easy_share * 100,
        'Hard %': hard_share * 100,
        'Std mIoU': rows['std_mIoU'].mean()
    }


# Cities are visited in order of first appearance, then ranked best-first.
city_stats = [_summarise_city(name) for name in pivot_models['city'].unique()]

city_df = pd.DataFrame(city_stats).sort_values('Mean mIoU', ascending=False)
print(city_df.to_string())

# Chi-squared test of independence between city and difficulty label
city_difficulty_crosstab = pd.crosstab(pivot_models['city'], pivot_models['difficulty_global'])
chi2, pval, dof, expected = stats.chi2_contingency(city_difficulty_crosstab)

print(f"\n\nChi-squared Test (City vs Difficulty):")
print(f"  Chi2: {chi2:.4f}, p-value: {pval:.2e}")
print(f"  Contingency table:\n{city_difficulty_crosstab}")

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
share_ax, perf_ax = axes

# Left panel: Easy share with the Hard share stacked on top, worst city first
city_df_sorted = city_df.sort_values('Mean mIoU')
x = np.arange(len(city_df_sorted))
easy_pct = city_df_sorted['Easy %']
hard_pct = city_df_sorted['Hard %']
share_ax.bar(x, easy_pct, label='Easy %', color='green', alpha=0.7)
share_ax.bar(x, hard_pct, bottom=easy_pct, label='Hard %', color='red', alpha=0.7)
share_ax.set_xticks(x)
share_ax.set_xticklabels(city_df_sorted['City'], rotation=45, ha='right')
share_ax.set_ylabel('Percentage')
share_ax.set_title('Easy/Hard Distribution by City')
share_ax.legend()
share_ax.grid(True, alpha=0.3, axis='y')

# Right panel: mean mIoU per city
perf_ax.barh(city_df_sorted['City'], city_df_sorted['Mean mIoU'], color='steelblue')
perf_ax.set_xlabel('Mean mIoU')
perf_ax.set_title('Average Performance by City')
perf_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("✓ Metadata analysis complete")

## Section 4: Metadata-Based Attribute Analysis

In [None]:
# Compute statistical fingerprints: Easy vs Hard
def compute_statistical_fingerprint(df, condition_mask, label, class_columns=None, model_columns=None):
    """Compute statistical properties for a cohort.

    Parameters
    ----------
    df : DataFrame with one row per image. Must contain 'mean_mIoU', 'std_mIoU',
        'min_mIoU' and 'max_mIoU'.
    condition_mask : boolean mask selecting the cohort rows of ``df``.
    label : value stored under 'Label' in the result.
    class_columns : optional list of per-class score columns. Defaults to the
        notebook-level ``class_cols``; names missing from ``df`` are ignored.
    model_columns : optional list of per-model mIoU columns. When omitted they are
        inferred as the numeric columns that are neither class columns nor one of
        the known summary/metadata columns.

    Returns
    -------
    dict with the cohort label, size and mean statistics. 'Mean Class Variance'
    and 'Mean Per-Model Variance' are NaN when no matching columns are available.
    """
    subset = df[condition_mask]

    # Summary / metadata columns the notebook adds next to the per-model scores.
    summary_columns = {
        'mean_mIoU', 'std_mIoU', 'min_mIoU', 'max_mIoU', 'city',
        'difficulty_global', 'num_easy', 'num_hard', 'num_classes',
    }

    if class_columns is None:
        class_columns = globals().get('class_cols', [])
    class_columns = [c for c in class_columns if c in subset.columns]

    if model_columns is None:
        # Select by name, not by position: a trailing slice silently changes meaning
        # whenever a summary column is added to the frame.
        excluded = summary_columns.union(class_columns)
        numeric_columns = subset.select_dtypes(include='number').columns
        model_columns = [c for c in numeric_columns if c not in excluded]

    class_spread = subset[class_columns].std(axis=1).mean() if class_columns else np.nan
    model_spread = subset[model_columns].std(axis=1).mean() if model_columns else np.nan

    # Clip the denominator so an image with ~0 mean mIoU cannot push the ratio to inf.
    coefficient_of_variation = subset['std_mIoU'] / subset['mean_mIoU'].clip(lower=0.01)

    stats_dict = {
        'Label': label,
        'Count': len(subset),
        'Mean mIoU': subset['mean_mIoU'].mean(),
        'Std mIoU': subset['std_mIoU'].mean(),
        'Mean Class Variance': class_spread,
        'Mean Per-Model Variance': model_spread,  # Spread across models
        'Max mIoU': subset['max_mIoU'].mean(),
        'Min mIoU': subset['min_mIoU'].mean(),
        'Uncertainty (Max-Min)': (subset['max_mIoU'] - subset['min_mIoU']).mean(),
        'Consensus': 1 - coefficient_of_variation.mean(),  # Inverse CoV
    }

    return stats_dict

# Compute fingerprints
easy_mask = pivot_models['difficulty_global'] == 'Easy'
medium_mask = pivot_models['difficulty_global'] == 'Medium'
hard_mask = pivot_models['difficulty_global'] == 'Hard'

# pivot_models is pivoted on image_mIoU, so it only has per-class columns if they
# were added upstream. Derive any missing ones from df_raw (mean across models) and
# put them in front, so the summary/metadata columns stay last in the frame.
missing_class_cols = [c for c in class_cols if c not in pivot_models.columns]
if missing_class_cols:
    class_means = (
        df_raw.groupby('image_id')[missing_class_cols].mean().reindex(pivot_models.index)
    )
    fingerprint_input = class_means.join(pivot_models)
else:
    fingerprint_input = pivot_models

cohort_masks = [('Easy', easy_mask), ('Medium', medium_mask), ('Hard', hard_mask)]
fingerprints = [
    compute_statistical_fingerprint(fingerprint_input, mask, cohort_label)
    for cohort_label, mask in cohort_masks
]

fingerprints_df = pd.DataFrame(fingerprints)
print("Statistical Fingerprints of Easy vs Hard Images:\n")
display(fingerprints_df.set_index('Label'))

# Statistical tests: Easy vs Hard
easy_mious = pivot_models[easy_mask]['mean_mIoU']
hard_mious = pivot_models[hard_mask]['mean_mIoU']

# T-test
t_stat, t_pval = stats.ttest_ind(easy_mious, hard_mious)
# Effect size (Cohen's d) using the pooled standard deviation of the two cohorts
pooled_variance = (
    (len(easy_mious) - 1) * easy_mious.std() ** 2 + (len(hard_mious) - 1) * hard_mious.std() ** 2
) / (len(easy_mious) + len(hard_mious) - 2)
cohens_d = (easy_mious.mean() - hard_mious.mean()) / np.sqrt(pooled_variance)

print(f"\n\nStatistical Tests: Easy vs Hard")
print(f"  T-statistic: {t_stat:.4f}, p-value: {t_pval:.2e}")
print(f"  Cohen's d (effect size): {cohens_d:.4f}")

# Visualize
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Distribution
axes[0, 0].hist(easy_mious, bins=20, alpha=0.6, label='Easy', color='green')
axes[0, 0].hist(hard_mious, bins=20, alpha=0.6, label='Hard', color='red')
axes[0, 0].set_xlabel('Mean mIoU')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Distribution: Easy vs Hard')
axes[0, 0].legend()

# Box plot
box_data = [easy_mious, hard_mious]
axes[0, 1].boxplot(box_data, labels=['Easy', 'Hard'])
axes[0, 1].set_ylabel('Mean mIoU')
axes[0, 1].set_title('mIoU Distribution: Box Plot')
axes[0, 1].grid(True, alpha=0.3)

# Uncertainty: best-model minus worst-model mIoU per image
easy_uncertainty = pivot_models[easy_mask]['max_mIoU'] - pivot_models[easy_mask]['min_mIoU']
hard_uncertainty = pivot_models[hard_mask]['max_mIoU'] - pivot_models[hard_mask]['min_mIoU']

axes[1, 0].scatter(pivot_models[easy_mask]['mean_mIoU'], easy_uncertainty,
                   alpha=0.5, label='Easy', color='green', s=50)
axes[1, 0].scatter(pivot_models[hard_mask]['mean_mIoU'], hard_uncertainty,
                   alpha=0.5, label='Hard', color='red', s=50)
axes[1, 0].set_xlabel('Mean mIoU')
axes[1, 0].set_ylabel('Model Uncertainty (Max-Min mIoU)')
axes[1, 0].set_title('Performance vs Uncertainty')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Variance across classes (per-image spread of the model-averaged class scores)
easy_class_var = fingerprint_input.loc[easy_mask, class_cols].std(axis=1)
hard_class_var = fingerprint_input.loc[hard_mask, class_cols].std(axis=1)

axes[1, 1].hist(easy_class_var, bins=20, alpha=0.6, label='Easy', color='green')
axes[1, 1].hist(hard_class_var, bins=20, alpha=0.6, label='Hard', color='red')
axes[1, 1].set_xlabel('Variance Across Classes')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Class-wise Performance Variance')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

print("✓ Statistical fingerprinting complete")

## Section 3: Statistical Fingerprinting of Easy vs Hard Images

Compute descriptive statistics comparing easy vs hard image cohorts to identify statistically significant differentiators.

In [None]:
# Analyze overlaps: which images are easy for all classes vs. hard for specific classes
print("Overlap Analysis: Easy/Hard Consistency Across Classes\n")

MAX_OVERLAP_CLASSES = 10        # Look at up to the first 10 classes
UNIVERSAL_FRACTION = 0.5        # "universally" easy/hard: more than half of the analysed classes
CLASS_SPECIFIC_FRACTION = 0.3   # class-specific hardness: more than 30% of the analysed classes

# Collect easy/hard status per image per class (NaN where the image has no score for the class)
analysed_classes = [c for c in class_cols[:MAX_OVERLAP_CLASSES] if c in per_class_analysis]
class_difficulty_matrix = pd.DataFrame(index=pivot_models.index)
for class_name in analysed_classes:
    class_difficulty_matrix[class_name] = per_class_analysis[class_name]['difficulty']

# Count how many classes are easy/hard for each image. All counts are taken from a
# labels-only copy so the helper columns added below never take part in a comparison.
class_labels = class_difficulty_matrix[analysed_classes]
class_difficulty_matrix['num_easy'] = (class_labels == 'Easy').sum(axis=1)
class_difficulty_matrix['num_hard'] = (class_labels == 'Hard').sum(axis=1)
class_difficulty_matrix['num_classes'] = class_labels.count(axis=1)

# Merge back to main dataframe. Dropping any previous copy of the count columns
# first makes the cell re-runnable (a plain join raises on overlapping names).
count_cols = ['num_easy', 'num_hard', 'num_classes']
pivot_models = (
    pivot_models
    .drop(columns=count_cols, errors='ignore')
    .join(class_difficulty_matrix[count_cols])
)

print("How many classes are Easy/Hard per image?")
print("\nEasy classes per image:")
print(pivot_models['num_easy'].describe())
print("\nHard classes per image:")
print(pivot_models['num_hard'].describe())

# Identify images universally easy vs hard.
# Cut-offs scale with the number of classes actually analysed: with 10 classes they
# are "> 5" and "> 3"; with fewer classes they stay reachable.
n_analysed = len(analysed_classes)
universal_cutoff = UNIVERSAL_FRACTION * n_analysed
class_specific_cutoff = CLASS_SPECIFIC_FRACTION * n_analysed

universally_easy = pivot_models[(pivot_models['difficulty_global'] == 'Easy') &
                                (pivot_models['num_easy'] > universal_cutoff)]
universally_hard = pivot_models[(pivot_models['difficulty_global'] == 'Hard') &
                                (pivot_models['num_hard'] > universal_cutoff)]
class_specific_hard = pivot_models[(pivot_models['num_hard'] > class_specific_cutoff) &
                                   (pivot_models['difficulty_global'] != 'Hard')]

print(f"\nImages universally easy: {len(universally_easy)}")
print(f"Images universally hard: {len(universally_hard)}")
print(f"Images with class-specific hardness: {len(class_specific_hard)}")

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].hist(pivot_models['num_easy'], bins=15, alpha=0.6, label='Easy', color='green')
axes[0].hist(pivot_models['num_hard'], bins=15, alpha=0.6, label='Hard', color='red')
axes[0].set_xlabel('Number of Classes')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Easy/Hard Classes per Image')
axes[0].legend()

axes[1].scatter(pivot_models['num_easy'], pivot_models['num_hard'], alpha=0.5)
axes[1].set_xlabel('Number of Easy Classes')
axes[1].set_ylabel('Number of Hard Classes')
axes[1].set_title('Easy vs Hard Classes: Scatter Plot')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✓ Overlap analysis complete")

In [None]:
# Create easy/hard classifications based on percentiles
EASY_THRESHOLD = 0.75  # 75th percentile = easy
HARD_THRESHOLD = 0.25  # 25th percentile = hard
DIFFICULTY_LEVELS = ['Hard', 'Medium', 'Easy']  # ordered from lowest to highest score
N_CLASSES_ANALYSED = 5  # Start with first 5 for demonstration


def classify_by_quantile(scores, hard_q=HARD_THRESHOLD, easy_q=EASY_THRESHOLD):
    """Label each score Hard / Medium / Easy relative to the quantiles of ``scores``.

    Scores at or below the ``hard_q`` quantile are 'Hard', scores above it and at or
    below the ``easy_q`` quantile are 'Medium', everything higher is 'Easy'; NaN
    scores stay NaN. The boundaries are the same right-closed intervals as
    pd.cut(bins=[0, q_hard, q_easy, 1]), but plain comparisons do not raise when a
    quantile equals 0, equals 1, or ties with the other quantile.

    Returns an ordered categorical Series aligned to ``scores.index``.
    """
    hard_cut = scores.quantile(hard_q)
    easy_cut = scores.quantile(easy_q)

    labels = pd.Series('Easy', index=scores.index, dtype=object)
    # Later assignments win, so apply the wider interval first.
    labels[scores <= easy_cut] = 'Medium'
    labels[scores <= hard_cut] = 'Hard'
    labels[scores.isna()] = np.nan
    return labels.astype(pd.CategoricalDtype(DIFFICULTY_LEVELS, ordered=True))


pivot_models['difficulty_global'] = classify_by_quantile(pivot_models['mean_mIoU'])

print("Global Easy/Hard Distribution:")
print(pivot_models['difficulty_global'].value_counts().sort_index())
print(f"\nEasy threshold (75th pct): {pivot_models['mean_mIoU'].quantile(EASY_THRESHOLD):.4f}")
print(f"Hard threshold (25th pct): {pivot_models['mean_mIoU'].quantile(HARD_THRESHOLD):.4f}")

# Per-class analysis
per_class_analysis = {}

for class_name in class_cols[:N_CLASSES_ANALYSED]:
    # Rows without a score for this class are dropped, so the resulting frame only
    # lists images for which at least one model reported the class.
    class_data = df_raw[['image_id', 'model', class_name]].dropna()

    if class_data.empty:
        continue

    # Pivot: index=image_id, columns=models
    per_model_iou = class_data.pivot(index='image_id', columns='model', values=class_name)
    class_pivot = per_model_iou.copy()
    # Both statistics are taken from the model columns only; computing std on
    # class_pivot after adding mean_iou would count the mean as an extra model.
    class_pivot['mean_iou'] = per_model_iou.mean(axis=1)
    class_pivot['std_iou'] = per_model_iou.std(axis=1)

    # Classify
    class_pivot['difficulty'] = classify_by_quantile(class_pivot['mean_iou'])

    per_class_analysis[class_name] = class_pivot

    print(f"\n{class_name}:")
    print(f"  Images: {len(class_pivot)}, Mean IoU: {class_pivot['mean_iou'].mean():.4f}")
    print(f"  Distribution:\n{class_pivot['difficulty'].value_counts().sort_index()}")

print("\n✓ Per-class categorization complete")

## Section 2: Define Easy/Hard Image Categories by Class

In [None]:
# Preprocess and calculate mIoU per image
# 'image_mIoU' is created below; excluding it here keeps class_cols identical when
# this cell is run a second time.
NON_CLASS_COLS = ['image_id', 'city', 'model', 'image_mIoU']
class_cols = [c for c in df_raw.columns if c not in NON_CLASS_COLS]
print(f"Classes identified: {len(class_cols)} classes")

# Calculate per-image mIoU (mean across classes)
df_raw['image_mIoU'] = df_raw[class_cols].mean(axis=1)

# Pivot to get one row per image with models as columns
model_scores = df_raw.pivot(index='image_id', columns='model', values='image_mIoU')
model_cols = model_scores.columns.tolist()

# Per-class scores averaged over models, one row per image. Later cells index
# pivot_models[class_cols], so these columns must live in pivot_models; they go in
# front so the summary and metadata columns stay at the end of the frame.
class_means = df_raw.groupby('image_id')[class_cols].mean()
pivot_models = class_means.join(model_scores)

# Every summary statistic is computed from the model columns only. Computing them on
# the growing frame would fold mean/std into the later std/min/max.
pivot_models['mean_mIoU'] = model_scores.mean(axis=1)
pivot_models['std_mIoU'] = model_scores.std(axis=1)
pivot_models['min_mIoU'] = model_scores.min(axis=1)
pivot_models['max_mIoU'] = model_scores.max(axis=1)

# Extract metadata from image_id (text before the first underscore)
pivot_models['city'] = pivot_models.index.str.split('_').str[0]

print(f"\nDataset Summary:")
print(f"  Total images: {len(pivot_models)}")
print(f"  Mean mIoU: {pivot_models['mean_mIoU'].mean():.4f} ± {pivot_models['mean_mIoU'].std():.4f}")
print(f"  Range: [{pivot_models['mean_mIoU'].min():.4f}, {pivot_models['mean_mIoU'].max():.4f}]")
print(f"  Cities: {pivot_models['city'].nunique()}")

display(pivot_models.head())

In [None]:
# Resolve the dataset root: Google Drive when running in Colab, a relative folder otherwise
try:
    from google.colab import drive
    drive.mount("/content/drive")
except ImportError:
    _in_colab = False
else:
    _in_colab = True

if _in_colab:
    CITYSCAPES_ROOT = Path("/content/drive/MyDrive/UCLA/Datasets/cityscapes")
else:
    CITYSCAPES_ROOT = Path("data/cityscapes")
    print("Running locally (not Colab)")

# Per-model benchmark CSVs are read from this sub-folder
RESULTS_DIR = CITYSCAPES_ROOT / "benchmark_results"
print(f"Results directory: {RESULTS_DIR}")

# Load benchmark results
def load_benchmark_results(results_dir: Path) -> pd.DataFrame:
    """Load all *_per_image_iou.csv files.

    Each matching file in ``results_dir`` is read into a frame and tagged with a
    'model' column derived from the file name (the '_per_image_iou.csv' suffix and
    any 'Wrapper' substring are removed). Files are read in sorted path order so
    the row order of the result does not depend on the filesystem.

    Returns the concatenated frame with a fresh 0..n-1 index, or an empty
    DataFrame (after printing a notice) when no file matches.
    """
    # glob yields entries in arbitrary order; sort for a reproducible result.
    all_files = sorted(results_dir.glob("*_per_image_iou.csv"))

    if not all_files:
        print(f"No result files found in {results_dir}")
        return pd.DataFrame()

    dfs = []
    for f in all_files:
        model_name = f.name.replace("_per_image_iou.csv", "").replace("Wrapper", "")
        df = pd.read_csv(f)
        df["model"] = model_name
        dfs.append(df)
        print(f"✓ Loaded {len(df)} images for model: {model_name}")

    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

# One row per (image, model) pair, concatenated over every result file found
df_raw = load_benchmark_results(RESULTS_DIR)
leading_columns = df_raw.columns.tolist()[:5]
print(f"\nTotal records: {len(df_raw)}")
print(f"Columns: {leading_columns}...")
display(df_raw.head())

## Section 1: Load and Prepare Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
from scipy.spatial.distance import pdist, squareform
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation and dtype warnings raised by pandas / matplotlib.
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in the notebook
sns.set_theme(style="whitegrid")
plt.rcParams.update({
    "figure.figsize": (14, 6),
    "font.size": 10,
})

print("Libraries imported successfully!")

# Advanced Diagnostic Analysis: Easy vs Hard Images
## Finding Associated Attributes of Image Difficulty

This notebook extends the Cross_Model_Analysis work by conducting deep diagnostic research to identify **which attributes distinguish easy (universally well-segmented) from hard (universally poorly-segmented) images** across multiple segmentation models and per-category analysis.

### Key Questions:
1. **Statistical Signatures**: What statistical patterns differentiate easy vs hard images?
2. **Metadata Patterns**: Are certain cities/seasons/conditions associated with difficulty?
3. **Per-Class Dynamics**: Which classes are hard to segment in hard images?
4. **Feature Correlation**: Which image properties predict segmentation difficulty?
5. **Visual Patterns**: Can we identify visual patterns in easy vs hard cohorts?