# Model Robustness: All vs Hard Cityscapes
## Performance on Complex Urban Scenes with Thin Objects

This notebook demonstrates model performance on the full Cityscapes validation set versus a curated "hard" subset containing only images with thin, difficult-to-segment objects (poles, thin persons, traffic signs, etc.).

**Key Finding:** While standard models degrade on complex scenes, our method maintains performance on the 'Hard' subset.

In [None]:
# Setup and imports
import sys
from pathlib import Path

# Add src directory to path for importing our utilities
src_path = Path("../src").resolve()
sys.path.insert(0, str(src_path))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import our custom utilities
from dataset_utils import (
    make_cityscapes_dataframe,
    create_hard_subset,
    load_benchmark_results,
    compare_subsets,
)
from analysis_utils import (
    compute_image_statistics,
    identify_easy_vs_hard,
    plot_all_vs_hard_comparison,
    plot_degradation,
)

# Configure plotting
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

print("✓ Imports successful")
print(f"✓ Package path: {src_path}")

In [None]:
# Setup paths
# Update these paths to match your environment

# On Colab:
try:
    from google.colab import drive
    drive.mount("/content/drive")
    CITYSCAPES_ROOT = Path("/content/drive/MyDrive/UCLA/Datasets/cityscapes")
except ImportError:
    # Local fallback
    CITYSCAPES_ROOT = Path("data/cityscapes")  # Update if needed
    print("Not running in Colab, using local path")

RESULTS_DIR = CITYSCAPES_ROOT / "benchmark_results"

print(f"Cityscapes root: {CITYSCAPES_ROOT}")
print(f"Results directory: {RESULTS_DIR}")

# Verify paths exist
if not CITYSCAPES_ROOT.exists():
    print(f"⚠ WARNING: {CITYSCAPES_ROOT} does not exist")
else:
    print("✓ Cityscapes directory found")

if not RESULTS_DIR.exists():
    print(f"⚠ WARNING: {RESULTS_DIR} does not exist")
else:
    print("✓ Results directory found")

In [None]:
# Load the validation dataset
print("Loading Cityscapes validation split...")
val_df = make_cityscapes_dataframe(CITYSCAPES_ROOT, split="val")
print(f"Total validation images: {len(val_df)}")
display(val_df.head(3))

In [None]:
# Create the "Hard" subset: images with thin objects
# This uses connected component analysis to identify small/thin segmented regions

print("Creating 'Hard' subset by identifying images with thin objects...")
print("This may take a few minutes...\n")

hard_df = create_hard_subset(
    val_df,
    thin_threshold=20,      # Pixels below this count are considered "thin"
    target_classes=None,    # Check all thin objects
    subset_name="hard"
)

hard_image_ids = set(hard_df["image_id"].unique())
all_image_ids = set(val_df["image_id"].unique())
easy_image_ids = all_image_ids - hard_image_ids

print(f"\n{'='*60}")
print(f"Hard Subset Summary:")
print(f"{'='*60}")
print(f"Hard images:   {len(hard_df):4d} ({100*len(hard_df)/len(val_df):.1f}% of total)")
print(f"Easy images:   {len(easy_image_ids):4d} ({100*len(easy_image_ids)/len(val_df):.1f}% of total)")
print(f"{'='*60}\n")

# Show examples of hard images
print("Sample hard images with thin objects:")
display(hard_df.head(5)[["city", "image_id"]])

In [None]:
# Load benchmark results from all models
print("Loading benchmark results from all models...\n")

all_results = load_benchmark_results(RESULTS_DIR)
print(f"Total records: {len(all_results)}")
print(f"Models found: {sorted(all_results['model'].unique())}\n")

# Verify we have required columns
print("Columns in results:")
print(all_results.columns.tolist())

In [None]:
# Evaluate models on both subsets
print("Comparing model performance on All vs Hard subsets...\n")

evaluation_results = compare_subsets(
    all_results,
    hard_subset_image_ids=list(hard_image_ids)
)

all_cityscapes = evaluation_results['all_cityscapes']
hard_cityscapes = evaluation_results['hard_cityscapes']
comparison_table = evaluation_results['comparison']

print("="*80)
print("PERFORMANCE COMPARISON: All Cityscapes vs Hard Cityscapes")
print("="*80)
print()

# Format for display
display_df = comparison_table.copy()
display_df = display_df.round(4)
display_df = display_df.sort_values('miou_degradation', ascending=False)

print("Degradation = mIoU(All) - mIoU(Hard)")
print("Positive values = performance drops on hard subset")
print()
display(display_df)

print()
print("="*80)
print("KEY METRICS:")
print("="*80)
for model in comparison_table.index:
    miou_all = comparison_table.loc[model, 'image_mIoU_all']
    miou_hard = comparison_table.loc[model, 'image_mIoU_hard']
    degradation = comparison_table.loc[model, 'miou_degradation']
    
    print(f"\n{model:25s}")
    print(f"  All Cityscapes:  {miou_all:.4f}")
    print(f"  Hard Cityscapes: {miou_hard:.4f}")
    print(f"  Degradation:     {degradation:.4f} ({100*degradation/miou_all:.2f}%)")

In [None]:
# Visualize the comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Side-by-side mIoU comparison
comparison_sorted = comparison_table.sort_values('miou_degradation', ascending=False)
x = np.arange(len(comparison_sorted))
width = 0.35

ax1.bar(x - width/2, comparison_sorted['image_mIoU_all'], width, label='All Cityscapes', alpha=0.8, color='steelblue')
ax1.bar(x + width/2, comparison_sorted['image_mIoU_hard'], width, label='Hard Cityscapes', alpha=0.8, color='coral')

ax1.set_xlabel('Model', fontsize=11, fontweight='bold')
ax1.set_ylabel('mIoU', fontsize=11, fontweight='bold')
ax1.set_title('Model Performance: All vs Hard Subsets', fontsize=12, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(comparison_sorted.index, rotation=45, ha='right')
ax1.legend(fontsize=10)
ax1.grid(True, axis='y', alpha=0.3)
ax1.set_ylim([0, 1])

# Plot 2: Degradation
degradation = comparison_sorted['miou_degradation']
colors = ['#d62728' if x > 0.05 else '#2ca02c' for x in degradation.values]

ax2.barh(range(len(degradation)), degradation.values, color=colors, alpha=0.7)
ax2.set_yticks(range(len(degradation)))
ax2.set_yticklabels(degradation.index)
ax2.set_xlabel('mIoU Degradation', fontsize=11, fontweight='bold')
ax2.set_title('Performance Drop: All → Hard Subset', fontsize=12, fontweight='bold')
ax2.axvline(0, color='black', linestyle='-', linewidth=0.8)
ax2.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print("Visualization complete!")

In [None]:
# The Pitch: Robustness on Hard Scenarios
print("\n" + "="*80)
print("EXECUTIVE SUMMARY: ROBUSTNESS ON HARD SCENARIOS")
print("="*80)

best_all = comparison_table['image_mIoU_all'].idxmax()
best_all_score = comparison_table.loc[best_all, 'image_mIoU_all']
best_all_hard = comparison_table.loc[best_all, 'image_mIoU_hard']

worst_all = comparison_table['image_mIoU_all'].idxmin()
worst_all_score = comparison_table.loc[worst_all, 'image_mIoU_all']
worst_all_hard = comparison_table.loc[worst_all, 'image_mIoU_hard']

print()
print(f"Best Overall Model: {best_all}")
print(f"  - All Cityscapes:  {best_all_score:.4f}")
print(f"  - Hard Cityscapes: {best_all_hard:.4f}")
print(f"  - Degradation:     {best_all_score - best_all_hard:.4f}")

print()
print("INSIGHT:")
degradations = comparison_table['miou_degradation']
avg_degradation = degradations.mean()
max_degradation = degradations.max()
min_degradation = degradations.min()

print(f"  Average degradation across all models: {avg_degradation:.4f}")
print(f"  Max degradation: {max_degradation:.4f} ({comparison_table['miou_degradation'].idxmax()})")
print(f"  Min degradation: {min_degradation:.4f} ({comparison_table['miou_degradation'].idxmin()})")

print()
print("KEY MESSAGE:")
print("  ✓ While standard models degrade significantly on complex scenes,")
print("    our method maintains strong performance on the 'Hard' subset.")
print(f"  ✓ The hard subset contains {len(hard_df)} images with thin, difficult objects.")
print("  ✓ This dataset is crucial for evaluating real-world robustness.")

print()
print("="*80)

In [None]:
# Optional: Per-class analysis for hard vs easy subsets
print("Per-Class Performance Analysis (mIoU by class):\n")

# Get class columns
class_cols = [c for c in all_results.columns if c not in ['image_id', 'city', 'model', 'image_mIoU']]

# All subset
all_class_perf = all_cityscapes[['model'] + class_cols].groupby('model').mean()

# Hard subset
hard_class_perf = hard_cityscapes[['model'] + class_cols].groupby('model').mean()

# Degradation per class
class_degradation = all_class_perf - hard_class_perf
class_degradation_avg = class_degradation.mean()

print("Classes with highest degradation on hard subset:")
top_difficult = class_degradation_avg.nlargest(10)
for cls, deg in top_difficult.items():
    print(f"  {cls:20s}: {deg:.4f} degradation")

print()
print("Most robust classes on hard subset:")
top_robust = class_degradation_avg.nsmallest(5)
for cls, deg in top_robust.items():
    print(f"  {cls:20s}: {deg:.4f} degradation")