In [None]:
%pip install matplotlib numpy

# Geo Analysis Notebook
Organized for Colab into sections: data loading, calculations, display helpers, file utilities, and an analysis runner.

In [None]:
import json
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

## Loading Operations

In [None]:
def load_json(path: Path):
    """Load and parse a JSON file.

    Args:
        path: Location of the JSON file (anything ``open`` accepts).

    Returns:
        The decoded JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; being explicit avoids depending on the
    # platform's locale encoding when the file contains non-ASCII text.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)

## Calculation Operations

In [None]:
def get_geos(summary):
    """Return the entries of ``summary["polygons"]`` whose ``"class"`` is 1.

    The original order of the polygons is preserved.
    """
    geos = []
    for entry in summary["polygons"]:
        if entry["class"] == 1:
            geos.append(entry)
    return geos

def get_geos_sizes(geos):
    """Build a compact record (index, size, image shape, files) per geo.

    Each returned dict has the keys ``"index"``, ``"boxmeters"``, ``"shape"``
    and ``"files"``, taken from the geo's ``"polygon_index"``,
    ``"bbox_size_meters"``, ``"image_shape"`` and ``"files"`` entries.
    """
    # (output key, source key) pairs, in the order the record is built.
    field_map = (
        ("index", "polygon_index"),
        ("boxmeters", "bbox_size_meters"),
        ("shape", "image_shape"),
        ("files", "files"),
    )
    records = []
    for polygon in geos:
        records.append({out_key: polygon[src_key] for out_key, src_key in field_map})
    return records

def extract_metrics(geos_sizes):
    """Collect the bounding-box widths, heights and areas of the geos.

    Returns:
        A ``(widths, heights, areas)`` tuple of lists, read from each record's
        ``"boxmeters"`` entry (``"width_m"``, ``"height_m"``, ``"area_m2"``).
    """
    def column(key):
        # One list per metric, in the same order as ``geos_sizes``.
        return [record["boxmeters"][key] for record in geos_sizes]

    return column("width_m"), column("height_m"), column("area_m2")

def _get_axis_limits_with_padding(data_list, padding=0.05):
    """Return min/max limits with padding for consistent scaling."""
    flat_data = [val for sublist in data_list for val in sublist]
    data_min, data_max = min(flat_data), max(flat_data)
    padding_val = (data_max - data_min) * padding
    return data_min - padding_val, data_max + padding_val

def _remove_outliers_iqr(data, multiplier=1.5):
    """Remove outliers using the IQR method."""
    q25, q75 = np.percentile(data, [25, 75])
    iqr = q75 - q25
    lower_bound = q25 - multiplier * iqr
    upper_bound = q75 + multiplier * iqr
    return [x for x in data if lower_bound <= x <= upper_bound]

def calculate_pixel_scale(geos_sizes):
    """Calculate the mean meters-per-pixel scale over the geos.

    For each geo the scale is the average of its per-axis ratios
    (``width_m / width`` and ``height_m / height``). An axis whose pixel size
    is not positive is left out instead of being counted as a scale of 0,
    which would otherwise pull the average down; a geo with no usable axis is
    skipped entirely.

    Args:
        geos_sizes: Records with ``"boxmeters"`` (``"width_m"``, ``"height_m"``)
            and ``"shape"`` (``"width"``, ``"height"``) entries.

    Returns:
        The mean scale, or ``0`` when no geo provides a usable scale.
    """
    scales = []
    for geo in geos_sizes:
        bbox = geo["boxmeters"]
        shape = geo["shape"]
        axis_pairs = (
            (bbox["width_m"], shape["width"]),
            (bbox["height_m"], shape["height"]),
        )
        axis_scales = [meters / pixels for meters, pixels in axis_pairs if pixels > 0]
        if axis_scales:
            scales.append(sum(axis_scales) / len(axis_scales))

    return np.mean(scales) if scales else 0

## Display Operations

In [None]:
def print_geos_statistics(geos_sizes_list, dataset_names):
    """Print mean dimensions (outliers removed) and suggested window sizes.

    For every dataset the width/height/area averages are computed after IQR
    outlier removal, while min/max are taken over all values. The suggested
    sliding window is the larger of the rounded-up mean width and height, also
    converted to pixels using the dataset's mean pixel scale.

    Args:
        geos_sizes_list: One list of geo size records per dataset.
        dataset_names: Display name for each dataset, in the same order.
    """
    print("\n" + "=" * 90)
    print("GEO DIMENSIONS STATISTICS (for sliding window determination)")
    print("=" * 90)

    for geos_sizes, name in zip(geos_sizes_list, dataset_names):
        widths, heights, areas = extract_metrics(geos_sizes)

        if not widths:
            # Means, minima and maxima are undefined for an empty dataset;
            # report it instead of failing part-way through the listing.
            print(f"\n{name}:")
            print("  Count:           0 (no geos found, statistics skipped)")
            continue

        widths_no_outliers = _remove_outliers_iqr(widths)
        heights_no_outliers = _remove_outliers_iqr(heights)
        areas_no_outliers = _remove_outliers_iqr(areas)

        width_mean = np.mean(widths_no_outliers)
        height_mean = np.mean(heights_no_outliers)

        scale = calculate_pixel_scale(geos_sizes)

        # Square window large enough for the average geo in either direction.
        window_size_m = max(int(np.ceil(width_mean)), int(np.ceil(height_mean)))
        window_size_px = int(np.ceil(window_size_m / scale)) if scale > 0 else 0

        print(f"\n{name}:")
        print(f"  Count:           {len(widths)} (outliers removed from mean calculation)")
        print(f"  Width:  avg={width_mean:.1f}m  min={min(widths):.1f}m  max={max(widths):.1f}m")
        print(f"  Height: avg={height_mean:.1f}m  min={min(heights):.1f}m  max={max(heights):.1f}m")
        print(f"  Area:   avg={np.mean(areas_no_outliers):.1f}m²  min={min(areas):.1f}m²  max={max(areas):.1f}m²")
        print(f"  Pixel scale: {scale:.4f} m/pixel")
        print(f"  Suggested sliding window: {window_size_m}m x {window_size_m}m ({window_size_px}px x {window_size_px}px)")

    print("\n" + "=" * 90)

def plot_geos_statistics(geos_sizes_list, dataset_names):
    """Display a text panel of geo statistics for each dataset.

    Each panel shows the count, the outlier-free averages plus overall min/max
    of width, height and area, the mean pixel scale and the suggested sliding
    window size. A dataset without geos gets a short "no geos" panel instead.

    Args:
        geos_sizes_list: One list of geo size records per dataset.
        dataset_names: Display name for each dataset, in the same order.

    Returns:
        The matplotlib figure holding one panel per dataset.
    """
    num_datasets = len(geos_sizes_list)
    # squeeze=False always yields a 2-D axes array, so a single dataset needs
    # no special-casing.
    fig, axes = plt.subplots(1, num_datasets, figsize=(18, 8), squeeze=False)
    fig.suptitle("Geo Dimensions Statistics (for Sliding Window Determination)", fontsize=16, fontweight="bold")

    for ax, geos_sizes, name in zip(axes[0], geos_sizes_list, dataset_names):
        widths, heights, areas = extract_metrics(geos_sizes)

        if widths:
            widths_no_outliers = _remove_outliers_iqr(widths)
            heights_no_outliers = _remove_outliers_iqr(heights)
            areas_no_outliers = _remove_outliers_iqr(areas)

            width_mean = np.mean(widths_no_outliers)
            height_mean = np.mean(heights_no_outliers)

            scale = calculate_pixel_scale(geos_sizes)
            # Square window large enough for the average geo in either direction.
            window_size_m = max(int(np.ceil(width_mean)), int(np.ceil(height_mean)))
            window_size_px = int(np.ceil(window_size_m / scale)) if scale > 0 else 0

            stats_text = f"{name}\n"
            stats_text += f"{'=' * 30}\n\n"
            stats_text += f"Count: {len(widths)}\n"
            stats_text += "(Outliers excluded)\n\n"
            stats_text += f"Width (m):\n  Avg: {width_mean:.1f}\n  Min: {min(widths):.1f}\n  Max: {max(widths):.1f}\n\n"
            stats_text += f"Height (m):\n  Avg: {height_mean:.1f}\n  Min: {min(heights):.1f}\n  Max: {max(heights):.1f}\n\n"
            stats_text += f"Area (m²):\n  Avg: {np.mean(areas_no_outliers):.1f}\n  Min: {min(areas):.1f}\n  Max: {max(areas):.1f}\n\n"
            stats_text += f"Pixel Scale:\n  {scale:.4f} m/px\n\n"
            stats_text += f"Window Size:\n  {window_size_m}m × {window_size_m}m\n  {window_size_px}px × {window_size_px}px"
        else:
            # Means, minima and maxima are undefined for an empty dataset.
            stats_text = f"{name}\n{'=' * 30}\n\nCount: 0\n(No geos found)"

        ax.axis("off")
        ax.text(
            0.5,
            0.5,
            stats_text,
            fontsize=10,
            family="monospace",
            verticalalignment="center",
            horizontalalignment="center",
            bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.3, pad=1.5),
        )

    plt.tight_layout()
    return fig

def plot_distributions(widths, heights, areas, title="Distribution of Geo Dimensions"):
    """Draw side-by-side box plots of widths, heights and areas.

    Returns:
        The matplotlib figure containing the three panels.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(title, fontsize=16)

    series = (widths, heights, areas)
    labels = ("Width (meters)", "Height (meters)", "Area (m²)")

    # One panel per metric, left to right.
    for panel, values, label in zip(axes, series, labels):
        panel.boxplot(values)
        panel.set_ylabel(label)
        panel.set_title(f"{label} Distribution")
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

def plot_histograms(widths, heights, areas, title="Distribution of Geo Dimensions", bins=20):
    """Draw side-by-side histograms of widths, heights and areas.

    Returns:
        The matplotlib figure containing the three panels.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(title, fontsize=16)

    # (values, axis label, bar colour) for each panel, left to right.
    panels = (
        (widths, "Width (meters)", "skyblue"),
        (heights, "Height (meters)", "lightgreen"),
        (areas, "Area (m²)", "lightcoral"),
    )

    for panel, (values, label, colour) in zip(axes, panels):
        panel.hist(values, bins=bins, color=colour, edgecolor="black", alpha=0.7)
        panel.set_xlabel(label)
        panel.set_ylabel("Frequency")
        panel.set_title(f"{label} Distribution")
        panel.grid(True, alpha=0.3, axis="y")

    plt.tight_layout()
    return fig

def plot_combined_comparison(datasets, dataset_names, title="Comparison of Geo Dimensions Across Datasets"):
    """Box plots comparing widths, heights and areas across datasets.

    Each of the three panels shows one box per dataset, with y-limits derived
    from the pooled values of all datasets for that metric.

    Args:
        datasets: One list of geo size records per dataset.
        dataset_names: Tick label for each dataset, in the same order.
        title: Figure title.

    Returns:
        The matplotlib figure containing the three panels.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(title, fontsize=16)

    # Extract (widths, heights, areas) once per dataset, then regroup per metric.
    per_dataset = [extract_metrics(ds) for ds in datasets]
    labels = ("Width (meters)", "Height (meters)", "Area (m²)")

    for idx, (ax, label) in enumerate(zip(axes, labels)):
        data = [metrics[idx] for metrics in per_dataset]
        y_min, y_max = _get_axis_limits_with_padding(data)

        ax.boxplot(data)
        # Label the boxes via the tick labels rather than boxplot's ``labels``
        # keyword, which is deprecated in recent Matplotlib releases.
        ax.set_xticklabels(dataset_names)
        ax.set_ylabel(label)
        ax.set_title(f"{label} by Dataset")
        ax.set_ylim(y_min, y_max)
        ax.grid(True, alpha=0.3, axis="y")

    plt.tight_layout()
    return fig

def plot_single_figure_all_datasets(all_data, dataset_names, title="Distribution Comparison", bins=15):
    """Grid of histograms: one row per dataset, one column per metric.

    Columns are width, height and area. Every column uses the same x-limits
    in all rows, derived from the pooled values of all datasets.

    Returns:
        The matplotlib figure containing the grid.
    """
    num_datasets = len(all_data)
    # squeeze=False keeps the axes 2-D even when there is a single row.
    fig, axes = plt.subplots(num_datasets, 3, figsize=(18, 5 * num_datasets), squeeze=False)
    fig.suptitle(title, fontsize=16)

    colors = ["skyblue", "lightgreen", "lightcoral"]
    kinds = ("Width", "Height", "Area")
    xlabels = ("Width (meters)", "Height (meters)", "Area (m²)")

    # (widths, heights, areas) for each dataset.
    per_dataset = [extract_metrics(data) for data in all_data]

    # Shared x-limits per column, from the values pooled over all datasets.
    limits = []
    for col in range(3):
        pooled = [value for metrics in per_dataset for value in metrics[col]]
        limits.append(_get_axis_limits_with_padding([pooled]))

    for row, (metrics, dataset_name) in enumerate(zip(per_dataset, dataset_names)):
        for col in range(3):
            x_min, x_max = limits[col]
            cell = axes[row, col]
            cell.hist(metrics[col], bins=bins, color=colors[col], edgecolor="black", alpha=0.7)
            cell.set_xlabel(xlabels[col])
            cell.set_ylabel("Frequency")
            cell.set_title(f"{dataset_name} - {kinds[col]} Distribution")
            cell.set_xlim(x_min, x_max)
            cell.grid(True, alpha=0.3, axis="y")

    plt.tight_layout()
    return fig

## File Operations

In [None]:
def copy_geos_files_by_area(summary_json_path, source_dir, output_base_dir, area_name):
    """Copy class-1 geo images and metadata into an area-specific folder.

    Files listed under each geo's ``"files"`` entry, plus its
    ``geoglif_<index>_metadata.json`` file, are copied from ``source_dir`` to
    ``<output_base_dir>/<area_name>`` when they exist. A geo is counted as
    copied only if at least one of its files was actually copied; geos with no
    files present are reported as skipped.

    Args:
        summary_json_path: Path to the summary JSON describing the polygons.
        source_dir: Directory holding the files to copy.
        output_base_dir: Base directory under which the area folder is created.
        area_name: Name of the area sub-folder.
    """
    summary = load_json(summary_json_path)
    geos = get_geos(summary)

    output_dir = Path(output_base_dir) / area_name
    output_dir.mkdir(parents=True, exist_ok=True)
    source_dir = Path(source_dir)

    copied_count = 0
    skipped_indices = []
    for geo in geos:
        polygon_index = geo["polygon_index"]

        try:
            files_copied = 0
            for filename in geo["files"].values():
                source_file = source_dir / filename
                if source_file.exists():
                    shutil.copy2(source_file, output_dir / filename)
                    files_copied += 1

            metadata_name = f"geoglif_{polygon_index:04d}_metadata.json"
            metadata_file = source_dir / metadata_name
            if metadata_file.exists():
                shutil.copy2(metadata_file, output_dir / metadata_name)
                files_copied += 1
        except Exception as exc:
            # Best effort: report the failure and carry on with the next geo.
            print(f"Error copying polygon {polygon_index}: {exc}")
            continue

        if files_copied:
            copied_count += 1
        else:
            # Nothing existed on disk for this geo, so it was not copied.
            skipped_indices.append(polygon_index)

    print(f"Copied {copied_count} geos to {output_dir}")
    if skipped_indices:
        print(f"Skipped {len(skipped_indices)} geos with no files found in {source_dir}: {skipped_indices}")

## Analysis Operations

In [None]:
def run_analysis(summaryjson_paths=("data/unita/summary.json", "data/chug/summary.json", "data/lluta/summary.json"), dataset_labels=("UNITA", "CHUG", "LLUTA")):
    """Load summaries, compute sizes, print stats, and plot figures.

    Args:
        summaryjson_paths: Paths of the summary JSON files, one per dataset.
        dataset_labels: Display label for each dataset, in the same order.

    Returns:
        A ``(figs, geos_sizes_list)`` tuple: ``figs`` maps ``"stats"``,
        ``"single_fig_all"`` and ``"combined"`` to the created figures, and
        ``geos_sizes_list`` holds the geo size records of each dataset.

    Raises:
        ValueError: If the number of paths and labels differ.
    """
    summaryjson_paths = list(summaryjson_paths)
    dataset_labels = list(dataset_labels)
    # Fail fast: a mismatch would otherwise be silently truncated by zip() in
    # the statistics helpers and only surface later while plotting.
    if len(summaryjson_paths) != len(dataset_labels):
        raise ValueError(
            f"Expected one label per summary path, got {len(summaryjson_paths)} paths "
            f"and {len(dataset_labels)} labels"
        )

    summaries = [load_json(path) for path in summaryjson_paths]
    geos_list = [get_geos(summary) for summary in summaries]
    geos_sizes_list = [get_geos_sizes(geos) for geos in geos_list]

    print_geos_statistics(geos_sizes_list, dataset_labels)

    figs = {}
    figs["stats"] = plot_geos_statistics(geos_sizes_list, dataset_labels)
    figs["single_fig_all"] = plot_single_figure_all_datasets(
        geos_sizes_list, dataset_labels, "Geo Dimensions Distribution - All Datasets"
    )
    figs["combined"] = plot_combined_comparison(geos_sizes_list, dataset_labels)

    plt.show()
    return figs, geos_sizes_list

In [None]:
# Execute analysis with default datasets.
# Bind the results so the figures and size records stay available to later
# cells and the cell does not echo the full returned tuple as its output.
figs, geos_sizes_list = run_analysis()