# CSB-FOSS: Baseline Pipeline Test

This notebook tests the baseline (ArcGIS-compatible) pipeline on a small area of Tennessee.

## Pipeline Steps
1. Combine Cropland Data Layer (CDL) rasters (8 years)
2. Vectorize combined raster
3. Filter by crop presence (COUNT0/COUNT45)
4. Eliminate small polygons (tiered: 100, 1000, 10000 m²)
5. Simplify polygons (60m tolerance)

In [None]:
import math
import os
import sys
from itertools import islice
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from rasterio.windows import Window

# Add src to path if needed
sys.path.insert(0, str(Path.cwd().parent / 'src'))

from csb_foss.config import CSBConfig, DataPaths, ProcessingParams, OutputPaths
from csb_foss.raster.io import get_cdl_paths_for_years, read_multi_year_stack
from csb_foss.raster.combine import combine_cdl_rasters, encode_year_sequence, calculate_crop_counts

print("Imports successful!")

## 1. Configuration

In [None]:
# Data paths
# The CDL archive location is machine-specific. Set the CSB_CDL_DIR environment
# variable to point at your own copy; when it is unset, the original staging
# share is used so existing setups keep working unchanged.
CDL_DIR = Path(os.environ.get("CSB_CDL_DIR", r"S:\_STAGING\01_RASTER_CORPUS\annual_cdl"))
OUTPUT_DIR = Path("../output/baseline_test")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Parameters: first and last CDL year to include (inclusive range)
START_YEAR = 2017
END_YEAR = 2024

# Test window (small area for quick testing), in pixel offsets/sizes
# Adjust these based on your CDL extent
TEST_WINDOW = Window(col_off=5000, row_off=5000, width=1000, height=1000)

print(f"CDL directory: {CDL_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Years: {START_YEAR}-{END_YEAR}")
print(f"Test window: {TEST_WINDOW}")

## 2. Find CDL Rasters

In [None]:
# Find CDL paths (mapping of year -> raster path)
cdl_paths = get_cdl_paths_for_years(CDL_DIR, START_YEAR, END_YEAR)

# Fail fast with a clear message instead of an obscure error further down
# the pipeline when the directory is wrong or empty.
if not cdl_paths:
    raise FileNotFoundError(
        f"No CDL rasters found for {START_YEAR}-{END_YEAR} in {CDL_DIR}"
    )

# Later cells size things from the number of years, so make gaps visible.
expected_count = END_YEAR - START_YEAR + 1
if len(cdl_paths) != expected_count:
    print(
        f"WARNING: expected {expected_count} CDL rasters for "
        f"{START_YEAR}-{END_YEAR}, found {len(cdl_paths)}"
    )

print(f"Found {len(cdl_paths)} CDL rasters:")
for year, path in sorted(cdl_paths.items()):
    print(f"  {year}: {path.name}")

## 3. Load and Combine CDL Stack

In [None]:
# Load every year's raster, restricted to the test window, into one stack
print("Loading CDL stack...")
stack, years, metadata = read_multi_year_stack(cdl_paths, window=TEST_WINDOW)

# Report what was loaded: array shape, year order, and georeferencing
print(
    f"Stack shape: {stack.shape} (years, height, width)",
    f"Years: {years}",
    f"Transform: {metadata['transform']}",
    f"CRS: {metadata['crs']}",
    sep="\n",
)

In [None]:
# Encode year sequence: one code per pixel plus a code -> per-year values lookup
print("Encoding year sequences...")
coded, lookup = encode_year_sequence(stack)

print(f"Coded raster shape: {coded.shape}")
print(f"Unique signatures: {len(lookup)}")

# Calculate counts (indexed by signature code; each entry unpacks to count0, count45)
counts = calculate_crop_counts(lookup, len(years))

# Show sample: islice takes the first five entries without copying the
# whole lookup into a list first.
print("\nSample signatures (code -> values -> count0, count45):")
for code, values in islice(lookup.items(), 5):
    c0, c45 = counts[code]
    print(f"  {code}: {values} -> COUNT0={c0}, COUNT45={c45}")

In [None]:
# Visualize: one panel per CDL year.
# The grid is sized from the number of years actually loaded (at most 4
# columns), so no year is silently dropped and unused panels are hidden.
# With 8 years this is the same 2 x 4 layout and 16 x 8 figure as before.
n_years = len(years)
n_cols = max(1, min(4, n_years))
n_rows = max(1, math.ceil(n_years / n_cols))

# squeeze=False keeps `axes` two-dimensional even for a single row/column
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False)
panels = axes.ravel()

for i, year in enumerate(years):
    panels[i].imshow(stack[i], cmap='terrain', vmin=0, vmax=255)
    panels[i].set_title(f"CDL {year}")

# Turn the frame off everywhere, including panels with no year to show
for ax in panels:
    ax.axis('off')

plt.suptitle("CDL Time Series")
plt.tight_layout()
plt.show()

# Combined signature
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(coded, cmap='nipy_spectral')
ax.set_title(f"Combined Signatures ({len(lookup)} unique)")
plt.colorbar(im, ax=ax)
plt.show()

## 4. Vectorize Combined Raster

In [None]:
from rasterio.features import shapes
from shapely.geometry import shape
import json

print("Vectorizing...")

# Build a string-keyed attribute table, one entry per signature code,
# holding the per-year values, the year list and both crop counts.
lookup_data = {}
for signature_code, signature_values in lookup.items():
    signature_counts = counts[signature_code]
    lookup_data[str(signature_code)] = {
        "values": list(signature_values),
        "years": years,
        "count0": signature_counts[0],
        "count45": signature_counts[1],
    }

# Trace each connected region of the coded raster into a polygon, keeping
# the raster value that produced it alongside the geometry.
traced_regions = [
    (shape(region_geojson), int(region_value))
    for region_geojson, region_value in shapes(
        coded.astype('int32'), transform=metadata['transform']
    )
]
geometries = [polygon for polygon, _ in traced_regions]
gridcodes = [gridcode for _, gridcode in traced_regions]

# Assemble the polygons into a GeoDataFrame in the raster's CRS
gdf = gpd.GeoDataFrame({"gridcode": gridcodes}, geometry=geometries, crs=metadata['crs'])

print(f"Created {len(gdf)} polygons")

In [None]:
# Attach per-signature attributes to each polygon via its gridcode


def _signature_field(gridcode, field, default):
    """Return `field` from the lookup entry for `gridcode`, or `default` if absent."""
    return lookup_data.get(str(gridcode), {}).get(field, default)


def _cdl_value(gridcode, year_idx):
    """Return the CDL value at position `year_idx` for `gridcode` (None if unknown)."""
    return _signature_field(gridcode, "values", [None] * len(years))[year_idx]


# One cdl_<year> column per year in the stack
for year_idx, year in enumerate(years):
    gdf[f"cdl_{year}"] = gdf["gridcode"].apply(_cdl_value, args=(year_idx,))

# Crop counts default to 0 when a gridcode has no lookup entry
for count_field in ("count0", "count45"):
    gdf[count_field] = gdf["gridcode"].apply(_signature_field, args=(count_field, 0))

# Polygon area in CRS units, taken from the geometry
gdf["shape_area"] = gdf.geometry.area

print(f"Columns: {list(gdf.columns)}")
gdf.head()

## 5. Filter by Crop Presence

In [None]:
from csb_foss.vector.vectorize import filter_by_crop_presence

print(f"Before filter: {len(gdf)} polygons")

# Filter settings collected in one place so they are easy to tweak.
# NOTE(review): exact semantics of these thresholds live in
# filter_by_crop_presence; confirm there before changing them.
crop_filter_options = {
    "min_crop_years": 2,
    "min_area_single_year": 10000,
}
gdf_filtered = filter_by_crop_presence(gdf, **crop_filter_options)

print(
    f"After filter: {len(gdf_filtered)} polygons",
    f"Removed: {len(gdf) - len(gdf_filtered)} non-cropland polygons",
    sep="\n",
)

In [None]:
# Side-by-side maps coloured by count0: every polygon vs. those kept by the filter
fig, axes = plt.subplots(1, 2, figsize=(14, 7))

comparison_panels = (
    (axes[0], gdf, f"All polygons ({len(gdf)})"),
    (axes[1], gdf_filtered, f"Filtered polygons ({len(gdf_filtered)})"),
)
for panel_ax, panel_frame, panel_title in comparison_panels:
    panel_frame.plot(ax=panel_ax, column='count0', cmap='YlGn', legend=True)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 6. Eliminate Small Polygons

In [None]:
from csb_foss.vector.eliminate import tiered_eliminate


def _area_range(frame):
    """Format the min-max polygon area of `frame`, measured from its geometry.

    The area is read from the geometry rather than the `shape_area` column:
    that column was filled in before elimination, so it is only trustworthy
    afterwards if tiered_eliminate refreshes it (not verifiable from here).
    """
    areas = frame.geometry.area
    return f"{areas.min():.1f} - {areas.max():.1f} m²"


print(f"Before elimination: {len(gdf_filtered)} polygons")
print(f"Area range: {_area_range(gdf_filtered)}")

# NOTE(review): the last tier repeats 10000 - presumably a deliberate second
# pass at the largest threshold; confirm against the reference workflow.
gdf_eliminated = tiered_eliminate(
    gdf_filtered,
    thresholds=[100, 1000, 10000, 10000],
    progress=True,
)

print(f"\nAfter elimination: {len(gdf_eliminated)} polygons")
print(f"Area range: {_area_range(gdf_eliminated)}")

## 7. Simplify Polygons

In [None]:
from csb_foss.vector.simplify import simplify_polygons

print(f"Before simplification: {len(gdf_eliminated)} polygons")

# Simplification settings; the 60 m tolerance matches the ArcGIS workflow
simplify_options = {
    "tolerance": 60.0,
    "progress": True,
}
gdf_simplified = simplify_polygons(gdf_eliminated, **simplify_options)

print(f"After simplification: {len(gdf_simplified)} polygons")

In [None]:
# Side-by-side outlines: polygons going into simplification vs. coming out
fig, axes = plt.subplots(1, 2, figsize=(14, 7))

simplify_panels = (
    (
        axes[0],
        gdf_eliminated,
        {"facecolor": 'lightblue', "edgecolor": 'darkblue'},
        f"Before simplification ({len(gdf_eliminated)} polygons)",
    ),
    (
        axes[1],
        gdf_simplified,
        {"facecolor": 'lightgreen', "edgecolor": 'darkgreen'},
        f"After simplification ({len(gdf_simplified)} polygons)",
    ),
)
for panel_ax, panel_frame, panel_colors, panel_title in simplify_panels:
    panel_frame.plot(ax=panel_ax, linewidth=0.5, **panel_colors)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 8. Save Results

In [None]:
# Save final result as a GeoPackage in the output directory
output_path = OUTPUT_DIR / "csb_baseline_test.gpkg"
gdf_simplified.to_file(output_path, driver="GPKG")
print(f"Saved to: {output_path}")

# Total area is measured from the final geometries. The `shape_area` column
# was filled in before elimination and simplification, so summing it could
# report pre-processing areas unless those steps refresh it.
# Assumes CRS units are metres (10000 m² per hectare) - TODO confirm for your data.
total_area_m2 = gdf_simplified.geometry.area.sum()

# Summary statistics
print("\n=== BASELINE PIPELINE SUMMARY ===")
print(f"Input years: {START_YEAR}-{END_YEAR} ({len(years)} years)")
print(f"Test area: {TEST_WINDOW.width} x {TEST_WINDOW.height} pixels")
print(f"Unique signatures: {len(lookup)}")
print(f"Raw polygons: {len(gdf)}")
print(f"After crop filter: {len(gdf_filtered)}")
print(f"After elimination: {len(gdf_eliminated)}")
print(f"Final polygons: {len(gdf_simplified)}")
print(f"Total area: {total_area_m2 / 10000:.1f} hectares")

## 9. Next Steps

1. **Scale up**: Increase the test window size or process a full state
2. **Validate**: Compare the results with existing CSB output
3. **Add attributes**: Run the prep stage to add administrative boundaries and crop-majority attributes
4. **Try the experimental pipeline**: Run notebook 03 for improved segmentation