# CSB-FOSS: Explore Tennessee CDL Data

This notebook explores the USDA Cropland Data Layer (CDL) rasters and the National Agriculture Imagery Program (NAIP) imagery for Tennessee
to understand their structure and prepare for Crop Sequence Boundaries (CSB) generation.

In [None]:
from pathlib import Path
import numpy as np
import rasterio
from rasterio.plot import show
import matplotlib.pyplot as plt

# Configure paths
# Locations of the staged raster inputs used throughout this notebook.
CDL_30M_DIR = Path(r"S:\_STAGING\01_RASTER_CORPUS\annual_cdl")
CDL_10M_DIR = Path(r"S:\_STAGING\01_RASTER_CORPUS\annual_cdl_10m")
NAIP_DIR = Path(r"S:\_STAGING\01_RASTER_CORPUS\naip_cog\Tennessee_Statewide")

# Report up front which of the input directories are reachable from this machine.
for label, directory in (
    ("CDL 30m", CDL_30M_DIR),
    ("CDL 10m", CDL_10M_DIR),
    ("NAIP", NAIP_DIR),
):
    print(f"{label} exists: {directory.exists()}")

## 1. Explore CDL Directory Structure

In [None]:
# Enumerate the sub-directories of the 30 m CDL folder (treated as one per year).
if CDL_30M_DIR.exists():
    years = sorted(entry.name for entry in CDL_30M_DIR.iterdir() if entry.is_dir())
    print(f"CDL 30m years available: {years}")

    # Peek at the GeoTIFFs stored under the last directory in sorted order.
    sample_year = years[-1] if years else None
    if sample_year:
        sample_dir = CDL_30M_DIR / sample_year
        tifs = list(sample_dir.glob("*.tif"))
        preview_names = [tif.name for tif in tifs[:5]]
        print(f"\nFiles in {sample_year}: {preview_names}")

## 2. Load and Examine a CDL Raster

In [None]:
# Find a CDL raster to examine
from csb_foss.raster.io import get_cdl_paths_for_years

# Bind cdl_paths before the lookup: the cells further down all test
# `if cdl_paths:`, which would raise NameError on a fresh kernel if the call
# below failed before the name was ever assigned. An empty dict is falsy, so
# those cells are simply skipped when no paths were resolved.
cdl_paths = {}

try:
    # Arguments are the CDL directory plus 2017 and 2024 (presumably the first
    # and last year to resolve; confirm against csb_foss.raster.io).
    cdl_paths = get_cdl_paths_for_years(CDL_30M_DIR, 2017, 2024)
    print("Found CDL paths:")
    for year, path in cdl_paths.items():
        print(f"  {year}: {path}")
except Exception as e:
    # Deliberate best-effort fallback for an exploratory notebook: report the
    # failure, then show what a plain recursive search for GeoTIFFs finds.
    print(f"Error: {e}")
    print("\nTrying direct file search...")
    tifs = list(CDL_30M_DIR.rglob("*.tif"))
    print(f"Found {len(tifs)} TIF files")
    if tifs:
        print(f"First few: {[t.name for t in tifs[:3]]}")

In [None]:
# Open a CDL raster and examine properties
# Imported at cell level rather than inside the `with` body: later cells also
# construct Window objects and should not depend on a nested, conditional import.
from rasterio.windows import Window

if cdl_paths:
    # Inspect the first raster that cdl_paths yields.
    sample_path = next(iter(cdl_paths.values()))

    with rasterio.open(sample_path) as src:
        print("CDL Raster Properties:")
        print(f"  Shape: {src.width} x {src.height}")
        print(f"  CRS: {src.crs}")
        print(f"  Bounds: {src.bounds}")
        print(f"  Resolution: {src.res}")
        print(f"  Dtype: {src.dtypes[0]}")
        print(f"  NoData: {src.nodata}")

        # Read only a 1000 x 1000 pixel window of band 1 at the raster origin,
        # so the whole raster never has to be loaded into memory.
        window = Window(0, 0, 1000, 1000)
        data = src.read(1, window=window)

        print("\nSample data stats:")
        print(f"  Min: {data.min()}, Max: {data.max()}")
        print(f"  Unique values: {len(np.unique(data))}")

In [None]:
# Visualize CDL sample
# Import Window here so this cell does not depend on an import nested inside
# another cell's conditional block.
from rasterio.windows import Window

if cdl_paths:
    # Use the first (key, path) pair that cdl_paths yields.
    first_year, first_path = next(iter(cdl_paths.items()))

    with rasterio.open(first_path) as src:
        # Target a 2000 x 2000 pixel window at offset (5000, 5000). On rasters
        # smaller than 7000 px per side that window would run past the edge
        # (a non-boundless read is not padded, so the sample would be truncated
        # or empty), so shrink and slide it back inside the raster extent.
        # Rasters large enough for the original window are read unchanged.
        win_width = min(2000, src.width)
        win_height = min(2000, src.height)
        col_off = min(5000, src.width - win_width)
        row_off = min(5000, src.height - win_height)
        window = Window(col_off, row_off, win_width, win_height)
        data = src.read(1, window=window)

    # Build the figure only once the read has succeeded, so a failed open does
    # not leave an empty figure behind.
    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(data, cmap='terrain')
    ax.set_title(f"CDL Sample ({first_year})")
    plt.colorbar(im, ax=ax, label="Crop Code")

    plt.tight_layout()
    plt.show()

## 3. Explore NAIP Data

In [None]:
# Inventory the NAIP GeoTIFFs and summarize the first one found.
if NAIP_DIR.exists():
    naip_files = list(NAIP_DIR.glob("*.tif"))
    print(f"NAIP files found: {len(naip_files)}")
    if naip_files:
        first_names = [tile.name for tile in naip_files[:5]]
        print(f"First few: {first_names}")

        # Open only the first file in the listing and report its basic metadata.
        with rasterio.open(naip_files[0]) as src:
            naip_properties = (
                ("Shape", f"{src.width} x {src.height}"),
                ("Bands", src.count),
                ("CRS", src.crs),
                ("Resolution", src.res),
                ("Dtypes", src.dtypes),
            )
            print("\nNAIP Properties:")
            for prop_name, prop_value in naip_properties:
                print(f"  {prop_name}: {prop_value}")

## 4. Test CSB-FOSS Modules

In [None]:
# Exercise the raster-combination helpers on a small window
from csb_foss.raster.combine import encode_year_sequence, calculate_crop_counts
from csb_foss.raster.io import read_multi_year_stack

if cdl_paths:
    # 500 x 500 pixel window, offset 5000 pixels from the raster origin on both axes.
    window = Window(col_off=5000, row_off=5000, width=500, height=500)
    stack, years, meta = read_multi_year_stack(cdl_paths, window=window)

    print(f"Stack shape: {stack.shape}")
    print(f"Years: {years}")

    # Encode the stack; `lookup` is reported below as one entry per unique signature.
    coded, lookup = encode_year_sequence(stack)
    n_signatures = len(lookup)
    print(f"\nUnique signatures: {n_signatures}")

    # Derive per-code counts from the lookup and show the first five entries.
    counts = calculate_crop_counts(lookup, len(years))
    sample_codes = list(counts.keys())[:5]
    print("Sample counts (code -> (count0, count45)):")
    for code in sample_codes:
        print(f"  {code}: {counts[code]} -> values {lookup[code]}")

In [None]:
# Side-by-side view: first year, last year, and the combined signature raster.
# Runs only when an earlier cell has produced `coded`.
if 'coded' in dir():
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # One (image, title, colormap) entry per panel, in left-to-right order.
    panels = (
        (stack[0], f"CDL {years[0]}", 'terrain'),
        (stack[-1], f"CDL {years[-1]}", 'terrain'),
        (coded, "Combined Signature", 'nipy_spectral'),
    )
    for panel_ax, (image, title, cmap_name) in zip(axes, panels):
        panel_im = panel_ax.imshow(image, cmap=cmap_name)
        panel_ax.set_title(title)
        plt.colorbar(panel_im, ax=panel_ax)

    plt.tight_layout()
    plt.show()

## 5. Test Edge Voting (Experimental)

In [None]:
# Try the experimental temporal edge-voting helper on the stack read earlier
from csb_foss.experimental.edge_voting import compute_temporal_edge_votes

# Runs only when an earlier cell has produced `stack`.
if 'stack' in dir():
    edge_votes = compute_temporal_edge_votes(stack, progress=False)

    print(f"Edge votes shape: {edge_votes.shape}")
    vote_min = edge_votes.min()
    vote_max = edge_votes.max()
    print(f"Min: {vote_min}, Max: {vote_max}")

    # Heat map of the vote array; the title repeats the maximum value.
    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(edge_votes, cmap='hot')
    ax.set_title(f"Temporal Edge Votes (max={vote_max} years)")
    plt.colorbar(im, ax=ax, label="Years as edge")
    plt.show()

## 6. Summary and Next Steps

After running this notebook, you should have:
- Verified that the CDL and NAIP data are accessible
- Understood the data structure and resolution
- Tested that the CSB-FOSS modules work correctly

Next steps:
1. Run the full baseline pipeline on a small test area
2. Compare with existing CSB output
3. Test experimental segmentation approaches