# Microsoft Planetary Computer - FuseTS Data Preparation (FIXED)

**Fix Applied**: Use geometry clipping instead of bbox to handle CRS properly

The original error "No data found in bounds" occurred because:
- MPC Sentinel data comes in UTM projection
- We were passing WGS84 coordinates to `rio.clip_box()`
- The study-area geometry was never reprojected to the data CRS before clipping (this is what the fix now does)

## 1. Setup

In [None]:
# Install if needed:
# !pip install planetary-computer pystac-client rioxarray xarray geopandas

import warnings
warnings.filterwarnings('ignore')

import planetary_computer
import pystac_client
import rioxarray
import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.ops import unary_union
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from pathlib import Path

print("✅ Packages loaded")

## 2. Load Study Area

In [None]:
# Read the polygons that define the study area.
shapefile_path = 'data/klambu-glapan.shp'
paddy_gdf = gpd.read_file(shapefile_path)

print(f"Loaded {len(paddy_gdf)} features")
print(f"CRS: {paddy_gdf.crs}")

# Buffer in a projected CRS so the buffer distance is expressed in metres.
paddy_utm = paddy_gdf.to_crs("EPSG:32749")
paddy_buffered_utm = paddy_utm.copy()
paddy_buffered_utm['geometry'] = paddy_utm.buffer(500)  # 500m buffer

# Dissolve the buffered features into one geometry, then go back to lon/lat.
merged_geom_utm = unary_union(paddy_buffered_utm.geometry)
buffered_gdf = gpd.GeoDataFrame(
    geometry=[merged_geom_utm],
    crs="EPSG:32749",
)
buffered_gdf_wgs84 = buffered_gdf.to_crs("EPSG:4326")

# Lon/lat bounding box, used for the STAC searches.
west, south, east, north = buffered_gdf_wgs84.total_bounds
bbox = [west, south, east, north]

# Single merged geometry (EPSG:4326), used for clipping the rasters.
study_area_geom = buffered_gdf_wgs84.geometry.iloc[0]

# Area is measured on the UTM frame, i.e. in square metres, then shown in km².
area_km2 = buffered_gdf.area.sum() / 1e6
print(f"\nBBox: {bbox}")
print(f"Area: {area_km2:.2f} km²")

# Run configuration shared by the cells below.
START_DATE = '2023-11-01'
END_DATE = '2025-11-07'
TARGET_CRS = "EPSG:32749"
TARGET_RESOLUTION = 10
OUTPUT_DIR = Path('mpc_data')
OUTPUT_DIR.mkdir(exist_ok=True)

## 3. Generate Periods

In [None]:
def generate_periods(start_str, end_str, days=12):
    """Split an inclusive date range into consecutive fixed-length periods.

    Args:
        start_str: First day of the range, formatted ``YYYY-MM-DD``.
        end_str: Last day of the range (inclusive), formatted ``YYYY-MM-DD``.
        days: Length of each period in days. The final period is shortened
            when the range is not an exact multiple of ``days``.

    Returns:
        A time-ordered list of dicts with the keys ``period`` (1-based
        index), ``start_str`` and ``end_str`` (``YYYY-MM-DD`` strings) and
        ``center_date`` (``datetime``). The list is empty when the start date
        is after the end date.

    Raises:
        ValueError: If ``days`` is smaller than 1, or if a date string does
            not match ``YYYY-MM-DD`` (raised by ``datetime.strptime``).
    """
    # A non-positive length would never advance `current` and loop forever.
    if days < 1:
        raise ValueError(f"days must be >= 1, got {days}")

    start = datetime.strptime(start_str, '%Y-%m-%d')
    end = datetime.strptime(end_str, '%Y-%m-%d')
    periods = []
    current = start

    while current <= end:
        period_end = min(current + timedelta(days=days - 1), end)
        # Centre on the period's actual length so that a shortened final
        # period does not get a centre date beyond its own end. For a
        # full-length period this equals `current + days // 2`.
        length_days = (period_end - current).days + 1
        periods.append({
            'period': len(periods) + 1,
            'start_str': current.strftime('%Y-%m-%d'),
            'end_str': period_end.strftime('%Y-%m-%d'),
            'center_date': current + timedelta(days=length_days // 2)
        })
        current = period_end + timedelta(days=1)

    return periods

# Build the 12-day compositing windows for the configured date range.
periods = generate_periods(START_DATE, END_DATE, days=12)
print(f"Generated {len(periods)} periods")

# Show the first and last window as a quick sanity check.
first_period, last_period = periods[0], periods[-1]
print(f"First: {first_period['start_str']} to {first_period['end_str']}")
print(f"Last: {last_period['start_str']} to {last_period['end_str']}")

## 4. Connect to MPC

In [None]:
# Open the Planetary Computer STAC API; the modifier signs returned items in place.
stac_api_url = "https://planetarycomputer.microsoft.com/api/stac/v1"
catalog = pystac_client.Client.open(
    stac_api_url,
    modifier=planetary_computer.sign_inplace,
)
print("✅ Connected to Microsoft Planetary Computer")

## 5. Sentinel-2 Functions (FIXED)

In [None]:
def search_sentinel2(bbox, start_date, end_date, max_cloud=80):
    """Return the Sentinel-2 L2A items that intersect ``bbox`` in a date range.

    Args:
        bbox: Bounding box passed to the STAC search.
        start_date: Start of the interval (joined with ``end_date`` as
            ``start/end``).
        end_date: End of the interval.
        max_cloud: Exclusive upper bound on the ``eo:cloud_cover`` property.

    Returns:
        A list of the matching STAC items.
    """
    time_range = f"{start_date}/{end_date}"
    cloud_filter = {"eo:cloud_cover": {"lt": max_cloud}}
    results = catalog.search(
        collections=["sentinel-2-l2a"],
        bbox=bbox,
        datetime=time_range,
        query=cloud_filter,
    )
    return list(results.items())

def load_sentinel2_ndvi(items, study_geom, target_crs="EPSG:32749", resolution=10):
    """
    Build a median NDVI composite from Sentinel-2 scenes clipped to the study area.

    For every scene the B04 (red) and B08 (NIR) assets are opened, clipped
    with the study geometry reprojected into the raster's own CRS,
    reprojected to ``target_crs`` when the CRS string differs, and turned
    into NDVI. A scene that fails for any reason is reported and skipped, so
    one bad scene does not abort the whole period.

    Args:
        items: STAC items exposing ``assets["B04"]`` and ``assets["B08"]``.
        study_geom: Study-area geometry expressed in EPSG:4326.
        target_crs: CRS string of the output composite.
        resolution: Pixel size passed to ``rio.reproject``.

    Returns:
        The per-pixel median NDVI over all usable scenes with ``target_crs``
        written to it, or ``None`` when ``items`` is empty or no scene could
        be processed.
    """
    if not items:
        return None

    ndvi_arrays = []

    for i, item in enumerate(items):
        try:
            # Get and sign assets
            red_href = planetary_computer.sign(item.assets["B04"].href)
            nir_href = planetary_computer.sign(item.assets["B08"].href)

            # Context managers release both raster handles on every exit
            # path, including the early `continue`s and exceptions.
            with rioxarray.open_rasterio(red_href, masked=True) as red_src, \
                    rioxarray.open_rasterio(nir_href, masked=True) as nir_src:
                red = red_src.squeeze()
                nir = nir_src.squeeze()

                # Without a CRS the study geometry cannot be reprojected.
                if red.rio.crs is None:
                    print(f"      Scene {i+1}: No CRS on raster, cannot clip")
                    continue

                # KEY FIX: Reproject study area to match data CRS
                study_proj = gpd.GeoSeries([study_geom], crs="EPSG:4326").to_crs(red.rio.crs)

                # Clip using geometry (not bbox)
                red_clip = red.rio.clip(study_proj.geometry, study_proj.crs, drop=True)
                nir_clip = nir.rio.clip(study_proj.geometry, study_proj.crs, drop=True)

                if red_clip.size == 0 or nir_clip.size == 0:
                    print(f"      Scene {i+1}: No data after clip")
                    continue

                # Reproject to target CRS
                if str(red_clip.rio.crs) != target_crs:
                    red_clip = red_clip.rio.reproject(target_crs, resolution=resolution)
                    nir_clip = nir_clip.rio.reproject(target_crs, resolution=resolution)

                # NOTE(review): the bands are used as stored; no radiometric
                # offset/scale is applied before the ratio - confirm this
                # suits the product's processing baseline.
                ndvi = (nir_clip - red_clip) / (nir_clip + red_clip)
                # Division by zero yields inf/NaN; keep only finite values.
                ndvi = ndvi.where(np.isfinite(ndvi))

            ndvi_arrays.append(ndvi)
            print(f"      Scene {i+1}: ✅ NDVI computed ({ndvi.shape})")

        except Exception as e:
            # Deliberately broad: loading is best-effort per scene.
            print(f"      Scene {i+1}: ❌ Error - {e}")
            continue

    if not ndvi_arrays:
        return None

    # Median composite
    # NOTE(review): scenes are concatenated as-is; presumably they share one
    # pixel grid after clipping/reprojection - verify for multi-zone areas.
    stacked = xr.concat(ndvi_arrays, dim='time')
    median = stacked.median(dim='time', skipna=True)
    median = median.rio.write_crs(target_crs)

    return median

print("✅ S2 functions defined")

## 6. Sentinel-1 Functions (FIXED)

In [None]:
def search_sentinel1(bbox, start_date, end_date):
    """Return the Sentinel-1 GRD items that intersect ``bbox`` in a date range.

    Only descending-orbit acquisitions in IW instrument mode are requested.

    Args:
        bbox: Bounding box passed to the STAC search.
        start_date: Start of the interval (joined with ``end_date`` as
            ``start/end``).
        end_date: End of the interval.

    Returns:
        A list of the matching STAC items.
    """
    time_range = f"{start_date}/{end_date}"
    acquisition_filter = {
        "sat:orbit_state": {"eq": "descending"},
        "sar:instrument_mode": {"eq": "IW"},
    }
    results = catalog.search(
        collections=["sentinel-1-grd"],
        bbox=bbox,
        datetime=time_range,
        query=acquisition_filter,
    )
    return list(results.items())

def load_sentinel1(items, study_geom, target_crs="EPSG:32749", resolution=10):
    """
    Build median VV and VH composites from Sentinel-1 scenes clipped to the study area.

    For every scene the ``vv`` and ``vh`` assets are opened, clipped with the
    study geometry reprojected into the raster's own CRS, and reprojected to
    ``target_crs`` when the CRS string differs. A scene that fails for any
    reason (including a missing polarisation asset) is reported and skipped.

    Args:
        items: STAC items exposing ``assets["vv"]`` and ``assets["vh"]``.
        study_geom: Study-area geometry expressed in EPSG:4326.
        target_crs: CRS string of the output composites.
        resolution: Pixel size passed to ``rio.reproject``.

    Returns:
        A ``(vv_median, vh_median)`` tuple with ``target_crs`` written to
        both arrays, or ``(None, None)`` when ``items`` is empty or no scene
        could be processed.
    """
    if not items:
        return None, None

    vv_arrays = []
    vh_arrays = []

    for i, item in enumerate(items):
        try:
            # Get and sign assets
            vv_href = planetary_computer.sign(item.assets["vv"].href)
            vh_href = planetary_computer.sign(item.assets["vh"].href)

            # Context managers release both raster handles on every exit
            # path, including the early `continue`s and exceptions.
            with rioxarray.open_rasterio(vv_href, masked=True) as vv_src, \
                    rioxarray.open_rasterio(vh_href, masked=True) as vh_src:
                vv = vv_src.squeeze()
                vh = vh_src.squeeze()

                # Without a CRS the study geometry cannot be reprojected.
                # NOTE(review): presumably some GRD rasters carry no
                # projected CRS - confirm against the collection docs.
                if vv.rio.crs is None:
                    print(f"      Scene {i+1}: No CRS on raster, cannot clip")
                    continue

                # KEY FIX: Reproject study area to match data CRS
                study_proj = gpd.GeoSeries([study_geom], crs="EPSG:4326").to_crs(vv.rio.crs)

                # Clip
                vv_clip = vv.rio.clip(study_proj.geometry, study_proj.crs, drop=True)
                vh_clip = vh.rio.clip(study_proj.geometry, study_proj.crs, drop=True)

                if vv_clip.size == 0 or vh_clip.size == 0:
                    print(f"      Scene {i+1}: No data after clip")
                    continue

                # Reproject to target
                if str(vv_clip.rio.crs) != target_crs:
                    vv_clip = vv_clip.rio.reproject(target_crs, resolution=resolution)
                    vh_clip = vh_clip.rio.reproject(target_crs, resolution=resolution)

                # Make sure the pixels are in memory before the handles
                # close (a no-op when they already are).
                vv_clip = vv_clip.load()
                vh_clip = vh_clip.load()

            vv_arrays.append(vv_clip)
            vh_arrays.append(vh_clip)
            print(f"      Scene {i+1}: ✅ VV/VH loaded ({vv_clip.shape})")

        except Exception as e:
            # Deliberately broad: loading is best-effort per scene.
            print(f"      Scene {i+1}: ❌ Error - {e}")
            continue

    if not vv_arrays:
        return None, None

    # Median composites
    # NOTE(review): scenes are concatenated as-is; presumably they share one
    # pixel grid after clipping/reprojection - verify.
    vv_stacked = xr.concat(vv_arrays, dim='time')
    vh_stacked = xr.concat(vh_arrays, dim='time')

    vv_median = vv_stacked.median(dim='time', skipna=True).rio.write_crs(target_crs)
    vh_median = vh_stacked.median(dim='time', skipna=True).rio.write_crs(target_crs)

    return vv_median, vh_median

print("✅ S1 functions defined")

## 7. Process Test Periods

In [None]:
# Smoke test: run the full search/load/combine chain on the first 3 periods.
banner = "=" * 70
print(banner)
print("🧪 TESTING WITH FIRST 3 PERIODS")
print(banner)

test_results = []
test_periods = periods[:3]

for period in test_periods:
    period_start = period['start_str']
    period_end = period['end_str']
    print(f"\nPeriod {period['period']}: {period_start} to {period_end}")

    # Scene discovery for both sensors.
    print("  🛰️  Searching Sentinel-2...")
    s2_items = search_sentinel2(bbox, period_start, period_end)
    print(f"     Found {len(s2_items)} scenes")

    print("  📡 Searching Sentinel-1...")
    s1_items = search_sentinel1(bbox, period_start, period_end)
    print(f"     Found {len(s1_items)} scenes")

    # Optical composite (stays None when nothing was found).
    ndvi = None
    if s2_items:
        print("  📥 Loading S2 NDVI...")
        ndvi = load_sentinel2_ndvi(s2_items, study_area_geom, TARGET_CRS, TARGET_RESOLUTION)

    # Radar composites (stay None when nothing was found).
    vv, vh = None, None
    if s1_items:
        print("  📥 Loading S1 VV/VH...")
        vv, vh = load_sentinel1(s1_items, study_area_geom, TARGET_CRS, TARGET_RESOLUTION)

    # Nothing usable for this period: report it and move on.
    has_s1 = vv is not None and vh is not None
    if ndvi is None and not has_s1:
        print(f"  ⚠️  No data")
        continue

    # NDVI defines the reference grid when present, otherwise VV does.
    ref = vv if ndvi is None else ndvi

    ds_dict = {}
    for band_name, band in (('VV', vv), ('VH', vh)):
        if band is not None:
            ds_dict[band_name] = band.rio.reproject_match(ref)
    if ndvi is not None:
        ds_dict['S2ndvi'] = ndvi

    # Tag the period's dataset with its centre date on the `t` coordinate.
    ds = xr.Dataset(ds_dict).assign_coords(t=period['center_date'])

    test_results.append({
        'period': period['period'],
        'dataset': ds,
        'n_s2': len(s2_items),
        'n_s1': len(s1_items)
    })
    print(f"  ✅ Period processed (S1: {len(s1_items)}, S2: {len(s2_items)})")

print(f"\n✅ Test complete: {len(test_results)}/{len(test_periods)} periods successful")

## 8. Create Test Time Series

In [None]:
if not test_results:
    print("\n⚠️  No test results")
else:
    # Stack the per-period datasets along `t` and fix the dimension order.
    period_datasets = [r['dataset'] for r in test_results]
    test_ts = xr.concat(period_datasets, dim='t').transpose('t', 'y', 'x')

    print(f"Test time series:")
    print(test_ts)

    # Persist the cube as NetCDF.
    test_file = OUTPUT_DIR / 'test_timeseries.nc'
    test_ts.to_netcdf(test_file)
    print(f"\n💾 Saved: {test_file}")

    # One panel per variable: (variable name, panel title, plot options).
    panels = [
        ('VV', 'Mean VV', {'cmap': 'gray'}),
        ('VH', 'Mean VH', {'cmap': 'gray'}),
        ('S2ndvi', 'Mean NDVI', {'cmap': 'RdYlGn', 'vmin': -0.2, 'vmax': 0.8}),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, (var_name, title, plot_kwargs) in zip(axes, panels):
        # A variable missing from the cube leaves its panel empty.
        if var_name in test_ts:
            test_ts[var_name].mean('t').plot(ax=ax, **plot_kwargs)
            ax.set_title(title)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'test_preview.png', dpi=150)
    plt.show()

    print("\n✅ Test successful! You can now process all periods.")

## 9. Process ALL Periods (Full Run)

Once the test works, uncomment and run this to process all ~62 periods:

In [None]:
# def process_all_periods():
#     all_results = []
#     
#     for period in periods:
#         print(f"\nPeriod {period['period']}/{len(periods)}: {period['start_str']} to {period['end_str']}")
#         
#         s2_items = search_sentinel2(bbox, period['start_str'], period['end_str'])
#         s1_items = search_sentinel1(bbox, period['start_str'], period['end_str'])
#         
#         print(f"  S2: {len(s2_items)}, S1: {len(s1_items)}")
#         
#         ndvi = load_sentinel2_ndvi(s2_items, study_area_geom, TARGET_CRS, TARGET_RESOLUTION) if s2_items else None
#         vv, vh = load_sentinel1(s1_items, study_area_geom, TARGET_CRS, TARGET_RESOLUTION) if s1_items else (None, None)
#         
#         if ndvi is not None or (vv is not None and vh is not None):
#             ref = ndvi if ndvi is not None else vv
#             ds_dict = {}
#             if vv is not None: ds_dict['VV'] = vv.rio.reproject_match(ref)
#             if vh is not None: ds_dict['VH'] = vh.rio.reproject_match(ref)
#             if ndvi is not None: ds_dict['S2ndvi'] = ndvi
#             
#             ds = xr.Dataset(ds_dict).assign_coords(t=period['center_date'])
#             all_results.append(ds)
#             print(f"  ✅")
#     
#     # Create final time series
#     final_ts = xr.concat(all_results, dim='t').transpose('t', 'y', 'x')
#     
#     # Save
#     final_file = OUTPUT_DIR / f'klambu_glapan_{START_DATE}_{END_DATE}_final.nc'
#     final_ts.to_netcdf(final_file)
#     
#     print(f"\n✅ Complete! {len(all_results)} periods")
#     print(f"💾 Saved: {final_file}")
#     return final_ts
# 
# # Uncomment to run:
# # final_timeseries = process_all_periods()

## Key Fixes Applied

### Problem
```
Error loading scene: No data found in bounds.
```

### Root Cause
- MPC Sentinel data comes in UTM projection (e.g., EPSG:32649, EPSG:32749)
- We passed WGS84 coordinates (EPSG:4326) to `rio.clip_box()`
- Those lon/lat values lie far outside the raster's UTM coordinate extent (metres), so no pixels fell inside the requested bounds

### Solution
1. **Before**: `red.rio.clip_box(*bbox)` with WGS84 bbox
2. **After**: 
   - Reproject study area geometry to match data CRS
   - Use `rio.clip(geometry, crs)` instead of `clip_box()`
   - Then reproject result to target CRS

### Code Pattern
```python
# Load data (comes in native UTM)
data = rioxarray.open_rasterio(href, masked=True).squeeze()

# Reproject study area to match data CRS
study_proj = gpd.GeoSeries([study_geom], crs="EPSG:4326").to_crs(data.rio.crs)

# Clip using geometry
data_clipped = data.rio.clip(study_proj.geometry, study_proj.crs, drop=True)

# Reproject to target CRS
data_final = data_clipped.rio.reproject(target_crs, resolution=resolution)
```