# Bedmap Data Integration Demo

This notebook demonstrates the bedmap data integration features in xopr:
1. Converting bedmap CSV files to cloud-optimized GeoParquet
2. Building STAC catalogs for data discovery
3. Querying data efficiently with spatial/temporal filters
4. Comparing bedmap with OPR layer data

In [None]:
# Import required libraries
import pandas as pd
import geopandas as gpd
import numpy as np
from pathlib import Path
from datetime import datetime, timezone
import matplotlib.pyplot as plt
from shapely.geometry import box, Point
import warnings
# Blanket filter: hides every Python warning raised from this point on,
# including any emitted while importing xopr.bedmap below. Remove or narrow
# this line when debugging.
warnings.filterwarnings('ignore')

# Import xopr bedmap modules
from xopr.bedmap import (
    convert_bedmap_csv,
    batch_convert_bedmap,
    build_bedmap_catalog,
    query_bedmap,
    query_bedmap_local,
    compare_with_opr,
    match_bedmap_to_opr
)

# Reached only if every import above succeeded.
print("xopr bedmap module loaded successfully!")

## 1. Convert Bedmap CSV to GeoParquet

The conversion process:
- Parses metadata from CSV headers
- Handles complex date/time with fallback strategies
- Extracts flight line geometries (multiline with 10km segmentation)
- Creates cloud-optimized GeoParquet files

In [None]:
# Input directory holding the bedmap CSVs, and the directory handed to the
# converter as its output location (also read by the later query cells).
bedmap_dir = Path('~/software/bedmap/Results').expanduser()
output_dir = Path('scripts/output/bedmap')

# Keep only the first CSV in sorted order; the result stays a list so an
# empty directory simply yields an empty list.
csv_files = sorted(bedmap_dir.glob('*.csv'))[:1]

if not csv_files:
    print("No CSV files found. Please check the bedmap_dir path.")
else:
    source_csv = csv_files[0]
    print(f"Converting {source_csv.name}...")

    # Convert the CSV; the returned mapping is summarized below.
    metadata = convert_bedmap_csv(source_csv, output_dir, simplify_tolerance_deg=0.01)

    print("\nConversion metadata:")
    print(f"  Bedmap version: {metadata['bedmap_version']}")
    print(f"  Row count: {metadata['row_count']:,}")
    print(f"  Spatial bbox: {metadata['spatial_bounds']['bbox']}")
    temporal_bounds = metadata['temporal_bounds']
    print(f"  Temporal range: {temporal_bounds['start']} to {temporal_bounds['end']}")

## 2. Query Process: STAC + DuckDB

The query process works in two stages:
1. **STAC Query**: Find files that intersect with the query geometry/time
2. **DuckDB Partial Reads**: Fetch only relevant rows from those files

This minimizes data transfer and processing!

In [None]:
# Search window for the example queries (West Antarctica),
# given as lon_min, lat_min, lon_max, lat_max.
query_bbox = box(-80, -78, -70, -74)


def _single_geometry_frame(geom):
    """Wrap one shapely geometry in a one-row EPSG:4326 GeoDataFrame."""
    return gpd.GeoDataFrame([1], geometry=[geom], crs='EPSG:4326')


world = _single_geometry_frame(box(-180, -90, 180, 90))
antarctica = _single_geometry_frame(box(-180, -90, 180, -60))
query_region = _single_geometry_frame(query_bbox)

# Draw back-to-front: world backdrop, the band south of 60°S, then the query box.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
layers = [
    (world, dict(color='lightgray', edgecolor='black')),
    (antarctica, dict(color='white', edgecolor='black')),
    (query_region, dict(color='red', alpha=0.3, edgecolor='red', linewidth=2)),
]
for layer, style in layers:
    layer.plot(ax=ax, **style)

# Zoom the view to the neighbourhood of the query box.
ax.set(
    xlim=(-100, -50),
    ylim=(-85, -65),
    xlabel='Longitude',
    ylabel='Latitude',
    title='Query Region (Red Box)',
)
ax.grid(True, alpha=0.3)
plt.show()

print(f"Query bbox: {query_bbox.bounds}")

In [None]:
# Query the converted parquet files directly (no STAC lookup); this
# demonstrates the DuckDB partial read capability.

print("Querying bedmap data from local parquet files...")
print(f"Query region: {query_bbox.bounds}")
print()

# The two measurement columns are named once so the summary below reuses them.
value_columns = ['surface_altitude (m)', 'land_ice_thickness (m)']
requested_columns = [
    'longitude (degree_east)',
    'latitude (degree_north)',
    *value_columns,
    'source_file',
    'timestamp',
]

# Spatially filtered read, capped at 1000 items, without the geometry column.
result_df = query_bedmap_local(
    parquet_dir=output_dir,
    geometry=query_bbox,
    columns=requested_columns,
    max_items=1000,
    exclude_geometry=True,
)

print(f"Retrieved {len(result_df):,} points from parquet files")

if result_df.empty:
    print("No data found in query region. Try a different bbox or convert more files.")
else:
    print("\nFirst 5 rows:")
    display(result_df.head())

    print("\nData summary:")
    print(result_df[value_columns].describe())

## 3. Visualize Query Results

Let's visualize the retrieved data points and their properties.

In [None]:
if result_df.empty:
    print("No data to visualize.")
else:
    # Attach point geometries built from the lon/lat columns so the query
    # result can be drawn on a map.
    gdf = gpd.GeoDataFrame(
        result_df,
        geometry=gpd.points_from_xy(
            result_df['longitude (degree_east)'],
            result_df['latitude (degree_north)'],
        ),
        crs='EPSG:4326',
    )

    # One entry per map panel: (data column, colormap, colorbar label, title).
    map_panels = [
        ('surface_altitude (m)', 'terrain', 'Surface Altitude (m)', 'Surface Altitude'),
        ('land_ice_thickness (m)', 'Blues', 'Ice Thickness (m)', 'Ice Thickness'),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    for panel_ax, (column, cmap, legend_label, title) in zip(axes, map_panels):
        gdf.plot(
            column=column,
            ax=panel_ax,
            legend=True,
            cmap=cmap,
            markersize=10,
            legend_kwds={'label': legend_label},
        )
        # Outline the query box on top of the points.
        query_region.boundary.plot(ax=panel_ax, color='red', linewidth=2)
        panel_ax.set_xlabel('Longitude')
        panel_ax.set_ylabel('Latitude')
        panel_ax.set_title(title)
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # One entry per histogram: (data column, x label, title, extra hist styling).
    hist_panels = [
        ('surface_altitude (m)', 'Surface Altitude (m)',
         'Surface Altitude Distribution', {}),
        ('land_ice_thickness (m)', 'Ice Thickness (m)',
         'Ice Thickness Distribution', {'color': 'blue', 'alpha': 0.7}),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for panel_ax, (column, xlabel, title, extra_style) in zip(axes, hist_panels):
        # Missing values are dropped before binning.
        panel_ax.hist(gdf[column].dropna(), bins=30, edgecolor='black', **extra_style)
        panel_ax.set_xlabel(xlabel)
        panel_ax.set_ylabel('Count')
        panel_ax.set_title(title)
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 4. Advanced Query with STAC

When using the full STAC catalog, the query process is:
1. Query STAC for files that intersect the search area
2. Use DuckDB to read only the relevant portions of those files

In [None]:
# Demonstrate the arguments for the full query_bedmap function (which would
# use the STAC catalog if available). This shows how the API matches
# query_frames(). This cell only builds and prints the parameters; it does
# not run a query.
#
# `datetime` and `box` come from the notebook's import cell, so they are not
# re-imported here.

# Define query parameters
query_params = {
    'geometry': box(-75, -76, -70, -74),  # Spatial filter
    'date_range': (datetime(1990, 1, 1), datetime(2000, 12, 31)),  # Temporal filter
    'collections': ['bedmap-bm2'],  # Filter by bedmap version
    'max_items': 500,  # Limit results
    'columns': [  # Specific columns to retrieve
        'longitude (degree_east)',
        'latitude (degree_north)',
        'surface_altitude (m)',
        'land_ice_thickness (m)',
        'bedrock_altitude (m)',
        'timestamp',
        'source_file'
    ],
}

date_start, date_end = query_params['date_range']

print("Query parameters:")
print(f"  Spatial: {query_params['geometry'].bounds}")
print(f"  Temporal: {date_start} to {date_end}")
print(f"  Collections: {query_params['collections']}")
print(f"  Max items: {query_params['max_items']}")
print(f"  Columns: {len(query_params['columns'])} selected")

# Note: This would normally query the STAC catalog first
# For demo, we'll use local query
print("\n[In production, this would query STAC catalog first, then fetch from cloud storage]")

## 5. Compare with OPR Data (Example)

The comparison functions allow matching bedmap measurements with OPR layer picks.

In [None]:
# Example: Create mock OPR data for demonstration and match bedmap points to it
import xarray as xr

if not result_df.empty:
    lon_col = 'longitude (degree_east)'
    lat_col = 'latitude (degree_north)'

    # Seeded generator so the mock data, and therefore the matching results
    # and figure below, are identical on every Restart & Run All.
    rng = np.random.default_rng(42)

    # Create a mock OPR dataset for demonstration, scattered uniformly over
    # the lon/lat extent of the bedmap query result.
    # In practice, this would be loaded from actual OPR files
    n_opr_points = 100
    opr_lons = rng.uniform(result_df[lon_col].min(), result_df[lon_col].max(), n_opr_points)
    opr_lats = rng.uniform(result_df[lat_col].min(), result_df[lat_col].max(), n_opr_points)

    # Create mock surface and bed values (normally distributed noise)
    opr_surface = rng.normal(1000, 100, n_opr_points)
    opr_bed = rng.normal(500, 50, n_opr_points)

    # Create xarray dataset with all variables along a single slow_time dimension
    opr_dataset = xr.Dataset({
        'Longitude': (('slow_time',), opr_lons),
        'Latitude': (('slow_time',), opr_lats),
        'Surface': (('slow_time',), opr_surface),
        'Bottom': (('slow_time',), opr_bed),
    })

    print("Mock OPR dataset created:")
    print(opr_dataset)

    # Use a subset for the demo. result_df was queried with
    # exclude_geometry=True, so point geometry and CRS are attached explicitly
    # (same construction as the visualization cell); the copy keeps result_df
    # itself untouched.
    # NOTE(review): confirm whether match_bedmap_to_opr reads the geometry
    # column or the lon/lat columns - both are provided here.
    bedmap_points = result_df.head(50).copy()
    bedmap_subset = gpd.GeoDataFrame(
        bedmap_points,
        geometry=gpd.points_from_xy(bedmap_points[lon_col], bedmap_points[lat_col]),
        crs='EPSG:4326',
    )

    # Match bedmap points to nearest OPR measurements
    matched_data = match_bedmap_to_opr(
        bedmap_subset,
        opr_dataset,
        max_distance_m=5000  # 5 km matching tolerance
    )

    # Show matching results
    print("\nMatching results:")
    print(f"  Total bedmap points: {len(matched_data)}")
    print(f"  Matched points: {matched_data['is_matched'].sum()}")
    print(f"  Average match distance: {matched_data['opr_match_distance_m'].mean():.1f} m")

    # Visualize matches
    fig, ax = plt.subplots(figsize=(10, 8))

    # Plot bedmap points
    ax.scatter(bedmap_subset[lon_col],
               bedmap_subset[lat_col],
               c='blue', label='Bedmap', s=50, alpha=0.6)

    # Plot OPR points
    ax.scatter(opr_lons, opr_lats,
               c='red', label='OPR', s=30, alpha=0.6)

    # Draw a line from each matched bedmap point to its matched OPR point
    for _, row in matched_data[matched_data['is_matched']].iterrows():
        ax.plot([row[lon_col], row['opr_longitude']],
                [row[lat_col], row['opr_latitude']],
                'g-', alpha=0.3, linewidth=0.5)

    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title('Bedmap to OPR Matching (Green Lines = Matches)')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()
else:
    print("No data available for comparison demo.")

## 6. Query Performance Analysis

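Let's time spatial queries of increasing size. Note that the cell below calls `query_bedmap_local`, so it measures only the local parquet read stage of the two-stage query process, not the STAC lookup that would precede it.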

In [None]:
import time

# Time local queries over bounding boxes of increasing size
print("Query Performance Analysis")
print("="*50)

# Define test queries of different sizes
test_queries = [
    ('Small', box(-72, -75, -70, -74)),   # ~2x1 degree
    ('Medium', box(-75, -76, -70, -74)),  # ~5x2 degrees
    ('Large', box(-80, -78, -70, -74)),   # ~10x4 degrees
]

for name, bbox in test_queries:
    print(f"\n{name} Query: {bbox.bounds}")
    print(f"  Area: {bbox.area:.1f} square degrees")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can be coarse or jump.
    start_time = time.perf_counter()

    result = query_bedmap_local(
        parquet_dir=output_dir,
        geometry=bbox,
        max_items=10000
    )

    query_time = time.perf_counter() - start_time

    print(f"  Query time: {query_time:.3f} seconds")
    print(f"  Points retrieved: {len(result):,}")

    # Skip the throughput line when nothing was returned, or when the
    # measured interval is zero (which would raise ZeroDivisionError).
    if len(result) > 0 and query_time > 0:
        print(f"  Points/second: {len(result)/query_time:,.0f}")

print("\n" + "="*50)
print("Key advantages of the two-stage approach:")
print("1. STAC filters files before reading (reduces I/O)")
print("2. DuckDB reads only necessary columns (column pruning)")
print("3. Spatial filter applied during read (row filtering)")
print("4. Files never fully loaded into memory")

## 7. Summary and Next Steps

This demo showed the complete bedmap integration workflow:

### What we've implemented:
✅ **CSV → GeoParquet conversion** with complex date handling  
✅ **Flight line extraction** with 10km segmentation  
✅ **STAC catalog generation** for discovery  
✅ **Efficient queries** using STAC + DuckDB  
✅ **OPR comparison** functions  

### Query Process Recap:
1. **STAC Query** - Find files intersecting with query geometry/time
2. **DuckDB Partial Reads** - Fetch only rows within bounding box
3. **Optional Refinement** - Apply precise geometry filter if needed

### Next Steps:
1. Convert all 151 bedmap CSV files
2. Build complete STAC catalog
3. Upload to Google Cloud Storage
4. Test with real OPR data comparisons
5. Integrate into production workflows