# Bedmap Data Query Demo

This notebook demonstrates querying bedmap data from cloud-hosted GeoParquet files:
1. Querying the STAC catalog to find matching data files
2. Using DuckDB for efficient partial reads with spatial/temporal filters
3. Visualizing query results
4. Comparing bedmap with OPR layer data

In [None]:
# Import required libraries
# NOTE(review): `pd`, `timezone`, `Point` and `compare_with_opr` are not
# referenced by any cell visible in this notebook; confirm whether they are
# still needed before removing them.
import pandas as pd
import geopandas as gpd
import numpy as np
from datetime import datetime, timezone
import matplotlib.pyplot as plt
from shapely.geometry import box, Point
import warnings
# Suppresses every warning category for the rest of the kernel session so the
# demo output stays short. This also hides deprecation and runtime warnings
# raised by the libraries used below.
warnings.filterwarnings('ignore')

# Import xopr bedmap modules
from xopr.bedmap import (
    query_bedmap,
    query_bedmap_catalog,
    compare_with_opr,
    match_bedmap_to_opr
)

# Reaching this line only confirms that the imports above did not raise.
print("xopr bedmap module loaded successfully!")

## 1. Query Bedmap Data from Cloud

The bedmap data is hosted on Google Cloud Storage at `gs://opr_stac/bedmap/`.
The query process works in two stages:
1. **STAC Catalog Query**: Find GeoParquet files that intersect with the query geometry/time
2. **DuckDB Partial Reads**: Fetch only relevant rows from those files using SQL pushdown

This approach minimizes data transfer: only the data you need is downloaded.

In [None]:
# Area of interest over Dronning Maud Land: lon_min, lat_min, lon_max, lat_max
query_bbox = box(-20, -76, -5, -72)

# Each rectangle is wrapped in a one-row GeoDataFrame so it can be drawn with
# .plot(); all three use geographic lon/lat coordinates (EPSG:4326).
world = gpd.GeoDataFrame([1], geometry=[box(-180, -90, 180, 90)], crs='EPSG:4326')
antarctica = gpd.GeoDataFrame([1], geometry=[box(-180, -90, 180, -60)], crs='EPSG:4326')
query_region = gpd.GeoDataFrame([1], geometry=[query_bbox], crs='EPSG:4326')

# Draw the layers back to front: grey backdrop, white band south of 60°S,
# then the translucent red query rectangle on top.
fig, ax = plt.subplots(figsize=(10, 8))
for layer, style in (
    (world, {'color': 'lightgray', 'edgecolor': 'black'}),
    (antarctica, {'color': 'white', 'edgecolor': 'black'}),
    (query_region, {'color': 'red', 'alpha': 0.3, 'edgecolor': 'red', 'linewidth': 2}),
):
    layer.plot(ax=ax, **style)

# Zoom the view onto the neighbourhood of the query rectangle and label it.
ax.set(
    xlim=(-40, 10),
    ylim=(-80, -65),
    xlabel='Longitude',
    ylabel='Latitude',
    title='Query Region (Red Box) - Dronning Maud Land',
)
ax.grid(True, alpha=0.3)
plt.show()

print(f"Query bbox: {query_bbox.bounds}")

In [None]:
# Stage one: ask the STAC catalog which files overlap the query region
print("Querying STAC catalog for matching files...")

catalog_items = query_bedmap_catalog(
    catalog_path='gs://opr_stac/bedmap/',
    geometry=query_bbox,
    collections=['bedmap2'],  # restrict the search to one bedmap version
)

if catalog_items.empty:
    print("No matching files found. Try a different region or bedmap version.")
else:
    print(f"\nFound {len(catalog_items)} matching files:")
    # One line per catalog entry: file name and the row count it reports.
    for file_name, n_rows in zip(catalog_items['name'], catalog_items['row_count']):
        print(f"  - {file_name}: {n_rows:,} rows")

In [None]:
# Stage two: fetch the matching rows directly from the files on GCS
print("Querying bedmap data from cloud GeoParquet files...")
print(f"Query region: {query_bbox.bounds}")
print()

result_df = query_bedmap(
    geometry=query_bbox,
    collections=['bedmap2'],
    max_rows=5000,  # cap the result size for the demo
    exclude_geometry=True,
)

if result_df.empty:
    print("No data found in query region.")
else:
    print(f"\nRetrieved {len(result_df):,} points")

    print("\nFirst 5 rows:")
    display(result_df.head())

    # Summary statistics only make sense for the numeric columns.
    print("\nData summary:")
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    display(result_df[numeric_cols].describe())

## 2. Visualize Query Results

Let's visualize the retrieved data points and their properties.

In [None]:
def _plot_bedmap_panel(ax, points, region, column, cmap, label, title):
    """Draw one map panel of bedmap points with the query-region outline.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    points : GeoDataFrame of point geometries to plot.
    region : GeoDataFrame whose boundary is overlaid in red.
    column : name of the column used to color the points, or None to draw
        plain blue markers instead.
    cmap : colormap name, used only when `column` is given.
    label : colorbar label, used only when `column` is given.
    title : panel title when `column` is given; without a column the panel
        is titled 'Data Points'.
    """
    if column is not None:
        points.plot(
            column=column,
            ax=ax,
            legend=True,
            cmap=cmap,
            markersize=10,
            legend_kwds={'label': label},
        )
        ax.set_title(title)
    else:
        # No suitable column was found: still show where the points are.
        points.plot(ax=ax, markersize=10, color='blue')
        ax.set_title('Data Points')
    region.boundary.plot(ax=ax, color='red', linewidth=2)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.grid(True, alpha=0.3)


if not result_df.empty and 'lon' in result_df.columns and 'lat' in result_df.columns:
    # Build point geometries from the lon/lat columns for plotting
    gdf = gpd.GeoDataFrame(
        result_df,
        geometry=gpd.points_from_xy(result_df['lon'], result_df['lat']),
        crs='EPSG:4326'
    )

    # Pick the columns to color by, matching on substrings of the column
    # name. When several columns match, the last one in column order wins.
    surface_col = None
    thickness_col = None
    for col in result_df.columns:
        col_lower = col.lower()
        if 'surface' in col_lower and 'altitude' in col_lower:
            surface_col = col
        if 'thickness' in col_lower:
            thickness_col = col

    # Two side-by-side panels sharing the same layout and region outline
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    ax1, ax2 = axes

    # Panel 1: surface altitude (plain points if no such column exists)
    _plot_bedmap_panel(
        ax1, gdf, query_region,
        column=surface_col,
        cmap='terrain',
        label='Surface Altitude (m)',
        title='Surface Altitude',
    )

    # Panel 2: ice thickness (plain points if no such column exists)
    _plot_bedmap_panel(
        ax2, gdf, query_region,
        column=thickness_col,
        cmap='Blues',
        label='Ice Thickness (m)',
        title='Ice Thickness',
    )

    plt.tight_layout()
    plt.show()
else:
    print("No data to visualize.")

## 3. Query with Temporal Filter

You can also filter by date range to find data from specific time periods.

In [None]:
# Query with both spatial and temporal filters.
# `datetime` and `box` come from the imports cell at the top of the notebook,
# so they are not re-imported here.

# Define query parameters
# NOTE(review): the end bound is midnight at the start of 2000-12-31; confirm
# whether query_bedmap treats the end of `date_range` as inclusive of that day.
query_params = {
    'geometry': box(-15, -75, -5, -73),  # Smaller region
    'date_range': (datetime(1994, 1, 1), datetime(2000, 12, 31)),  # 1994-2000
    'collections': ['bedmap2'],
    'max_rows': 2000,
}

print("Query parameters:")
print(f"  Spatial: {query_params['geometry'].bounds}")
print(f"  Temporal: {query_params['date_range'][0]} to {query_params['date_range'][1]}")
print(f"  Collections: {query_params['collections']}")
print(f"  Max rows: {query_params['max_rows']}")

# Execute query; the dict keys are passed as keyword arguments
temporal_result = query_bedmap(**query_params)

if not temporal_result.empty:
    print(f"\nRetrieved {len(temporal_result):,} points")

    # Break the result down by originating file, when that column is present
    if 'source_file' in temporal_result.columns:
        print("\nSource files:")
        for src in temporal_result['source_file'].unique():
            count = (temporal_result['source_file'] == src).sum()
            print(f"  - {src}: {count:,} points")
else:
    print("No data found for the specified query.")

## 4. Compare with OPR Data (Example)

The comparison functions allow matching bedmap measurements with OPR layer picks.
This example shows how the matching works with mock OPR data.

In [None]:
import xarray as xr

# Fixed seed so the mock OPR points, and therefore the match statistics
# printed below, are identical on every Restart & Run All.
MOCK_OPR_SEED = 42

# Both coordinate columns are read below, so require both up front.
if not result_df.empty and {'lon', 'lat'}.issubset(result_df.columns):
    # Create a mock OPR dataset for demonstration
    # In practice, this would be loaded from actual OPR files
    rng = np.random.default_rng(MOCK_OPR_SEED)
    n_opr_points = 100

    # Scatter the mock points uniformly over the lon/lat extent of the
    # bedmap query result so that some of them fall near bedmap points.
    opr_lons = rng.uniform(
        result_df['lon'].min(),
        result_df['lon'].max(),
        n_opr_points
    )
    opr_lats = rng.uniform(
        result_df['lat'].min(),
        result_df['lat'].max(),
        n_opr_points
    )

    # Create mock surface and bed elevations (normally distributed)
    opr_surface = rng.normal(2000, 200, n_opr_points)
    opr_bed = rng.normal(500, 100, n_opr_points)

    # Create xarray dataset with every variable along the 'slow_time' dimension
    opr_dataset = xr.Dataset({
        'Longitude': (('slow_time',), opr_lons),
        'Latitude': (('slow_time',), opr_lats),
        'Surface': (('slow_time',), opr_surface),
        'Bottom': (('slow_time',), opr_bed),
    })

    print("Mock OPR dataset created:")
    print(opr_dataset)

    # Match bedmap points to nearest OPR measurements.
    # Only the first 50 bedmap rows are used to keep the demo quick; take the
    # slice once and reuse it for both the attributes and the geometry.
    bedmap_head = result_df.head(50)
    bedmap_subset = gpd.GeoDataFrame(
        bedmap_head,
        geometry=gpd.points_from_xy(bedmap_head['lon'], bedmap_head['lat']),
        crs='EPSG:4326'
    )

    matched_data = match_bedmap_to_opr(
        bedmap_subset,
        opr_dataset,
        max_distance_m=10000  # 10 km matching tolerance for demo
    )

    # Show matching results using the 'is_matched' and 'opr_match_distance_m'
    # columns of the returned table
    print("\nMatching results:")
    print(f"  Total bedmap points: {len(matched_data)}")
    print(f"  Matched points: {matched_data['is_matched'].sum()}")
    print(f"  Average match distance: {matched_data['opr_match_distance_m'].mean():.1f} m")
else:
    print("No data available for comparison demo.")

## 5. Summary

This demo showed the bedmap query workflow:

### Key Features:
- **Cloud-native queries**: Data is hosted on GCS and queried directly without downloading full files
- **STAC catalog**: Find relevant files using spatial/temporal filters
- **DuckDB partial reads**: Efficient SQL pushdown to read only needed rows
- **OPR comparison**: Match bedmap measurements with radar layer picks

### Query Process:
1. **STAC Query** - Find files that intersect the query geometry/time range
2. **DuckDB Partial Reads** - Fetch only the rows within the bounding box
3. **Optional Refinement** - Apply a precise geometry filter if needed

### Data Location:
- Catalog: `gs://opr_stac/bedmap/bedmap{1,2,3}.parquet`
- Data: `gs://opr_stac/bedmap/data/*.parquet`