In [1]:
import xarray as xr
from pathlib import Path

In [2]:
import os
os.getcwd()

'/home/sngrj0hn/GitHub/neuralhydrology/nh_forecast_example'

In [3]:
# Define the path to the NetCDF file
file_path = Path('./camels_gb_graphcast_10d_sample/time_series/pseudo/23006.nc')

# Check if the file exists
if not file_path.is_file():
    raise FileNotFoundError(f'NetCDF file not found: {file_path}')

# Load the NetCDF file
ds = xr.open_dataset(file_path)

# Display the dataset structure
print(ds)

# Inspect global attributes
print('\nGlobal Attributes:')
print(ds.attrs)

# Inspect variables and dimensions
print('\nVariables:')
for var in ds.variables:
    print(f'{var}: {ds[var].dims}, {ds[var].shape}')

# Example: Extract data for a specific variable (e.g., temperature, if available)
if 'temperature' in ds.variables:
    temperature = ds['temperature']
    print('\nTemperature Data:')
    print(temperature)

<xarray.Dataset> Size: 2MB
Dimensions:                                             (date: 8401,
                                                         lead_time: 10)
Coordinates:
  * date                                                (date) datetime64[ns] 67kB ...
  * lead_time                                           (lead_time) timedelta64[ns] 80B ...
Data variables:
    graphcast_temperature_2m                            (date, lead_time) float32 336kB ...
    graphcast_total_precipitation                       (date, lead_time) float32 336kB ...
    graphcast_u_component_of_wind_10m                   (date, lead_time) float32 336kB ...
    graphcast_v_component_of_wind_10m                   (date, lead_time) float32 336kB ...
    era5land_temperature_2m                             (date) float32 34kB ...
    era5land_total_precipitation                        (date) float32 34kB ...
    era5land_u_component_of_wind_10m                    (date) float32 34kB ...
    era5land_v_c

  ds = xr.open_dataset(file_path)


In [4]:
# Detailed analysis of the dataset structure
print("=== DATASET STRUCTURE ANALYSIS ===\n")

print("📊 Dataset Dimensions:")
for dim, size in ds.dims.items():
    print(f"  • {dim}: {size}")

print(f"\n🏷️ Coordinates:")
for coord in ds.coords:
    coord_info = ds.coords[coord]
    print(f"  • {coord}: {coord_info.dims} - shape {coord_info.shape}")
    
    # Show sample values for different coordinate types
    if coord == 'time':
        print(f"    └─ Time range: {coord_info.values.min()} to {coord_info.values.max()}")
        print(f"    └─ Sample times: {coord_info.values[:3]}")
    elif coord == 'lead_time':
        print(f"    └─ Lead time range: {coord_info.values.min()} to {coord_info.values.max()}")
        print(f"    └─ Sample lead times: {coord_info.values[:5]}")
    elif coord == 'basin':
        print(f"    └─ Basin ID: {coord_info.values}")
    elif len(coord_info.values) < 10:
        print(f"    └─ Values: {coord_info.values}")

print(f"\n📋 Data Variables:")
for var_name in ds.data_vars:
    var_info = ds[var_name]
    print(f"  • {var_name}: {var_info.dims} - shape {var_info.shape}")
    if hasattr(var_info, 'attrs') and var_info.attrs:
        if 'long_name' in var_info.attrs:
            print(f"    └─ {var_info.attrs['long_name']}")
        elif 'units' in var_info.attrs:
            print(f"    └─ Units: {var_info.attrs['units']}")

# Check for patterns in variable names
forecast_vars = []
hindcast_vars = []

for var_name in ds.data_vars:
    var_dims = ds[var_name].dims
    if 'lead_time' in var_dims:
        forecast_vars.append(var_name)
    else:
        hindcast_vars.append(var_name)

print(f"\n🔵 Variables WITHOUT lead_time (hindcast): {len(hindcast_vars)}")
for var in hindcast_vars[:10]:  # Show first 10
    print(f"    • {var}: {ds[var].dims}")

print(f"\n🔴 Variables WITH lead_time (forecast): {len(forecast_vars)}")
for var in forecast_vars[:10]:  # Show first 10
    print(f"    • {var}: {ds[var].dims}")

if len(forecast_vars) > 10:
    print(f"    ... and {len(forecast_vars) - 10} more")
if len(hindcast_vars) > 10:
    print(f"    ... and {len(hindcast_vars) - 10} more")

=== DATASET STRUCTURE ANALYSIS ===

📊 Dataset Dimensions:
  • date: 8401
  • lead_time: 10

🏷️ Coordinates:
  • date: ('date',) - shape (8401,)
  • lead_time: ('lead_time',) - shape (10,)
    └─ Lead time range: 86400000000000 nanoseconds to 864000000000000 nanoseconds
    └─ Sample lead times: [ 86400000000000 172800000000000 259200000000000 345600000000000
 432000000000000]

📋 Data Variables:
  • graphcast_temperature_2m: ('date', 'lead_time') - shape (8401, 10)
  • graphcast_total_precipitation: ('date', 'lead_time') - shape (8401, 10)
  • graphcast_u_component_of_wind_10m: ('date', 'lead_time') - shape (8401, 10)
  • graphcast_v_component_of_wind_10m: ('date', 'lead_time') - shape (8401, 10)
  • era5land_temperature_2m: ('date',) - shape (8401,)
  • era5land_total_precipitation: ('date',) - shape (8401,)
  • era5land_u_component_of_wind_10m: ('date',) - shape (8401,)
  • era5land_v_component_of_wind_10m: ('date',) - shape (8401,)
  • era5land_potential_evaporation_FAO_PENMAN_MONTEI

  for dim, size in ds.dims.items():


In [6]:
# Simple structure summary
print("DATASET SUMMARY:")
print("================")

print(f"Dimensions: {dict(ds.sizes)}")
print(f"Total variables: {len(ds.data_vars)}")

# Sample a few variables to understand the structure
sample_vars = list(ds.data_vars)[:5]
print(f"\nSample variables:")
for var in sample_vars:
    print(f"  {var}: {ds[var].dims} - {ds[var].shape}")

# Check coordinate values
print(f"\nCoordinate details:")
for coord in ds.coords:
    print(f"  {coord}: {ds.coords[coord].values.shape}")
    if coord == 'date':
        print(f"    └─ Date range: {ds.coords[coord].values.min()} to {ds.coords[coord].values.max()}")
    elif coord == 'lead_time':
        print(f"    └─ Lead time values: {ds.coords[coord].values}")

# Check for forecast vs hindcast pattern
has_lead_time = []
no_lead_time = []

for var in ds.data_vars:
    if 'lead_time' in ds[var].dims:
        has_lead_time.append(var)
    else:
        no_lead_time.append(var)

print(f"\nVariable patterns:")
print(f"  Variables with lead_time (forecast): {len(has_lead_time)}")
print(f"  Variables without lead_time (hindcast): {len(no_lead_time)}")

# Show examples of each
if has_lead_time:
    print(f"  Forecast examples: {has_lead_time[:3]}")
if no_lead_time:
    print(f"  Hindcast examples: {no_lead_time[:3]}")

# Check for data source patterns
graphcast_vars = [v for v in ds.data_vars if v.startswith('graphcast_')]
era5_vars = [v for v in ds.data_vars if v.startswith('era5')]
other_vars = [v for v in ds.data_vars if not v.startswith('graphcast_') and not v.startswith('era5')]

print(f"\nData source patterns:")
print(f"  GraphCast variables: {len(graphcast_vars)}")
print(f"  ERA5 variables: {len(era5_vars)}")
print(f"  Other variables: {len(other_vars)} - {other_vars}")

# Show all variable names organized by type
print(f"\nAll variables:")
print(f"  GraphCast (forecast): {graphcast_vars}")
print(f"  ERA5 (reanalysis): {era5_vars}")
if other_vars:
    print(f"  Other: {other_vars}")

DATASET SUMMARY:
Dimensions: {'date': 8401, 'lead_time': 10}
Total variables: 10

Sample variables:
  graphcast_temperature_2m: ('date', 'lead_time') - (8401, 10)
  graphcast_total_precipitation: ('date', 'lead_time') - (8401, 10)
  graphcast_u_component_of_wind_10m: ('date', 'lead_time') - (8401, 10)
  graphcast_v_component_of_wind_10m: ('date', 'lead_time') - (8401, 10)
  era5land_temperature_2m: ('date',) - (8401,)

Coordinate details:
  date: (8401,)
    └─ Date range: 1994-01-01T00:00:00.000000000 to 2016-12-31T00:00:00.000000000
  lead_time: (10,)
    └─ Lead time values: [ 86400000000000 172800000000000 259200000000000 345600000000000
 432000000000000 518400000000000 604800000000000 691200000000000
 777600000000000 864000000000000]

Variable patterns:
  Variables with lead_time (forecast): 4
  Variables without lead_time (hindcast): 6
  Forecast examples: ['graphcast_temperature_2m', 'graphcast_total_precipitation', 'graphcast_u_component_of_wind_10m']
  Hindcast examples: ['era

## 📊 Dataset Structure Understanding

Based on the analysis above, here's my comprehensive understanding of this dataset:

### **Dataset Structure**

This dataset has a **simpler structure** compared to the previous Harz dataset:

**Dimensions:**
- `date`: 8,401 time steps (initialization/valid times)
- `lead_time`: 10 lead time steps (forecast horizon)

**Key Differences from Harz Dataset:**
1. **No basin dimension** - This appears to be for a single location/basin (basin ID: 23006)
2. **Shorter forecast horizon** - Only 10 lead time steps vs 240 in the Harz dataset
3. **Mixed data sources** - Combines GraphCast forecasts with ERA5 reanalysis

### **Variable Types**

**🔴 Forecast Variables (with lead_time dimension):**
- `graphcast_temperature_2m`: (date, lead_time)
- `graphcast_total_precipitation`: (date, lead_time) 
- `graphcast_u_component_of_wind_10m`: (date, lead_time)
- `graphcast_v_component_of_wind_10m`: (date, lead_time)

**🔵 Hindcast Variables (no lead_time dimension):**
- `era5land_temperature_2m`: (date,)
- `era5land_total_precipitation`: (date,)
- `era5land_u_component_of_wind_10m`: (date,)
- `era5land_v_component_of_wind_10m`: (date,)
- `era5land_potential_evaporation_FAO_PENMAN_MONTEITH`: (date,)
- `Q`: (date,) - Likely discharge/target variable

### **Data Pattern**

This follows the **same conceptual pattern** as the Harz dataset:
- **Hindcast features**: Historical/reanalysis data indexed only by time
- **Forecast features**: Forecast data indexed by time and lead_time
- **Different data sources**: ERA5 for historical, GraphCast for forecasts (vs GEFS in Harz dataset)

The key insight is that this dataset structure allows models to:
1. Learn from historical context (ERA5 reanalysis data)
2. Make predictions using forecast inputs (GraphCast forecasts)
3. Predict targets like discharge (`Q`) using both historical and forecast information