In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook banner; the doubled line ending leaves one blank line below the title.
print("=== CATALONIA WILDFIRE DATASET - COMPREHENSIVE EDA ===", end="\n\n")

# Plot appearance: matplotlib's 'default' style sheet plus seaborn's "husl" palette.
plt.style.use("default")
sns.set_palette("husl")

=== CATALONIA WILDFIRE DATASET - COMPREHENSIVE EDA ===



In [2]:
# Load the demo dataset (the processed one currently used).
# Each loader falls back to None when its file is missing, so later cells can
# guard on `is not None` instead of failing.
print("Loading demo dataset...")
try:
    # chunks={'time': 100} requests chunked loading along `time`.
    ds_demo = xr.open_dataset("data/IberFire_demo.nc", chunks={'time': 100})
except FileNotFoundError:
    print("✗ Demo dataset not found")
    ds_demo = None
else:
    # Dataset.sizes is the supported name -> length mapping. The mapping form
    # of Dataset.dims is deprecated and its repr differs between xarray versions.
    print(f"✓ Demo dataset loaded: {dict(ds_demo.sizes)}")

# Load the parquet version of the same demo data.
try:
    df_demo = pd.read_parquet("data/IberFire_demo.parquet")
except FileNotFoundError:
    print("✗ Demo parquet not found")
    df_demo = None
else:
    print(f"✓ Demo parquet loaded: {df_demo.shape}")

Loading demo dataset...
✗ Demo dataset not found
✗ Demo parquet not found


In [3]:
# Load the original Catalonia dataset to understand the full scope.
# Falls back to ds_full = None when the file is missing so later cells can
# guard on `is not None`.
print("\nLoading original Catalonia dataset...")
try:
    # chunks='auto' lets xarray pick the chunk sizes.
    ds_full = xr.open_dataset("data/iberfire_catalonia.nc", chunks='auto')
except FileNotFoundError:
    print("✗ Full Catalonia dataset not found")
    ds_full = None
else:
    # Dataset.sizes is the supported name -> length mapping. The mapping form
    # of Dataset.dims is deprecated and its repr differs between xarray versions.
    print(f"✓ Full Catalonia dataset loaded: {dict(ds_full.sizes)}")
    print(f"Time range: {ds_full.time.min().values} to {ds_full.time.max().values}")
    print(f"Variables: {len(ds_full.data_vars)} total")


Loading original Catalonia dataset...
✗ Full Catalonia dataset not found


In [None]:
# === BASIC DATASET INFORMATION ===
# Print dimensions, in-memory size, spatial/temporal extent of the demo dataset
# and the dimensions / variable count of the full dataset. Each section is
# skipped when the corresponding dataset failed to load (is None).
print("\n" + "="*50)
print("BASIC DATASET INFORMATION")
print("="*50)

if ds_demo is not None:
    # Use Dataset.sizes for the name -> length mapping: reading values out of
    # Dataset.dims is deprecated and raises a FutureWarning on current xarray.
    print(f"Demo Dataset Dimensions: {dict(ds_demo.sizes)}")
    print(f"Demo Dataset Size: {ds_demo.nbytes / 1e9:.2f} GB")
    print(f"Spatial extent: X[{ds_demo.x.min().values:.0f}, {ds_demo.x.max().values:.0f}], Y[{ds_demo.y.min().values:.0f}, {ds_demo.y.max().values:.0f}]")
    # NOTE(review): the CRS is hard-coded in the message, not read from the file — confirm.
    print("Coordinate System: EPSG:3035 (European grid)")
    # The message reports the number of time steps as "days"; assumes one step per day — TODO confirm.
    print(f"Temporal coverage: {len(ds_demo.time)} days from {ds_demo.time.min().values} to {ds_demo.time.max().values}")

if ds_full is not None:
    print(f"\nFull Dataset Dimensions: {dict(ds_full.sizes)}")
    print(f"Available variables: {len(ds_full.data_vars)}")

In [None]:
# === TEMPORAL NaN PATTERNS ===
# For a fixed list of key variables, measure what fraction of the subsampled
# grid is null at each time step, then report how many time steps are mostly
# empty, almost entirely empty, or fully populated. Skipped when ds_demo is None.
print("\n" + "="*50)
print("TEMPORAL NaN PATTERNS")
print("="*50)

if ds_demo is not None:
    # Sample subset for analysis: keep every 10th index along x and every 10th
    # along y (roughly 1 in 100 grid cells) to reduce computation
    sample_ds = ds_demo.isel(x=slice(0, None, 10), y=slice(0, None, 10))
    
    # Variables to inspect; names absent from the dataset are silently skipped below
    key_vars = ['t2m_mean', 'RH_mean', 'FWI', 'NDVI', 'LAI']
    # Maps variable name -> per-time-step null fraction (kept for use after the loop)
    temporal_nan = {}
    
    for var in key_vars:
        if var in sample_ds.data_vars:
            # Mean of the null mask over x and y gives a fraction in [0, 1]
            # (not a percentage) for each remaining coordinate; presumably that
            # is `time` only — TODO confirm the variable's dimensions.
            # .compute() materialises the result if the data is lazily loaded.
            nan_by_time = sample_ds[var].isnull().mean(dim=['x', 'y']).compute()
            temporal_nan[var] = nan_by_time
            
            # Show statistics: thresholds are fractions (0.5 = 50%, 0.9 = 90%).
            # The labels say "Days"; this assumes one time step per day — TODO confirm.
            print(f"{var}:")
            print(f"  Days with >50% NaN: {(nan_by_time > 0.5).sum().values}")
            print(f"  Days with >90% NaN: {(nan_by_time > 0.9).sum().values}")
            print(f"  Days with 0% NaN: {(nan_by_time == 0).sum().values}")