In [9]:
import xarray as xr
import pandas as pd
import logging

In [10]:
# Open the IberFire Catalonia NetCDF file lazily: chunks="auto" lets xarray
# choose the chunk sizes, so the variables are dask-backed rather than loaded
# into memory up front
ds = xr.open_dataset(
    "data/iberfire_catalonia.nc",
    chunks="auto",
)

In [11]:
# Keep only the time steps from 2020-01-01 onward; the slice has no upper
# bound, so everything up to the last available date is retained
ds_2020_onward = ds.sel({"time": slice("2020-01-01", None)})

In [12]:
# Print one "<name>: <dims>" line for every entry in the dataset's variables
# mapping, to see which variables are static (y, x) and which vary in time
for var_name in ds_2020_onward.variables:
    var = ds_2020_onward.variables[var_name]
    print(f"{var_name}: {var.dims}")

x_index: ('y', 'x')
y_index: ('y', 'x')
is_spain: ('y', 'x')
is_fire: ('time', 'y', 'x')
is_near_fire: ('time', 'y', 'x')
x_coordinate: ('y', 'x')
y_coordinate: ('y', 'x')
is_sea: ('y', 'x')
is_waterbody: ('y', 'x')
AutonomousCommunities: ('y', 'x')
CLC_2006_1: ('y', 'x')
CLC_2006_2: ('y', 'x')
CLC_2006_3: ('y', 'x')
CLC_2006_4: ('y', 'x')
CLC_2006_5: ('y', 'x')
CLC_2006_6: ('y', 'x')
CLC_2006_7: ('y', 'x')
CLC_2006_8: ('y', 'x')
CLC_2006_9: ('y', 'x')
CLC_2006_10: ('y', 'x')
CLC_2006_11: ('y', 'x')
CLC_2006_12: ('y', 'x')
CLC_2006_13: ('y', 'x')
CLC_2006_14: ('y', 'x')
CLC_2006_15: ('y', 'x')
CLC_2006_16: ('y', 'x')
CLC_2006_17: ('y', 'x')
CLC_2006_18: ('y', 'x')
CLC_2006_19: ('y', 'x')
CLC_2006_20: ('y', 'x')
CLC_2006_21: ('y', 'x')
CLC_2006_22: ('y', 'x')
CLC_2006_23: ('y', 'x')
CLC_2006_24: ('y', 'x')
CLC_2006_25: ('y', 'x')
CLC_2006_26: ('y', 'x')
CLC_2006_27: ('y', 'x')
CLC_2006_28: ('y', 'x')
CLC_2006_29: ('y', 'x')
CLC_2006_30: ('y', 'x')
CLC_2006_31: ('y', 'x')
CLC_2006_32: ('

In [13]:
# Features kept for the demo dataset, grouped by theme
selected_features = [
    # Fire-related
    "is_fire",
    "is_near_fire",
    # Weather
    "t2m_mean",
    "RH_mean",
    "FWI",
    # Vegetation
    "NDVI",
    "LAI",
    # Terrain
    "elevation_mean",
    "slope_mean",
]

# Indexing the dataset with a list of names returns a dataset restricted to
# exactly those variables
ds_filtered = ds_2020_onward[selected_features]

# Show the summary (dimensions, coordinates, data variables) of the subset
print(ds_filtered)

<xarray.Dataset> Size: 4GB
Dimensions:         (time: 1827, y: 255, x: 281)
Coordinates:
  * x               (x) float64 2kB 3.489e+06 3.49e+06 ... 3.768e+06 3.769e+06
  * y               (y) float64 2kB 2.242e+06 2.241e+06 ... 1.989e+06 1.988e+06
  * time            (time) datetime64[ns] 15kB 2020-01-01 ... 2024-12-31
Data variables:
    is_fire         (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    is_near_fire    (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    t2m_mean        (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    RH_mean         (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    FWI             (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    NDVI            (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    LAI             (time, y, x) float32 524MB dask

In [14]:
# Destination of the demo parquet file. Defined once so the write call and the
# confirmation message below can never drift apart.
demo_parquet_path = "data/IberFire_demo.parquet"

# Convert the xarray dataset to a pandas DataFrame and reshape it in a single
# non-mutating chain (no `inplace=True`): every step returns a new frame, so
# the cell has no partially-modified intermediate state.
df_filtered = (
    ds_filtered.to_dataframe()
    # Turn the (time, y, x) index levels into ordinary columns
    .reset_index()
    # Extract numerical features from `time`
    .assign(
        year=lambda df: df["time"].dt.year,
        month=lambda df: df["time"].dt.month,
        day=lambda df: df["time"].dt.day,
    )
    # Drop the original 'time' column now that it has been decomposed
    .drop(columns=["time"])
    # Rename columns for clarity
    .rename(
        columns={
            "t2m_mean": "temperature_mean",
            "RH_mean": "relative_humidity_mean",
            "FWI": "fire_weather_index",
            "NDVI": "normalized_difference_vegetation_index",
            "LAI": "leaf_area_index",
            "elevation_mean": "mean_elevation",
            "slope_mean": "mean_slope",
        }
    )
)

# Save the DataFrame as a parquet file (the default RangeIndex is not stored)
df_filtered.to_parquet(demo_parquet_path, index=False)

print(f"Dataset saved as {demo_parquet_path}")

Dataset saved as data/IberFire_demo.parquet


In [15]:
# Read the parquet file back from disk and print the column dtypes as a
# round-trip check of what was actually written
df_loaded = pd.read_parquet(path="data/IberFire_demo.parquet")
print(df_loaded.dtypes)

y                                         float64
x                                         float64
is_fire                                   float32
is_near_fire                              float32
temperature_mean                          float32
relative_humidity_mean                    float32
fire_weather_index                        float32
normalized_difference_vegetation_index    float32
leaf_area_index                           float32
mean_elevation                            float32
mean_slope                                float32
year                                        int32
month                                       int32
day                                         int32
dtype: object


In [11]:
# Count the missing values in each column of the reloaded frame
# NOTE(review): the stored output of this cell lists the pre-rename columns
# (`time`, `t2m_mean`, ...) and its execution count is out of order, so it
# looks stale relative to the parquet schema printed above; re-run to refresh
df_loaded.isna().sum(axis=0)

time                     0
y                        0
x                        0
is_fire           72263331
is_near_fire      72263331
t2m_mean          72782199
RH_mean           72782199
FWI               72263331
NDVI              72526742
LAI               72847271
elevation_mean    72263331
slope_mean        72263331
dtype: int64

In [12]:
# (n_rows, n_columns) of the reloaded frame
# NOTE(review): the stored output reports 12 columns while the dtypes listing
# of the written file shows 14; the output looks stale, so re-run to confirm
df_loaded.shape

(130913685, 12)

In [13]:
# Overall share of missing values across the whole reloaded frame

# Number of cells in the frame (rows * columns)
total_elements = df_loaded.size

# Missing values per column, then summed over all columns
nan_counts_per_column = df_loaded.isna().sum()
total_nans = nan_counts_per_column.sum()

# Fraction of missing cells expressed as a percentage
nan_percentage = total_nans / total_elements * 100

# One report line per figure
print(
    f"Total elements: {total_elements}",
    f"Total NaNs: {total_nans}",
    f"Percentage of NaNs: {nan_percentage:.2f}%",
    sep="\n",
)

Total elements: 1570964220
Total NaNs: 652255066
Percentage of NaNs: 41.52%
