In [2]:
import xarray as xr
import pandas as pd
import logging

In [3]:
ds = xr.open_dataset("data/iberfire_catalonia.nc", chunks="auto")

In [4]:
# Select data from 2020 and onward
ds_2020_onward = ds.sel(time=slice("2020-01-01", None))

In [7]:
# List variable names and their dimensions
for var_name, var in ds_2020_onward.variables.items():
    print(f"{var_name}: {var.dims}")

x_index: ('y', 'x')
y_index: ('y', 'x')
is_spain: ('y', 'x')
is_fire: ('time', 'y', 'x')
is_near_fire: ('time', 'y', 'x')
x_coordinate: ('y', 'x')
y_coordinate: ('y', 'x')
is_sea: ('y', 'x')
is_waterbody: ('y', 'x')
AutonomousCommunities: ('y', 'x')
CLC_2006_1: ('y', 'x')
CLC_2006_2: ('y', 'x')
CLC_2006_3: ('y', 'x')
CLC_2006_4: ('y', 'x')
CLC_2006_5: ('y', 'x')
CLC_2006_6: ('y', 'x')
CLC_2006_7: ('y', 'x')
CLC_2006_8: ('y', 'x')
CLC_2006_9: ('y', 'x')
CLC_2006_10: ('y', 'x')
CLC_2006_11: ('y', 'x')
CLC_2006_12: ('y', 'x')
CLC_2006_13: ('y', 'x')
CLC_2006_14: ('y', 'x')
CLC_2006_15: ('y', 'x')
CLC_2006_16: ('y', 'x')
CLC_2006_17: ('y', 'x')
CLC_2006_18: ('y', 'x')
CLC_2006_19: ('y', 'x')
CLC_2006_20: ('y', 'x')
CLC_2006_21: ('y', 'x')
CLC_2006_22: ('y', 'x')
CLC_2006_23: ('y', 'x')
CLC_2006_24: ('y', 'x')
CLC_2006_25: ('y', 'x')
CLC_2006_26: ('y', 'x')
CLC_2006_27: ('y', 'x')
CLC_2006_28: ('y', 'x')
CLC_2006_29: ('y', 'x')
CLC_2006_30: ('y', 'x')
CLC_2006_31: ('y', 'x')
CLC_2006_32: ('

In [8]:
# Select a subset of features
selected_features = [
    "is_fire", "is_near_fire",  # Fire-related
    "t2m_mean", "RH_mean", "FWI",  # Weather
    "NDVI", "LAI",  # Vegetation
    "elevation_mean", "slope_mean"  # Terrain
]

# Filter the dataset to include only the selected features
ds_filtered = ds_2020_onward[selected_features]

# Display the filtered dataset
print(ds_filtered)

<xarray.Dataset> Size: 4GB
Dimensions:         (time: 1827, y: 255, x: 281)
Coordinates:
  * x               (x) float64 2kB 3.489e+06 3.49e+06 ... 3.768e+06 3.769e+06
  * y               (y) float64 2kB 2.242e+06 2.241e+06 ... 1.989e+06 1.988e+06
  * time            (time) datetime64[ns] 15kB 2020-01-01 ... 2024-12-31
Data variables:
    is_fire         (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    is_near_fire    (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    t2m_mean        (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    RH_mean         (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    FWI             (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    NDVI            (time, y, x) float32 524MB dask.array<chunksize=(896, 107, 118), meta=np.ndarray>
    LAI             (time, y, x) float32 524MB dask

In [9]:
# Convert the xarray dataset to a pandas DataFrame
df_filtered = ds_filtered.to_dataframe().reset_index()

# Save the DataFrame as a CSV file
df_filtered.to_csv("data/IberFire_demo.csv", index=False)

print("Dataset saved as data/IberFire_demo.csv")

Dataset saved as data/IberFire_demo.csv


In [10]:
# load the saved CSV to verify
df_loaded = pd.read_csv("data/IberFire_demo.csv")
# Display the first few rows of the loaded DataFrame
print(df_loaded.head())

         time             y             x  is_fire  is_near_fire  t2m_mean  \
0  2020-01-01  2.242196e+06  3.488734e+06      NaN           NaN       NaN   
1  2020-01-01  2.242196e+06  3.489734e+06      NaN           NaN       NaN   
2  2020-01-01  2.242196e+06  3.490734e+06      NaN           NaN       NaN   
3  2020-01-01  2.242196e+06  3.491734e+06      NaN           NaN       NaN   
4  2020-01-01  2.242196e+06  3.492734e+06      NaN           NaN       NaN   

   RH_mean  FWI  NDVI  LAI  elevation_mean  slope_mean  
0      NaN  NaN   NaN  NaN             NaN         NaN  
1      NaN  NaN   NaN  NaN             NaN         NaN  
2      NaN  NaN   NaN  NaN             NaN         NaN  
3      NaN  NaN   NaN  NaN             NaN         NaN  
4      NaN  NaN   NaN  NaN             NaN         NaN  


In [11]:
# see if there are any na values
df_loaded.isna().sum()

time                     0
y                        0
x                        0
is_fire           72263331
is_near_fire      72263331
t2m_mean          72782199
RH_mean           72782199
FWI               72263331
NDVI              72526742
LAI               72847271
elevation_mean    72263331
slope_mean        72263331
dtype: int64

In [12]:
df_loaded.shape

(130913685, 12)

In [13]:
# Calculate the total number of elements in the dataset
total_elements = df_loaded.size

# Calculate the total number of NaN values
total_nans = df_loaded.isna().sum().sum()

# Calculate the percentage of NaN values
nan_percentage = (total_nans / total_elements) * 100

print(f"Total elements: {total_elements}")
print(f"Total NaNs: {total_nans}")
print(f"Percentage of NaNs: {nan_percentage:.2f}%")

Total elements: 1570964220
Total NaNs: 652255066
Percentage of NaNs: 41.52%


In [15]:
# Calculate the total number of elements in df_filtered
total_elements_filtered = df_filtered.size

# Calculate the total number of NaN values in df_filtered
total_nans_filtered = df_filtered.isna().sum().sum()

# Calculate the percentage of NaN values in df_filtered
nan_percentage_filtered = (total_nans_filtered / total_elements_filtered) * 100

print(f"Total elements in df_filtered: {total_elements_filtered}")
print(f"Total NaNs in df_filtered: {total_nans_filtered}")
print(f"Percentage of NaNs in df_filtered: {nan_percentage_filtered:.2f}%")

Total elements in df_filtered: 1570964220
Total NaNs in df_filtered: 652255066
Percentage of NaNs in df_filtered: 41.52%


In [19]:
# Calculate the total number of elements in the xarray dataset
total_elements_filtered = ds.size

# Calculate the total number of NaN values in the xarray dataset
total_nans_filtered = ds.isnull().sum().item()

# Calculate the percentage of NaN values in the xarray dataset
nan_percentage_filtered = (total_nans_filtered / total_elements_filtered) * 100

print(f"Total elements in ds: {total_elements_filtered}")
print(f"Total NaNs in ds: {total_nans_filtered}")
print(f"Percentage of NaNs in ds: {nan_percentage_filtered:.2f}%")

AttributeError: 'Dataset' object has no attribute 'size'