In [1]:
# Force a full garbage-collection pass before loading the large data frames.
# The value displayed by this cell is the number of unreachable objects found.
import gc
gc.collect()

41

In [2]:
import pandas as pd
import geopandas as gpd
import xarray as xr
from tqdm import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
import xgboost as xgb

In [3]:
import pickle

In [4]:
# Folder holding the climatology parquet file used as the reference weather.
data_path = '../../Clean_Data/climatology'
climatology_file = os.path.join(data_path, 'climatology_2001_2020_06292025_w_lai.parquet')
# Load the table and discard the LAI column in one step.
reference_weather = pd.read_parquet(climatology_file).drop(columns=['LAI'])

In [5]:
# Display the column names of the reference weather table.
reference_weather.columns

Index(['lon', 'lat', 'date', 'dead_fuel_moisture_1000hr',
       'dead_fuel_moisture_100hr', 'max_air_temperature',
       'max_relative_humidity', 'min_air_temperature', 'min_relative_humidity',
       'precipitation_amount', 'specific_humidity',
       'surface_downwelling_shortwave_flux_in_air', 'wind_speed', 'SWE'],
      dtype='object')

In [6]:
# Every (lon, lat, date) combination must appear in exactly one row of reference_weather.
key_columns = ['lon', 'lat', 'date']
has_duplicate_keys = reference_weather.duplicated(subset=key_columns).any()
assert not has_duplicate_keys, "Lon, lat, date should be unique"

In [7]:
# Weather variables that are replaced by their reference values in the
# evaluation loop; each must exist as a column of reference_weather.
significant_tread_variables = [
    'dead_fuel_moisture_1000hr',
    'dead_fuel_moisture_100hr',
    'max_air_temperature',
    'max_relative_humidity',
    'min_air_temperature',
    'min_relative_humidity',
    'precipitation_amount',
    'specific_humidity',
    'surface_downwelling_shortwave_flux_in_air',
    'wind_speed',
    'SWE',
]

In [8]:
# Report every requested variable that has no matching column in reference_weather.
missing_variables = [
    var for var in significant_tread_variables
    if var not in reference_weather.columns
]
for var in missing_variables:
    print(f"{var} not found in reference data")

In [9]:
# Display the (rows, columns) size of the reference weather table.
reference_weather.shape

(4771454, 14)

In [10]:
# Display per-column dtypes; the recorded output shows 'date' stored as object
# rather than a datetime type.
reference_weather.dtypes

lon                                          float64
lat                                          float64
date                                          object
dead_fuel_moisture_1000hr                    float32
dead_fuel_moisture_100hr                     float32
max_air_temperature                          float64
max_relative_humidity                        float32
min_air_temperature                          float64
min_relative_humidity                        float32
precipitation_amount                         float32
specific_humidity                            float32
surface_downwelling_shortwave_flux_in_air    float32
wind_speed                                   float32
SWE                                          float32
dtype: object

In [11]:
# Keep only the key columns plus the variables listed in significant_tread_variables.
reference_key_columns = ['lon', 'lat', 'date']
reference_weather = reference_weather.loc[:, reference_key_columns + significant_tread_variables]

In [16]:
# Model variant evaluated by this notebook; every path below is derived from it.
model_version = "Extended_Data_Water_Year_no_riparian_desert_wetland_barren_group_veg"
# Folder containing the pickled per-year models.
model_path = f'../../Model/{model_version}'
# Folder the evaluation loop reads the per-year prediction parquet files from.
input_predictions_path = f'../../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/{model_version}/parquet'
# Folder the evaluation loop writes the re-scored per-year parquet files to.
output_predictions_path = f'../../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/{model_version}/CLIM'
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_predictions_path, exist_ok=True)

In [19]:
# Re-score every evaluation year with the weather variables listed in
# significant_tread_variables replaced by the values in reference_weather,
# and write the result (with a new 'predictions_update' column) per year.
log_messages = []
log_messages.append("Climate change evaluation")
# Record when the run started
log_messages.append(f"Start time: {pd.Timestamp.now()}")
log_messages.append("The following variables are fixed at the average level for the climate change evaluation:")
log_messages.extend(significant_tread_variables)
# Evaluation years: 2001 through 2020 inclusive
years = range(2001, 2021)

# Columns used to look up the reference row for each observation
merge_keys = ['lon', 'lat', 'date']

# Iterate over the years with a progress bar
for yr in tqdm(years, desc="Processing years"):
    log_messages.append("-" * 50)

    Eval_data = pd.read_parquet(f'{input_predictions_path}/{yr}_predictions.parquet')
    # Month-day string key ('%m-%d') matched against reference_weather['date']
    Eval_data['date'] = Eval_data['day'].dt.strftime('%m-%d')
    # Shape is captured after adding 'date' and before the weather columns are
    # swapped, so a lossless swap must leave both dimensions unchanged.
    shape_before = Eval_data.shape
    # Log the day range covered by this year's file
    log_messages.append(f"Day range of Eval_data for year {yr}: {Eval_data['day'].min()} to {Eval_data['day'].max()}")
    # Remove the observed weather columns so the merge can supply the reference ones
    Eval_data = Eval_data.drop(columns=significant_tread_variables)
    # validate='many_to_one' raises if reference_weather has duplicate keys.
    # With that guaranteed, an inner merge can only lose rows, so an unchanged
    # row count below proves every observation found its reference row.
    Eval_data = Eval_data.merge(
        reference_weather,
        on=merge_keys,
        how='inner',
        validate='many_to_one',
    )
    shape_after = Eval_data.shape
    # Both the row count and the column count must be identical
    if shape_before != shape_after:
        raise ValueError(f"Shape mismatch after merging for year {yr}: before {shape_before}, after {shape_after}")

    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # were produced by this project.
    with open(f'{model_path}/predict_{yr}_6yr_model.pkl', 'rb') as f:
        loaded_model = pickle.load(f)

    # Use the feature names stored in the model's booster to select the input columns
    features = loaded_model.get_booster().feature_names

    # Predicted probability of the positive class for the current year,
    # computed from the frame with substituted weather columns
    Eval_data['predictions_update'] = loaded_model.predict_proba(Eval_data[features])[:, 1]

    # Save the predictions to a parquet file
    Eval_data.to_parquet(f'{output_predictions_path}/{yr}_predictions.parquet', index=False)

    # Release the yearly frame before loading the next one
    del Eval_data
    gc.collect()

Processing years: 100%|██████████| 20/20 [15:24<00:00, 46.21s/it]


In [20]:
# Save the collected log messages to a text file, one message per line.
# NOTE(review): the file name says 'no_4_group_veg' while model_version is
# '..._no_riparian_desert_wetland_barren_group_veg' - confirm this is the intended log name.
log_file_path = '../../Logs/Clean_Extended_Data/tread_analysis_Extended_Data_Water_Year_no_4_group_veg.txt'
# Create the log folder first so a missing directory cannot discard the
# messages gathered during the evaluation run.
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
with open(log_file_path, 'w', encoding='utf-8') as log_file:
    log_file.write('\n'.join(log_messages))

Case study: 2007 predictions

In [22]:
# Read back the 2007 file from the CLIM folder for the case study.
input_parquet_path = f'../../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/{model_version}/CLIM'
case_study_file = f'{input_parquet_path}/2007_predictions.parquet'
Eval_data = pd.read_parquet(case_study_file)

In [23]:
# Display the column dtypes of the loaded 2007 prediction frame.
Eval_data.dtypes

day                                          datetime64[ns]
lat                                                 float64
lon                                                 float64
year                                                  int32
wind_from_direction                                 float32
population_density                                  float64
LAI                                                 float64
IS_FIRE                                               int32
min_FIRE_SIZE                                       float64
max_FIRE_SIZE                                       float64
fire_attribute                                       object
veg                                                  object
slope_avg                                           float32
slope_max                                           float32
road_density_km_km2                                 float64
line_density_km_per_cell                            float64
SubRegion                               

In [24]:
# Month number taken from the 'day' timestamp, stored as a new 'Month' column.
observation_month = Eval_data['day'].dt.month
Eval_data['Month'] = observation_month

In [25]:
# Monthly median, 25% and 75% quantiles of both prediction columns.
def _quantile_reducer(level):
    """Return a reducer that computes the `level` quantile of a Series."""
    return lambda x: x.quantile(level)


prediction_columns = ('predictions_update', 'predictions')
# (output suffix, quantile level); None selects the built-in 'median' reducer.
summary_stats = (('median', None), ('q25', 0.25), ('q75', 0.75))

# Statistic-major ordering keeps the output columns in the sequence
# update/original median, update/original q25, update/original q75.
monthly_aggregations = {
    f'{column}_{suffix}': (column, 'median' if level is None else _quantile_reducer(level))
    for suffix, level in summary_stats
    for column in prediction_columns
}

Eval_reset_climate_grouped = (
    Eval_data
    .groupby('Month')
    .agg(**monthly_aggregations)
    .reset_index()
)

Eval_reset_climate_grouped

Unnamed: 0,Month,predictions_update_median,predictions_median,predictions_update_q25,predictions_q25,predictions_update_q75,predictions_q75
0,1,0.000279,0.001196,8.5e-05,0.000237,0.001007,0.004119
1,2,0.000325,0.000639,9.6e-05,0.000143,0.001076,0.002451
2,3,0.00045,0.001904,0.000134,0.000471,0.001556,0.005349
3,4,0.001135,0.002996,0.000354,0.000959,0.003758,0.007924
4,5,0.004516,0.006821,0.001549,0.003075,0.011473,0.01708
5,6,0.00863,0.009726,0.004278,0.004903,0.022367,0.024875
6,7,0.011191,0.011497,0.006056,0.005788,0.030375,0.029954
7,8,0.009649,0.00996,0.005535,0.005322,0.026701,0.02675
8,9,0.007717,0.00718,0.004472,0.003785,0.020272,0.017773
9,10,0.004603,0.005896,0.002199,0.002828,0.011518,0.013496
