In [1]:
# Force a full garbage-collection pass before any data is loaded.
# `gc` is imported here and is reused by later cells that free per-year frames.
import gc
gc.collect()  # last expression of the cell, so its return value is shown as the cell output

41

In [2]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import roc_curve

In [3]:
import pickle

In [4]:
# Load the 2001-2020 climatology table that serves as the reference weather.
data_path = '../Clean_Data/climatology'
climatology_file = os.path.join(data_path, 'climatology_2001_2020.parquet')
reference_weather = pd.read_parquet(climatology_file)

In [5]:
# Inspect the column labels available in the climatology frame.
reference_weather.columns

Index(['lon', 'lat', 'date', 'dead_fuel_moisture_1000hr',
       'dead_fuel_moisture_100hr', 'max_air_temperature',
       'max_relative_humidity', 'min_air_temperature', 'min_relative_humidity',
       'precipitation_amount', 'specific_humidity',
       'surface_downwelling_shortwave_flux_in_air', 'wind_from_direction',
       'wind_speed', 'SWE', 'pdsi'],
      dtype='object')

In [6]:
# Each (lon, lat, date) combination must occur exactly once in the climatology frame;
# the frame is later merged on these three keys.
assert not reference_weather.duplicated(subset=['lon', 'lat', 'date']).any(), "Lon, lat, date should be unique"

In [8]:
# Weather variables that are swapped for their climatology values in the evaluation below.
# NOTE(review): "tread" is presumably a typo for "trend"; the name is kept because
# other cells reference it.
significant_tread_variables = [
    'dead_fuel_moisture_1000hr',
    'dead_fuel_moisture_100hr',
    'max_air_temperature',
    'max_relative_humidity',
    'min_air_temperature',
    'min_relative_humidity',
    'precipitation_amount',
    'specific_humidity',
    'surface_downwelling_shortwave_flux_in_air',
    'SWE',
    'pdsi',
]

In [9]:
# Report every selected variable that is absent from the climatology frame.
# Nothing is printed when all of them are present.
for var in significant_tread_variables:
    if var in reference_weather.columns:
        continue
    print(f"{var} not found in reference data")

In [10]:
# Show (rows, columns) of the climatology frame.
reference_weather.shape

(6340218, 16)

In [11]:
# Show the dtype of each climatology column (note that `date` is stored as object).
reference_weather.dtypes

lon                                          float64
lat                                          float64
date                                          object
dead_fuel_moisture_1000hr                    float32
dead_fuel_moisture_100hr                     float32
max_air_temperature                          float64
max_relative_humidity                        float32
min_air_temperature                          float64
min_relative_humidity                        float32
precipitation_amount                         float32
specific_humidity                            float32
surface_downwelling_shortwave_flux_in_air    float32
wind_from_direction                          float32
wind_speed                                   float32
SWE                                          float32
pdsi                                         float32
dtype: object

In [13]:
# Keep only the merge keys plus the variables that will be reset to climatology.
reference_weather = reference_weather.loc[:, ['lon', 'lat', 'date', *significant_tread_variables]]

In [15]:
# Folder holding the pickled per-year models.
model_path = '../Model/6_water_year_completed_filter_veg/original_road_population_density'
# Folder holding the existing per-year prediction files that are read as input.
input_predictions_path = '../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/water_year_filter_veg_original_road_population_density/parquet'
# Folder that receives the updated per-year prediction files.
output_predictions_path = '../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/water_year_filter_veg_original_road_population_density/tread_analysis/parquet'
# exist_ok=True makes the creation idempotent and avoids the check-then-create race.
os.makedirs(output_predictions_path, exist_ok=True)

In [16]:
# Counterfactual evaluation: for every year, replace the selected weather variables
# with the values from `reference_weather` and re-score the year with that year's model.
log_messages = []
log_messages.append("Climate change evaluation")
# add log to record the current time
log_messages.append(f"Start time: {pd.Timestamp.now()}")
log_messages.append("The following variables are fixed at the average level for the climate change evaluation:")
log_messages.extend(significant_tread_variables)
# Define the range of years
years = range(2007, 2021)

# Iterate over the years with a progress bar
for yr in tqdm(years, desc="Processing years"):
    log_messages.append("-" * 50)

    Eval_data = pd.read_parquet(f'{input_predictions_path}/{yr}_predictions.parquet')
    # Month-day string used as the join key against the reference weather
    # (assumes reference_weather['date'] uses the same '%m-%d' format — TODO confirm).
    Eval_data['date'] = Eval_data['day'].dt.strftime('%m-%d')
    # log the shape of Eval_data
    log_messages.append(f"Shape of Eval_data for year {yr}: {Eval_data.shape}")
    rows_before_merge = len(Eval_data)
    # drop significant_tread variables from Eval_data so the reference values replace them
    Eval_data = Eval_data.drop(columns=significant_tread_variables)
    # check the common columns in both dataframes and log the result
    # (anything beyond lon/lat/date would receive _x/_y suffixes in the merge)
    common_columns = set(Eval_data.columns).intersection(set(reference_weather.columns))
    log_messages.append(f"Common columns between reference weather and year {yr}: {common_columns}")
    # merge Eval_data with reference_weather on lon, lat, and date
    Eval_data = Eval_data.merge(reference_weather, on=['lon', 'lat', 'date'], how='inner')
    # log the shape of Eval_data after merging
    log_messages.append(f"Shape of Eval_data after merging with reference weather for year {yr}: {Eval_data.shape}")
    # An inner join discards rows that have no reference match; record any change in
    # row count explicitly instead of leaving it to be inferred from the two shapes.
    rows_dropped = rows_before_merge - len(Eval_data)
    if rows_dropped != 0:
        log_messages.append(f"WARNING: row count changed by {-rows_dropped} during the merge for year {yr} ({rows_dropped} rows without a reference weather match)")

    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted location.
    with open(f'{model_path}/predict_{yr}_6yr_model.pkl', 'rb') as f:
        loaded_model = pickle.load(f)

    # Use exactly the feature columns (and order) the model was trained with.
    features = loaded_model.get_booster().feature_names

    # predict the probability of fire for the current year using that year's model
    Eval_data['predictions_update'] = loaded_model.predict_proba(Eval_data[features])[:, 1]

    # save the predictions to a parquet file
    Eval_data.to_parquet(f'{output_predictions_path}/{yr}_predictions_update.parquet', index=False)

    # clean up memory before loading the next year
    del Eval_data
    gc.collect()

Processing years: 100%|██████████| 14/14 [18:00<00:00, 77.18s/it] 


In [17]:
# Save the log messages to a log file
log_file_path = '../Logs/tread_analysis_water_year_no_log_log.txt'
# Create the log folder if it is missing so the write cannot fail after the long evaluation run.
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
with open(log_file_path, 'w') as log_file:
    log_file.write('\n'.join(log_messages))

print(f"Log file saved to '{log_file_path}'")

Log file saved to '../Logs/tread_analysis_water_year_no_log_log.txt'


In [25]:
# Convert each year's updated prediction file from parquet to CSV.
input_parquet_path = '../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/water_year_filter_veg_original_road_population_density/tread_analysis/parquet'
output_csv_path = '../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/water_year_filter_veg_original_road_population_density/tread_analysis/csv'
# exist_ok=True makes the creation idempotent and avoids the check-then-create race.
os.makedirs(output_csv_path, exist_ok=True)

years = range(2007, 2021)
for yr in tqdm(years, desc="Processing years"):
    Eval_data = pd.read_parquet(f'{input_parquet_path}/{yr}_predictions_update.parquet')
    Eval_data.to_csv(f'{output_csv_path}/{yr}_predictions_update.csv', index=False)

    # release the frame before reading the next year
    del Eval_data
    gc.collect()

Processing years: 100%|██████████| 14/14 [52:29<00:00, 224.99s/it]


Case study: 2018 predictions, original vs. climatology-reset weather

In [18]:
# Load the updated 2018 predictions for the case study.
input_parquet_path = '../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/water_year_filter_veg_original_road_population_density/tread_analysis/parquet'
case_study_file = f'{input_parquet_path}/2018_predictions_update.parquet'
Eval_data = pd.read_parquet(case_study_file)

In [20]:
# Show the dtype of each column in the loaded 2018 frame.
Eval_data.dtypes

lon                                                 float64
lat                                                 float64
day                                          datetime64[ns]
pdsi_pre_interpolated                               float32
pdsi_class                                          float32
max_wind_speed                                      float32
wind_from_direction                                 float32
wind_speed                                          float32
population_density                                  float32
LAI                                                 float32
population_density_log                              float32
IS_FIRE                                               int32
NWCG_CAUSE_CLASSIFICATION                            object
min_FIRE_SIZE                                       float64
max_FIRE_SIZE                                       float64
Year                                                  int32
veg_type_details                        

In [21]:
# Calendar month (1-12) of each row's `day`, used as the grouping key for the monthly summary.
Eval_data['Month'] = Eval_data['day'].dt.month

In [23]:
def _quantile_reducer(q):
    """Return a reducer that computes the q-th quantile of a Series."""
    return lambda values: values.quantile(q)


# Monthly median and inter-quartile bounds for the updated and the original predictions.
# The spec is filled statistic by statistic so the output column order is:
# *_median, *_q25, *_q75, each with `predictions_update` before `predictions`.
_monthly_stats = {}
for _suffix, _reducer in (
    ('median', 'median'),
    ('q25', _quantile_reducer(0.25)),
    ('q75', _quantile_reducer(0.75)),
):
    for _column in ('predictions_update', 'predictions'):
        _monthly_stats[f'{_column}_{_suffix}'] = (_column, _reducer)

Eval_reset_climate_grouped = Eval_data.groupby('Month').agg(**_monthly_stats).reset_index()

Eval_reset_climate_grouped

Unnamed: 0,Month,predictions_update_median,predictions_median,predictions_update_q25,predictions_q25,predictions_update_q75,predictions_q75
0,1,0.000113,0.000336,2.9e-05,9.7e-05,0.000463,0.001092
1,2,0.000152,0.00078,3.5e-05,0.000238,0.000658,0.002371
2,3,0.000251,0.000208,5.5e-05,4.5e-05,0.001127,0.000857
3,4,0.000571,0.000788,0.000125,0.00022,0.00265,0.002731
4,5,0.00164,0.001799,0.000547,0.000716,0.006757,0.00606
5,6,0.002873,0.003021,0.001408,0.001554,0.010932,0.009913
6,7,0.003594,0.003749,0.00205,0.002061,0.012079,0.011204
7,8,0.003428,0.00365,0.001862,0.001861,0.009576,0.009175
8,9,0.003127,0.0033,0.001496,0.001529,0.007877,0.007724
9,10,0.001638,0.002391,0.000632,0.000952,0.005279,0.006624
