In [1]:
# Force a full garbage-collection pass before any data is loaded.
# gc.collect() returns the number of unreachable objects it found, which is the
# integer the notebook displays as this cell's output.
import gc
gc.collect()

34

In [2]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import pickle

In [3]:
# Feature column(s) to be swapped: they are dropped from each year's evaluation data
# and re-attached from the 2004 table before the models are re-run.
col_to_replace = ["population_density_log"]

In [4]:
# Start the run log with a header: analysis name, the replaced variable(s), and a timestamp.
log_message = [
    "Tread Analysis",
    f"Variable to replace: {col_to_replace}",
    f"Current datetime: {pd.Timestamp.now()}",
]

In [5]:
# Load the 2004 features-with-label table; it is the source of the replacement values.
yr = 2004
Eval_2004 = pd.read_parquet(
    f'../Clean_Data/Model_Data/Evaluation/Features_w_Label/Human_Fire_Water_Year/{yr}_features_w_label.parquet'
)

In [7]:
# Round both coordinates to 6 decimals, the same precision applied to the yearly
# evaluation data before it is merged with this table.
for coord in ("lon", "lat"):
    Eval_2004[coord] = Eval_2004[coord].round(6)

In [8]:
# Keep only the coordinates plus the replacement column(s), then collapse rows that
# are identical across all of those columns.
Eval_2004 = Eval_2004[['lon', 'lat', *col_to_replace]].drop_duplicates()

In [9]:
# Sanity check: each (lon, lat) should appear at most once in the 2004 table, because
# the table is later merged onto the yearly data by location only.
if not Eval_2004.duplicated(subset=['lon', 'lat']).any():
    log_message.append("2004 lon, lat are unique")
else:
    # Record the failure in the saved log too (previously it was only printed, so the
    # log file gave no hint that the merge keys were ambiguous).
    log_message.append("2004 lon, lat are not unique")
    print("2004 lon, lat are not unique")

In [10]:
years = range(2007, 2021)

# For every evaluation year: replace the `col_to_replace` feature(s) with their 2004
# values, re-run that year's model on the modified features, and save the result.
# tqdm shows a progress bar over the years.
for year in tqdm(years):
    log_message.append("-" * 50)
    log_message.append(f"Processing year: {year}")
    # read that year's evaluation data (the file name indicates it already carries predictions)
    Eval_data = pd.read_csv(f'../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/water_year/{year}_predictions.csv',
                            parse_dates=["day"],
                            dtype={"NWCG_CAUSE_CLASSIFICATION": "str",
                               "veg_type_details": "str",
                               "fire_attribute": "str"
                            })
    # log range of day
    log_message.append(f"Range of day: {Eval_data['day'].min()} - {Eval_data['day'].max()}")

    # calendar month of each row; used downstream to aggregate predictions by month
    Eval_data['month'] = Eval_data['day'].dt.month

    # drop this year's own values of the replaced column(s); the 2004 values are
    # attached by the merge below
    Eval_data = Eval_data.drop(columns=col_to_replace)
    # round to the same precision as Eval_2004 so the float join keys compare equal
    Eval_data["lon"] = Eval_data["lon"].round(6)
    Eval_data["lat"] = Eval_data["lat"].round(6)

    # log the shape of the dataframes
    log_message.append(f"Eval_data shape before merge: {Eval_data.shape}")
    # Merge on location only (lon, lat): every day at a location receives the same
    # 2004 value. The inner join drops locations missing from Eval_2004 (compare the
    # two logged shapes). validate="many_to_one" makes pandas raise a MergeError if a
    # location occurs more than once in Eval_2004, instead of silently duplicating rows.
    Eval_reset = Eval_data.merge(Eval_2004, on=['lon', 'lat'], how="inner", validate="many_to_one")
    log_message.append(f"Eval_data shape after merge: {Eval_reset.shape}")

    # read the model trained for this year
    # NOTE(security): pickle.load can execute arbitrary code; only load model files
    # from a trusted location.
    with open(f'../Model/6_water_year_completed/predict_{year}_6yr_model.pkl', 'rb') as f:
        loaded_model = pickle.load(f)

    # use the feature names (and order) stored in the fitted booster
    features = loaded_model.get_booster().feature_names

    # predict the probability of fire (second predict_proba column) for this year
    # using the loaded model and the modified features
    Eval_reset['predictions_update'] = loaded_model.predict_proba(Eval_reset[features])[:, 1]

    # save it to parquet
    # NOTE(review): the file name says "2005_population" but the replacement values
    # are read from the 2004 table -- confirm which is intended. The name is left
    # unchanged because the plotting cell reads this exact path.
    Eval_reset.to_parquet(f'../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/water_year_tread_analysis/{year}_predictions_using_2005_population.parquet', index=False)

    # free up memory
    del Eval_data
    del Eval_reset
    gc.collect()

  0%|          | 0/14 [00:00<?, ?it/s]

100%|██████████| 14/14 [25:10<00:00, 107.93s/it]


In [11]:
# Save the log messages to a log file
with open('../Logs/tread_analysis_water_year_log.txt', 'w') as log_file:
    log_file.write('\n'.join(log_message))

print("Log file saved to '../Logs/tread_analysis_water_year_log.txt'")

Log file saved to '../Logs/tread_analysis_water_year_log.txt'


In [None]:
def plot_comparison_mult(Eval_grouped, year = 2000, plot = True):
    """Plot updated vs. original predictions per month, one panel per statistic.

    Draws two stacked panels (median on top, 75th percentile below). Each panel
    shows the updated series in blue and the original series in red.

    Parameters
    ----------
    Eval_grouped : DataFrame-like
        Must provide the columns 'month', 'predictions_update_median',
        'predictions_median', 'predictions_update_q75' and 'predictions_q75'.
    year : int, default 2000
        Only used to build the output file name when ``plot`` is False.
    plot : bool, default True
        If True the figure is shown; otherwise it is saved to
        '../Plot/Water_Year/Tread_Analysis/{year}_comparison_plot.png' and closed.
    """
    fig, axes = plt.subplots(2, 1, figsize=(12, 18), sharex=True)

    # (metric suffix, panel title) for each subplot, top to bottom
    panels = [('median', "Median Comparison"), ('q75', "Q75 Comparison")]
    months = Eval_grouped['month']

    for ax, (metric, title) in zip(axes, panels):
        # (column, line colour, legend label): updated values first, then the old ones
        line_specs = (
            (f'predictions_update_{metric}', 'blue', f'Updated {metric}'),
            (f'predictions_{metric}', 'red', f'Old {metric}'),
        )
        for column, color, label in line_specs:
            ax.plot(months, Eval_grouped[column], marker='o', color=color, label=label)

        ax.set_title(title)
        ax.set_xlabel("month")
        ax.set_ylabel(f"{metric.capitalize()} Value")
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    if not plot:
        plt.savefig(f'../Plot/Water_Year/Tread_Analysis/{year}_comparison_plot.png')
        plt.close()
        return
    plt.show()

In [19]:
def plot_comparison(Eval_grouped, year=2000, plot=True):
    """Plot the monthly median of updated vs. original predictions on a single axis.

    Parameters
    ----------
    Eval_grouped : DataFrame-like
        Must provide the columns 'month', 'predictions_update_median' and
        'predictions_median'.
    year : int, default 2000
        Only used to build the output file name when ``plot`` is False.
    plot : bool, default True
        If True the figure is shown; otherwise it is saved to
        '../Plot/Water_Year/Tread_Analysis/{year}_comparison_plot.png' and closed.
    """
    fig, ax = plt.subplots(figsize=(12, 6))  # one axis; only the median is drawn

    metric = 'median'
    months = Eval_grouped['month']

    # (column, line colour, legend label): updated values first, then the old ones
    line_specs = (
        (f'predictions_update_{metric}', 'blue', f'Updated {metric}'),
        (f'predictions_{metric}', 'red', f'Old {metric}'),
    )
    for column, color, label in line_specs:
        ax.plot(months, Eval_grouped[column], marker='o', color=color, label=label)

    ax.set_title("Median Comparison")
    ax.set_xlabel("month")
    ax.set_ylabel(f"{metric.capitalize()} Value")
    ax.legend()
    ax.grid(True)

    plt.tight_layout()
    if not plot:
        plt.savefig(f'../Plot/Water_Year/Tread_Analysis/{year}_comparison_plot.png')
        plt.close()
        return
    plt.show()

In [30]:
years = range(2007, 2021)

# Build and save the monthly old-vs-updated comparison figure for every evaluation year.
for year in tqdm(years):
    Eval_data = pd.read_parquet(
        f'../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/water_year_tread_analysis/{year}_predictions_using_2005_population.parquet'
    )

    # Per-month median and 75th percentile of the re-computed ('predictions_update')
    # and the original ('predictions') probabilities.
    Eval_grouped = (
        Eval_data
        .groupby('month')
        .agg(
            predictions_update_median=('predictions_update', 'median'),
            predictions_update_q75=('predictions_update', lambda x: x.quantile(0.75)),
            predictions_median=('predictions', 'median'),
            predictions_q75=('predictions', lambda x: x.quantile(0.75)),
        )
        .reset_index()
    )

    # plot=False: the figure is written to disk instead of being displayed
    plot_comparison_mult(Eval_grouped, year=year, plot=False)

    # free up memory
    del Eval_data
    gc.collect()

100%|██████████| 14/14 [01:10<00:00,  5.07s/it]
