In [1]:
# Force a garbage-collection pass before the run starts.
# gc.collect() is the last expression, so the integer it returns is shown as the cell output.
import gc
gc.collect()

41

In [2]:
import os

In [3]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import precision_recall_curve,auc
import warnings
import pickle
from sklearn.metrics import confusion_matrix

In [4]:
# Report the interpreter and pandas versions used for this run
def check_python_version():
    """Print the Python interpreter version string and the pandas version.

    Output is four lines: a "Python version" heading, ``sys.version``,
    a "Pandas version" heading, and ``pd.__version__``.
    """
    import sys

    report_lines = (
        "Python version",
        sys.version,
        "Pandas version",
        pd.__version__,
    )
    print("\n".join(report_lines))

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2


In [5]:
# Candidate feature column names. Entries that are commented out inside the
# list are excluded from it.
# NOTE(review): nothing in the visible cells reads `initial_features`; the
# evaluation loop takes its feature names from the loaded model
# (model.get_booster().feature_names) instead - confirm whether this list is still needed.
initial_features = ['dead_fuel_moisture_1000hr',
       'dead_fuel_moisture_100hr', 
       'max_air_temperature', 'max_relative_humidity', 
       'min_air_temperature', 'min_relative_humidity', 'precipitation_amount',
       'specific_humidity', 'surface_downwelling_shortwave_flux_in_air',
       #'wind_from_direction', 
       'wind_speed', 'wind_direction_category', 'SWE',
       'population_density',
       'LAI', 
       #'pdsi', 
       #'IS_FIRE', 
       #'min_FIRE_SIZE', 'max_FIRE_SIZE', 'Year','fire_attribute', 
       'veg', 
       #'slope_avg', 
       'slope_max',
       'road_density_km_km2',
       'line_density_km_per_cell' 
       ]

In [6]:
def train_model(train_data, features, label_col):
    """Fit an XGBoost classifier on the selected columns of ``train_data``.

    Parameters
    ----------
    train_data : table indexable by column name(s) (e.g. a pandas DataFrame).
    features : column name(s) used as model inputs.
    label_col : name of the target column.

    Returns
    -------
    The fitted ``xgb.XGBClassifier`` (histogram tree method, log-loss eval metric).
    """
    design_matrix, target = train_data[features], train_data[label_col]
    classifier = xgb.XGBClassifier(tree_method='hist', eval_metric='logloss')
    classifier.fit(design_matrix, target)
    return classifier

# define function to calculate precision and recall based on a threshold
def calculate_precision_recall(y_true, y_pred_proba, threshold, print_output=False):
    """Compute confusion-matrix counts, precision, recall and F1 at a threshold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels encoded as 0 (negative) / 1 (positive).
    y_pred_proba : array-like
        Predicted positive-class scores. Must support elementwise ``>`` and
        ``.astype`` (e.g. a numpy array or pandas Series).
    threshold : float
        A sample is predicted positive when its score is strictly greater
        than this value.
    print_output : bool, default False
        When True, print the threshold, precision, recall and confusion matrix.

    Returns
    -------
    tuple
        ``(TP, TN, FP, FN, precision, recall, f1)``. Precision, recall and F1
        are reported as 0.0 when their denominator is zero (no predicted
        positives, no actual positives, or precision + recall == 0) instead of
        propagating NaN.
    """
    y_pred = (y_pred_proba > threshold).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
    # both y_true and y_pred; without it the matrix collapses to 1x1 and the
    # [1, 1] lookup raises IndexError.
    confusion = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Row = actual class, column = predicted class.
    TN, FP, FN, TP = confusion.ravel()

    predicted_pos = TP + FP
    actual_pos = TP + FN
    precision = TP / predicted_pos if predicted_pos > 0 else 0.0
    recall = TP / actual_pos if actual_pos > 0 else 0.0
    # F1 score
    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0

    if print_output:
        print(f'Threshold: {threshold:.2f}')
        print(f'Precision: {precision * 100:.2f}%')
        print(f'Recall: {recall * 100:.2f}%')
        print("Confusion Matrix")
        print(pd.DataFrame(confusion, index=['True Neg', 'True Pos'], columns=['Pred Neg', 'Pred Pos']))
    return TP, TN, FP, FN, precision, recall, f1

def evaluate_model(model, test_data, features, label_col):
    """Score ``model`` on ``test_data`` and return ranking and threshold metrics.

    The positive-class probability is taken from column 1 of
    ``model.predict_proba``. ROC AUC and the area under the precision-recall
    curve are computed from those probabilities; confusion counts, precision,
    recall and F1 are computed at a fixed threshold of 0.5.

    Returns
    -------
    tuple
        ``(roc_auc, auc_pr, TP, TN, FP, FN, precision, recall, f1)`` where the
        last seven values come from ``calculate_precision_recall`` at 0.5.
    """
    inputs = test_data[features]
    labels = test_data[label_col]

    # Probability assigned to the positive class
    positive_proba = model.predict_proba(inputs)[:, 1]

    roc_auc = roc_auc_score(labels, positive_proba)

    # Area under the precision-recall curve (recall on the x axis)
    curve_precision, curve_recall, _ = precision_recall_curve(labels, positive_proba)
    auc_pr = auc(curve_recall, curve_precision)

    # Counts and scores at the 0.5 decision threshold
    at_half = calculate_precision_recall(labels, positive_proba, 0.5)
    return (roc_auc, auc_pr, *at_half)

In [8]:
import pandas as pd

# Each entry is (first_year, last_year, model_name): one model serves an
# inclusive span of prediction years.
mapping = [
    (2001, 2004, 'predict_2001_6yr_model'),
    (2005, 2008, 'predict_2005_6yr_model'),
    (2009, 2012, 'predict_2009_6yr_model'),
    (2013, 2016, 'predict_2013_6yr_model'),
    (2017, 2020, 'predict_2017_6yr_model'),
    # Add more as needed
]

# Flatten the spans into one record per prediction year
lookup_rows = [
    {'predict_year': predict_year, 'model_name': span_model}
    for first_year, last_year, span_model in mapping
    for predict_year in range(first_year, last_year + 1)
]

lookup_df = pd.DataFrame(lookup_rows)


def get_model_for_year(year):
    """Return the model name that ``lookup_df`` maps to ``year``.

    Returns None when ``year`` does not appear in the ``predict_year`` column.
    If several rows match, the first one wins.
    """
    matches = lookup_df.loc[lookup_df['predict_year'] == year, 'model_name']
    if matches.empty:
        return None
    return matches.iloc[0]

In [None]:
print(get_model_for_year(2006))  # Output: predict_2005_6yr_model

predict_2005_6yr_model


In [10]:
# Show the full prediction-year -> model-name lookup table as plain text
print(lookup_df)

    predict_year              model_name
0           2001  predict_2001_6yr_model
1           2002  predict_2001_6yr_model
2           2003  predict_2001_6yr_model
3           2004  predict_2001_6yr_model
4           2005  predict_2005_6yr_model
5           2006  predict_2005_6yr_model
6           2007  predict_2005_6yr_model
7           2008  predict_2005_6yr_model
8           2009  predict_2009_6yr_model
9           2010  predict_2009_6yr_model
10          2011  predict_2009_6yr_model
11          2012  predict_2009_6yr_model
12          2013  predict_2013_6yr_model
13          2014  predict_2013_6yr_model
14          2015  predict_2013_6yr_model
15          2016  predict_2013_6yr_model
16          2017  predict_2017_6yr_model
17          2018  predict_2017_6yr_model
18          2019  predict_2017_6yr_model
19          2020  predict_2017_6yr_model


In [14]:
# Evaluate the stored model for every water year, collect the metrics in
# `results`, and write each year's features + predicted probability to parquet.
results = []
log_messages = ["Model evaluation"]
# add log to record the current time
log_messages.append(f"Start time: {pd.Timestamp.now()}")
# Define the range of years to predict (2001 through 2020 inclusive)
years = range(2001, 2021)

model_path = '../../Model/Extended_Data_Water_Year'

save_predictions_path = '../../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/Extended_Data_Water_Year_4_year/parquet'
# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable
os.makedirs(save_predictions_path, exist_ok=True)

# suppress all warnings from here on (applies to the rest of the session)
warnings.filterwarnings("ignore")

input_path = '../../Clean_Data/Model_Data/Evaluation/Features_w_Label/Extended_Data_Water_Year'

# Loop-invariant settings
cat_columns = ['wind_direction_category', 'veg']
# pd.get_dummies names its output columns "<source column>_<category>"
dummy_prefixes = tuple(f'{col}_' for col in cat_columns)
label_col = 'IS_FIRE'

# Iterate over the years with a progress bar
for year in tqdm(years, desc="Processing years"):
    log_messages.append("-" * 50)
    # log current water year
    log_messages.append(f"Predict Water Year: {year}")

    # Fail fast, before the expensive read, if no model covers this year;
    # otherwise the code below would try to open "None.pkl".
    model_name = get_model_for_year(year)
    if model_name is None:
        raise ValueError(f"No model is mapped to prediction year {year}")

    # read eval data, prev-year Oct - current year Sep
    Eval_Human = pd.read_parquet(f'{input_path}/{year}_features_w_label.parquet')
    Eval_Human = pd.get_dummies(Eval_Human, columns=cat_columns)

    # load the model
    # NOTE: pickle.load can execute arbitrary code - only load model files from a trusted location.
    with open(f'{model_path}/{model_name}.pkl', 'rb') as f:
        model = pickle.load(f)

    log_messages.append(f"Model loaded: {model_name}")

    features = model.get_booster().feature_names

    # A category that never occurs in this year's data gets no one-hot column
    # from get_dummies, which would make Eval_Human[features] raise KeyError.
    # Add such columns as all-False so the frame matches the model's inputs.
    absent_dummies = [
        col for col in features
        if col.startswith(dummy_prefixes) and col not in Eval_Human.columns
    ]
    for col in absent_dummies:
        Eval_Human[col] = False
    if absent_dummies:
        log_messages.append(f"Added missing one-hot columns: {absent_dummies}")

    # evaluate the model
    roc_auc, auc_pr, TP, TN, FP, FN, precision5, recall5, f15 = evaluate_model(model, Eval_Human, features, label_col)
    # append the results to the list
    results.append([year, roc_auc, auc_pr, TP, TN, FP, FN, precision5, recall5, f15])

    # add predictions to Eval_Human
    # NOTE(review): evaluate_model already runs predict_proba on the same rows,
    # so inference happens twice per year; consider exposing the probabilities.
    Eval_Human['predictions'] = model.predict_proba(Eval_Human[features])[:, 1]
    # save the predictions to a parquet file
    Eval_Human.to_parquet(f'{save_predictions_path}/{year}_predictions.parquet', index=False)

    # release the large frame before the next iteration
    del Eval_Human

    # clean the cache
    gc.collect()

Processing years: 100%|██████████| 20/20 [05:32<00:00, 16.61s/it]


In [15]:
# Save the log messages to a log file
log_file_path = '../../Logs/Clean_Extended_Data/model_eval_every_4_years.txt'
# Create the log directory if needed so the write cannot fail with
# FileNotFoundError after the long evaluation run has finished.
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
# Explicit encoding keeps the file independent of the platform default.
with open(log_file_path, 'w', encoding='utf-8') as log_file:
    log_file.write('\n'.join(log_messages))

In [16]:
# Turn the per-year result rows into a table with named metric columns
results_pd = pd.DataFrame(
    results,
    columns=[
        'Year', 'ROC_AUC', 'AUC_PR',
        'TP', 'TN', 'FP', 'FN',
        'Precision_0.5', 'Recall_0.5', 'F1_0.5',
    ],
)

In [17]:
# Display the per-year evaluation metrics table
results_pd

Unnamed: 0,Year,ROC_AUC,AUC_PR,TP,TN,FP,FN,Precision_0.5,Recall_0.5,F1_0.5
0,2001,0.872694,0.015152,121,4728417,1422,3653,0.078419,0.032061,0.045514
1,2002,0.868813,0.011271,104,4729854,1315,3618,0.073291,0.027942,0.040459
2,2003,0.87022,0.012786,95,4732845,1235,3249,0.071429,0.028409,0.04065
3,2004,0.865636,0.010493,98,4743403,1425,3644,0.064347,0.026189,0.037227
4,2005,0.88264,0.02321,65,4728227,273,3319,0.192308,0.019208,0.034927
5,2006,0.870209,0.027006,98,4719775,250,4113,0.281609,0.023272,0.042992
6,2007,0.865026,0.02967,108,4687758,267,5074,0.288,0.020841,0.03887
7,2008,0.853905,0.02108,100,4697610,419,4416,0.192678,0.022143,0.039722
8,2009,0.882543,0.023328,136,4705515,911,3286,0.129895,0.039743,0.060864
9,2010,0.890455,0.015444,66,4717161,731,2839,0.082811,0.022719,0.035656


In [20]:
# del all variables to free up memory
# globals().pop(name, None) is used instead of `del` so the cell also works on
# a fresh kernel and when re-run: `mod_Human` is not created by any visible
# cell, so a plain `del mod_Human` raises NameError under Restart & Run All.
for _stale_name in ('mod_Human', 'results', 'results_pd'):
    globals().pop(_stale_name, None)
# clean the cache
gc.collect()

7

In [21]:
# delete all data
# Removes every global whose name does not start with an underscore. That
# includes imported modules (os, pd, gc, ...) and the functions defined above,
# which is why the needed modules are imported again in the following cell.
# dir() builds its list once, before the loop body runs, so deleting entries
# from globals() during iteration is safe; the loop variable `name` itself is
# bound after that list is built and is therefore left behind.
for name in dir():
    if not name.startswith('_'):
        del globals()[name]

In [28]:
import os
import pandas as pd
from tqdm import tqdm
import gc

In [29]:
# save all parquet to csv
years = range(2001, 2021)

# Both folders derive from one root so they cannot drift apart. The root is the
# `_4_year` directory that the evaluation loop in this notebook writes its
# `{year}_predictions.parquet` files to; the previous value pointed at
# `Extended_Data_Water_Year`, i.e. files this notebook did not produce.
predictions_root = '../../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/Extended_Data_Water_Year_4_year'
input_path = f'{predictions_root}/parquet'
output_path = f'{predictions_root}/csv'

# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable
os.makedirs(output_path, exist_ok=True)

for year in tqdm(years, desc="Processing years"):
    # read the parquet file
    df = pd.read_parquet(f'{input_path}/{year}_predictions.parquet')
    # write to csv
    df.to_csv(f'{output_path}/{year}_predictions.csv', index=False)

    # clean up the dataframes
    del df
    # clean the cache
    gc.collect()

Processing years: 100%|██████████| 20/20 [55:13<00:00, 165.68s/it]
