- Remove population density

https://github.com/fangshuye98/CA_WildFire_ML/issues/11

In [1]:
# Force a garbage-collection pass before any data is loaded.
# gc.collect() returns the number of unreachable objects it found; as the last
# expression of the cell, that number is what the notebook echoes as the output.
import gc
gc.collect()

41

In [2]:
import pandas as pd
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.metrics import precision_recall_curve,auc
import warnings
import pickle
from sklearn.metrics import confusion_matrix

In [3]:
# check python version and all packages version
def check_python_version():
    """Print the interpreter version and the pandas version.

    Each value is printed on its own line, preceded by a heading line, so the
    environment used for a run is recorded in the notebook output.
    Returns nothing.
    """
    import sys

    report = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
    )
    for heading, value in report:
        print(heading)
        print(value)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2


In [11]:
# Candidate model inputs. The list order is preserved when the feature matrix is built.
# 'wind_direction_category' and 'veg_group' are categorical: the training loop one-hot
# encodes them and swaps these two names for the generated dummy columns.
#
# Columns deliberately NOT used (kept here as a record of what was tried/excluded):
#   wind_from_direction, population_density, pdsi, veg, slope_avg,
#   IS_FIRE, min_FIRE_SIZE, max_FIRE_SIZE, Year, fire_attribute
initial_features = [
    'dead_fuel_moisture_1000hr',
    'dead_fuel_moisture_100hr',
    'max_air_temperature',
    'max_relative_humidity',
    'min_air_temperature',
    'min_relative_humidity',
    'precipitation_amount',
    'specific_humidity',
    'surface_downwelling_shortwave_flux_in_air',
    'wind_speed',
    'wind_direction_category',
    'SWE',
    'LAI',
    'veg_group',
    'slope_max',
    'road_density_km_km2',
    'line_density_km_per_cell',
]

In [5]:
def train_model(train_data, features, label_col):
    """Fit an XGBoost classifier on the given training frame.

    Parameters
    ----------
    train_data : DataFrame holding both the feature columns and the label column.
    features : list of column names used as model inputs.
    label_col : name of the column holding the target.

    Returns
    -------
    The fitted ``xgb.XGBClassifier`` (log-loss eval metric, histogram tree method).
    """
    inputs = train_data[features]
    target = train_data[label_col]
    classifier = xgb.XGBClassifier(eval_metric='logloss', tree_method='hist')
    classifier.fit(inputs, target)
    return classifier

# define function to calculate precision and recall based on a threshold
def calculate_precision_recall(y_true, y_pred_proba, threshold, print_output=False):
    """Compute confusion-matrix counts, precision, recall and F1 at a probability threshold.

    A sample is predicted positive when its probability is strictly greater
    than ``threshold``.

    Parameters
    ----------
    y_true : array-like of true binary labels (assumed to be coded 0/1 -- TODO confirm).
    y_pred_proba : array of predicted positive-class probabilities; must support
        ``> threshold`` and ``.astype`` (e.g. a NumPy array).
    threshold : probability cut-off.
    print_output : when True, print the threshold, precision, recall and the
        confusion matrix.

    Returns
    -------
    tuple ``(TP, TN, FP, FN, precision, recall, f1)``.

    Undefined ratios follow scikit-learn's ``zero_division`` convention and are
    reported as 0.0 instead of NaN: precision when nothing is predicted
    positive, recall when there are no actual positives, and F1 when both
    precision and recall are 0.
    """
    y_pred = (y_pred_proba > threshold).astype(int)
    # Pin the label order so the matrix is always 2x2; without `labels`, data
    # containing a single class yields a 1x1 matrix and the indexing below fails.
    confusion = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # rows = true class, columns = predicted class
    TP = int(confusion[1, 1])
    TN = int(confusion[0, 0])
    FP = int(confusion[0, 1])
    FN = int(confusion[1, 0])

    predicted_pos = TP + FP
    actual_pos = TP + FN
    precision = TP / predicted_pos if predicted_pos > 0 else 0.0
    recall = TP / actual_pos if actual_pos > 0 else 0.0
    # F1 score (harmonic mean); 0.0 when there are no true positives
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    if print_output:
        print(f'Threshold: {threshold:.2f}')
        print(f'Precision: {precision * 100:.2f}%')
        print(f'Recall: {recall * 100:.2f}%')
        print("Confusion Matrix")
        print(pd.DataFrame(confusion, index=['True Neg', 'True Pos'], columns=['Pred Neg', 'Pred Pos']))
    return TP, TN, FP, FN, precision, recall, f1

def evaluate_model(model, test_data, features, label_col):
    """Score a fitted classifier on an evaluation frame.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    test_data : DataFrame holding the feature columns and the label column.
    features : list of feature column names passed to the model.
    label_col : name of the label column.

    Returns
    -------
    tuple ``(roc_auc, auc_pr, TP, TN, FP, FN, precision, recall, f1)`` where
    ``auc_pr`` is the area under the precision-recall curve and the last seven
    values come from ``calculate_precision_recall`` at a 0.5 threshold.
    """
    inputs = test_data[features]
    labels = test_data[label_col]

    # probability of the positive class (second column of predict_proba)
    positive_proba = model.predict_proba(inputs)[:, 1]

    # threshold-free ranking metrics
    roc_auc = roc_auc_score(labels, positive_proba)
    curve_precision, curve_recall, _ = precision_recall_curve(labels, positive_proba)
    auc_pr = auc(curve_recall, curve_precision)

    # counts and point metrics at the 0.5 cut-off
    TP, TN, FP, FN, precision_at_half, recall_at_half, f1_at_half = calculate_precision_recall(
        labels, positive_proba, 0.5
    )
    return roc_auc, auc_pr, TP, TN, FP, FN, precision_at_half, recall_at_half, f1_at_half

In [6]:
def get_water_year_range(target_year, num_years=6):
    """Return the training window used to predict a target water year.

    The window starts on October 1 of ``target_year - num_years - 1`` and ends
    on September 30 of ``target_year - 1``.

    Parameters
    ----------
    target_year : water year being predicted.
    num_years : length of the training window in water years (default 6).

    Returns
    -------
    tuple ``(min_day, max_day)`` of ``"YYYY-MM-DD 00:00:00"`` strings.
    """
    first_year = target_year - num_years - 1
    last_year = target_year - 1
    window_start = f"{first_year}-10-01 00:00:00"
    window_end = f"{last_year}-09-30 00:00:00"
    return window_start, window_end

# Example: training window used to predict Water Year 2007 (default of 6 prior water years)
target_year = 2007
min_day, max_day = get_water_year_range(target_year)

# echo the window so the Oct 1 - Sep 30 bounds can be checked by eye
print(f"Predict Water Years {target_year} using training data: {min_day} ~ {max_day}")

Predict Water Years 2007 using training data: 2000-10-01 00:00:00 ~ 2006-09-30 00:00:00


In [8]:
# Run identifier: used below to build the model folder, the evaluation-input folder
# and the prediction-output folder.
# NOTE(review): the name does not mention the population-density removal, while the
# log file written later has "no_pop" in its name -- confirm that reusing this folder
# name (and overwriting any earlier outputs stored under it) is intended.
model_version = "Extended_Data_Water_Year_no_riparian_group_veg"

In [12]:
# Per-water-year metric rows and free-text log lines (both consumed by later cells)
results = []
log_messages = []
log_messages.append("Model Version: Regrouping veg and remove riparian and remove population density")
# add log to record the current time
log_messages.append(f"Start time: {pd.Timestamp.now()}")
# Define the range of water years to predict
years = range(2001, 2021)


# Output folders for the pickled models and the per-year predictions.
# exist_ok=True makes creation a no-op when the folder is already there.
model_path = f'../../Model/{model_version}'
os.makedirs(model_path, exist_ok=True)

save_predictions_path = f'../../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/{model_version}/parquet'
os.makedirs(save_predictions_path, exist_ok=True)

# suppress all warnings (this stays in effect for the rest of the kernel session)
warnings.filterwarnings("ignore")

# downsampled training pool; each water year trains on a 6-year slice of it
mod_Human = pd.read_parquet('../../Clean_Data/Model_Data/Downsample/Features_w_Label/features_w_label_downsample_1994_2020_no_riparian_group_veg.parquet')

input_path = f'../../Clean_Data/Model_Data/Evaluation/Features_w_Label/{model_version}'

# settings that do not change between water years
cat_columns = ['wind_direction_category', 'veg_group']
label_col = 'IS_FIRE'

# Iterate over the years with a progress bar
for year in tqdm(years, desc="Processing years"):
    log_messages.append("-" * 50)
    # log current water year
    log_messages.append(f"Processing Water Year: {year}")
    # read eval data, prev-year Oct - current year Sep
    Eval_Human = pd.read_parquet(f'{input_path}/{year}_features_w_label.parquet')

    # get water year range
    min_day, max_day = get_water_year_range(year)
    # filter the training data
    train_data = mod_Human[(mod_Human['day'] >= min_day) & (mod_Human['day'] <= max_day)]
    # log the actual min and max day present in the training slice
    log_messages.append(f"Training Data Day min: {train_data['day'].min()}, max: {train_data['day'].max()}")

    # one hot encoding (done separately for the two frames)
    train_data = pd.get_dummies(train_data, columns=cat_columns)
    Eval_Human = pd.get_dummies(Eval_Human, columns=cat_columns)

    # dummy columns generated from the training slice define the model inputs
    wind_direction_category_cols = [col for col in train_data.columns if col.startswith('wind_direction_category_')]
    veg_cols = [col for col in train_data.columns if col.startswith('veg_group_')]

    # A category level present in the training slice but absent from this evaluation
    # year produces no dummy column in Eval_Human, and selecting the features below
    # would raise KeyError. Add such columns as all-False so the frames line up.
    missing_cols = [col for col in wind_direction_category_cols + veg_cols if col not in Eval_Human.columns]
    if missing_cols:
        log_messages.append(f"Eval data missing dummy columns (filled with False): {missing_cols}")
        for missing_col in missing_cols:
            Eval_Human[missing_col] = False

    features = initial_features + wind_direction_category_cols + veg_cols
    # drop cat_columns from features (they were replaced by their dummy columns)
    features = [col for col in features if col not in cat_columns]

    model = train_model(train_data, features, label_col)
    # save model to a pickle file
    with open(f'{model_path}/predict_{year}_6yr_model.pkl', 'wb') as f:
        pickle.dump(model, f)
    # evaluate the model
    roc_auc, auc_pr, TP, TN, FP, FN, precision5, recall5, f15 = evaluate_model(model, Eval_Human, features, label_col)
    # append the results to the list
    results.append([year, roc_auc, auc_pr, TP, TN, FP, FN, precision5, recall5, f15])

    # add predictions to Eval_Human
    # NOTE(review): evaluate_model already ran predict_proba on the same rows, so
    # inference runs twice per year; reusing it would need evaluate_model to return it.
    Eval_Human['predictions'] = model.predict_proba(Eval_Human[features])[:, 1]
    # save the predictions to a parquet file
    Eval_Human.to_parquet(f'{save_predictions_path}/{year}_predictions.parquet', index=False)

    # clean up the dataframes
    del train_data
    del Eval_Human

    # clean the cache
    gc.collect()

Processing years: 100%|██████████| 20/20 [07:51<00:00, 23.59s/it]


In [13]:
# Save the log messages to a log file
log_dir = '../../Logs/Clean_Extended_Data'
# Create the folder first, like the model and prediction folders: without this a
# missing log folder raises FileNotFoundError only after the whole training run.
os.makedirs(log_dir, exist_ok=True)
with open(f'{log_dir}/model_training_6_years_no_riparian_no_pop_group_veg.txt', 'w') as log_file:
    log_file.write('\n'.join(log_messages))

In [14]:
# Turn the per-year result rows into a labelled table; the column order mirrors
# the order in which the training loop appended the values.
result_columns = [
    'Year',
    'ROC_AUC', 'AUC_PR',
    'TP', 'TN', 'FP', 'FN',
    'Precision_0.5', 'Recall_0.5', 'F1_0.5',
]
results_pd = pd.DataFrame(results, columns=result_columns)

In [15]:
# display the per-year evaluation metrics (one row per predicted water year)
results_pd

Unnamed: 0,Year,ROC_AUC,AUC_PR,TP,TN,FP,FN,Precision_0.5,Recall_0.5,F1_0.5
0,2001,0.855101,0.020185,103,4627887,1142,3573,0.082731,0.02802,0.041861
1,2002,0.84962,0.024006,100,4629129,768,3536,0.115207,0.027503,0.044405
2,2003,0.860118,0.021816,79,4632315,490,3176,0.13884,0.02427,0.041318
3,2004,0.850818,0.019338,89,4642791,552,3544,0.138846,0.024498,0.041647
4,2005,0.879241,0.02352,74,4629490,286,3058,0.205556,0.023627,0.042383
5,2006,0.870346,0.024821,86,4622516,308,3777,0.218274,0.022262,0.040404
6,2007,0.854619,0.03088,102,4590916,295,4779,0.256927,0.020897,0.038651
7,2008,0.843505,0.022215,107,4600419,509,4107,0.173701,0.025392,0.044306
8,2009,0.86531,0.021311,94,4608318,662,3069,0.124339,0.029719,0.047971
9,2010,0.871431,0.013665,51,4619517,541,2581,0.086149,0.019377,0.031638


In [16]:
# Drop the references to the training pool and the result containers,
# then run a collection pass; its return value is the cell output.
del mod_Human, results, results_pd
gc.collect()

0

In [17]:
# Wipe the notebook namespace: delete every global whose name does not start with an underscore.
# dir() is evaluated once, before the loop starts, so deleting entries while iterating is safe.
# This also removes the imported modules and the helper functions defined above, which is why
# the next cell imports os, pandas, tqdm and gc again.
for name in dir():
    if not name.startswith('_'):
        del globals()[name]

In [18]:
import os
import pandas as pd
from tqdm import tqdm
import gc

In [21]:
# Re-declared because the namespace wipe above deleted the earlier definition;
# the value must match the one used by the training cells so the CSV export
# reads the parquet files that run wrote.
model_version = "Extended_Data_Water_Year_no_riparian_group_veg"

In [22]:
# Convert every per-year predictions parquet file to CSV
years = range(2001, 2021)

input_path = f'../../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/{model_version}/parquet'
output_path = f'../../Clean_Data/Model_Data/Evaluation/Features_w_Label_w_pred/{model_version}/csv'

# exist_ok=True replaces the check-then-create pattern: no-op if the folder exists
os.makedirs(output_path, exist_ok=True)

for year in tqdm(years, desc="Processing years"):
    # read the parquet file
    df = pd.read_parquet(f'{input_path}/{year}_predictions.parquet')
    # write to csv
    df.to_csv(f'{output_path}/{year}_predictions.csv', index=False)

    # release the frame before the next year is loaded
    del df
    # clean the cache
    gc.collect()

Processing years: 100%|██████████| 20/20 [58:11<00:00, 174.56s/it] 
