# Machine Learning Pipeline and Ablation

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

In [40]:
# Directory holding the pre-split datasets; defined once so the location is easy to change.
DATA_PKL_DIR = 'generated/data_pkl'

# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
X_train_df = pd.read_pickle(f'{DATA_PKL_DIR}/X_train.pkl')
X_test_df = pd.read_pickle(f'{DATA_PKL_DIR}/X_test.pkl')
y_train = pd.read_pickle(f'{DATA_PKL_DIR}/y_train.pkl')
y_test = pd.read_pickle(f'{DATA_PKL_DIR}/y_test.pkl')

# Group labels passed as `groups` to the grouped cross-validation splitter.
block_cv = pd.read_pickle(f'{DATA_PKL_DIR}/block_cv.pkl')

### Feature selection

This section is used for ablation studies.

Select which features to keep when training the model below before running the rest of the pipeline.

In [41]:
# Inspect the available columns before choosing the feature subset below.
X_train_df.columns

Index(['protected', 'qflag', 'lc_0.0', 'lc_1.0', 'lc_2.0', 'lc_12.0',
       'lc_15.0', 'lc_16.0', 'lc_18.0', 'lc_20.0', 'lc_21.0', 'lc_23.0',
       'lc_24.0', 'lc_25.0', 'lc_26.0', 'lc_27.0', 'lc_29.0', 'lc_30.0',
       'lc_31.0', 'lc_32.0', 'lc_34.0', 'slope', 'dem', 'emi', 'lst', 'ndvi',
       'evi', 'vi', 'ampl', 'minv', 'maxv', 'rslope', 'lslope', 'dist_2',
       'dist_3', 'dist_4', 'fire_count'],
      dtype='object')

In [44]:
# Summary statistics for every candidate feature in the training set.
X_train_df.describe()

Unnamed: 0,protected,qflag,lc_0.0,lc_1.0,lc_2.0,lc_12.0,lc_15.0,lc_16.0,lc_18.0,lc_20.0,...,vi,ampl,minv,maxv,rslope,lslope,dist_2,dist_3,dist_4,fire_count
count,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,...,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0
mean,0.003313096,9.169856,0.0,0.0,0.0,0.1696184,0.0007364514,0.0001785053,0.1478192,0.1509959,...,0.1306744,0.5156181,0.05518547,0.5647145,0.2803369,0.331104,0.4068858,0.4520499,0.4083157,0.007205167
std,0.0574641,1.486245,0.0,0.0,0.0,0.3752974,0.02712766,0.0133594,0.3549209,0.358045,...,0.2114534,0.2454077,0.06282085,0.2554495,0.08060924,0.1389069,0.2086991,0.2203635,0.2082007,0.05423613
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0410177,0.3491,0.009856832,0.3988667,0.2288089,0.2147413,0.2473209,0.274563,0.2491442,0.0
50%,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0410951,0.5339667,0.03211903,0.5895667,0.2598338,0.3075833,0.3935011,0.4417518,0.3949738,0.0
75%,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06117829,0.6989,0.07934188,0.7572667,0.3063712,0.4273565,0.5342851,0.6251549,0.5354002,0.0
max,1.0,10.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [45]:
# Predictors other than the `lc_*` columns kept for this run.
base_features = ['protected', 'ndvi', 'slope', 'dem']

# Numeric codes appearing in the `lc_<code>.0` column names.
land_cover_codes = [0, 1, 2, 12, 15, 16, 18, 20, 21, 23, 24, 25, 26, 27, 29, 30, 31, 32, 34]

# Ordered list of columns used to train and evaluate the model.
features = base_features + [f'lc_{code}.0' for code in land_cover_codes]

In [46]:
# Restrict both splits to the selected feature columns, in the order given by `features`.
X_train = X_train_df.loc[:, features]
X_test = X_test_df.loc[:, features]

In [47]:
# Summary statistics for the selected feature subset only.
X_train.describe()

Unnamed: 0,protected,ndvi,slope,dem,lc_0.0,lc_1.0,lc_2.0,lc_12.0,lc_15.0,lc_16.0,...,lc_23.0,lc_24.0,lc_25.0,lc_26.0,lc_27.0,lc_29.0,lc_30.0,lc_31.0,lc_32.0,lc_34.0
count,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,...,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0,1069996.0
mean,0.003313096,0.598956,0.9595248,0.2050936,0.0,0.0,0.0,0.1696184,0.0007364514,0.0001785053,...,0.261186,0.03557303,0.03553097,0.1439482,0.01422529,0.005145814,0.0,0.0,0.0,0.0
std,0.0574641,0.1927802,0.05981521,0.1909766,0.0,0.0,0.0,0.3752974,0.02712766,0.0133594,...,0.4392813,0.1852232,0.1851177,0.3510374,0.1184185,0.07154956,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.571039,0.944,0.07189745,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.6682502,0.988,0.1247447,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.7233219,1.0,0.2858474,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


### Models

##### Scoring

In [69]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, fbeta_score, confusion_matrix
from sklearn.metrics import classification_report

In [75]:
def make_classification_report(y_test, y_predict, threshold = 0, f=None):
    """Build a text summary of binary-classification metrics.

    Args:
        y_test: True labels.
        y_predict: Predicted labels, or scores to binarize when `threshold` is non-zero.
        threshold: Cut-off applied to `y_predict`; values >= threshold become 1 and
            all others 0. The default of 0 leaves `y_predict` unchanged.
        f: Unused; kept so existing callers keep working.

    Returns:
        A multi-line string with precision, recall, F1-score (the harmonic mean of
        precision and recall), accuracy and the full classification report.
    """
    if threshold == 0:
        labels = y_predict
    else:
        labels = [int(score >= threshold) for score in y_predict]

    # Metrics are computed in the order in which they appear in the report.
    precision = precision_score(y_test, labels)
    recall = recall_score(y_test, labels)
    f1 = f1_score(y_test, labels)
    accuracy = accuracy_score(y_test, labels)
    report = classification_report(y_test, labels, target_names=['Not burnt', 'Burnt'])

    return f"""
    Precision: {precision}\n
    Recall: {recall}\n
    F1-score: {f1}\n
    Accuracy: {accuracy}\n
    Classification Report : \n
    {report}\n
    """

#### Random Forest

In [71]:
# Cross-validation setup
def cv(k=5):
    """Return a fresh generator of (train_indices, test_indices) pairs.

    The folds come from `StratifiedGroupKFold` with `k` splits applied to the
    notebook-level `X_train` / `y_train`, using `block_cv` as the group labels.
    The returned generator can be consumed only once.
    """
    splitter = StratifiedGroupKFold(n_splits=k)
    return splitter.split(X_train, y_train, groups=block_cv)

##### Grid Search

In [66]:
# cv() returns a one-shot generator of splits; it must be re-created once consumed.
cv_data = cv()

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored by the exhaustive search.
parameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, 20],
    'class_weight': ['balanced'],
}

# Fix the seed so the search, and therefore the selected hyperparameters, is reproducible.
model = RandomForestClassifier(random_state=42)

# Build the splits here rather than reusing `cv_data`: cv() returns a one-shot generator,
# so re-running this cell with an already-consumed `cv_data` would provide no splits.
grid_search = GridSearchCV(
    model,
    parameters,
    cv=cv(),
    scoring='f1',
    return_train_score=True,
    verbose=10,
)

# Run the search over the whole grid.
grid_search.fit(X_train, y_train)

# Best hyperparameter combination according to the cross-validated F1 score.
best_params = grid_search.best_params_

**Calibration**  
Probability calibration is the process of adjusting the predicted probabilities output by a machine learning model such that they accurately reflect the true underlying probabilities of the classes. In other words, it involves adjusting the model's predictions so that they are more closely aligned with the actual outcomes of the events being predicted.

Two common calibration methods are Platt scaling and isotonic regression. Platt scaling, also known as sigmoid calibration, fits a sigmoid (logistic) curve that maps the model's raw scores to calibrated probabilities. Isotonic regression instead fits a non-decreasing step function to the scores, so the calibrated probabilities never decrease as the model's output increases. The cells below use `CalibratedClassifierCV` without a `method` argument, i.e. scikit-learn's default sigmoid calibration.

In [None]:
# Fresh split generator: the previous one may already have been consumed.
cv_data = cv()

# Wrap a forest with the tuned hyperparameters in a cross-validated calibrator.
# The seed makes the fitted forest, and hence the saved scores, reproducible;
# it is merged first so a `random_state` present in `best_params` would take precedence.
calibrated_model = CalibratedClassifierCV(
    RandomForestClassifier(**{'random_state': 42, **best_params}),
    cv=cv_data,
)

# Fit the calibrated model to the training data.
calibrated_model.fit(X_train, y_train)

# Save the results to a txt file. `score` is the estimator's default score
# (mean accuracy for scikit-learn classifiers).
with open('results/results_rf_f1.txt', 'w') as f:
    f.write(f'Best set of hyperparameters: {best_params}\n')
    f.write(f'Train score: {calibrated_model.score(X_train, y_train)}\n')
    f.write(f'Test score: {calibrated_model.score(X_test, y_test)}\n')

In [82]:
# Hard-coded hyperparameters, so the cells below do not require re-running the grid search
# (presumably the best set found in an earlier run — TODO confirm).
best_params_ = {'max_depth': 15, 'n_estimators': 100, 'class_weight': 'balanced'}

In [73]:
# Fresh split generator: the previous one may already have been consumed.
cv_data = cv()

# Wrap a forest with the hard-coded hyperparameters in a cross-validated calibrator.
# The seed makes the fitted model, and every metric derived from it, reproducible;
# it is merged first so a `random_state` present in `best_params_` would take precedence.
calibrated_model = CalibratedClassifierCV(
    RandomForestClassifier(**{'random_state': 42, **best_params_}),
    cv=cv_data,
)

# Fit the calibrated model to the training data.
calibrated_model.fit(X_train, y_train)

In [74]:
# Class predictions of the calibrated model on the training and test features.
y_predict_train = calibrated_model.predict(X_train)
y_predict_test = calibrated_model.predict(X_test)

In [77]:
# Write the hyperparameters and both classification reports to a text file.
with open('results/results_rf_f1_.txt', 'w') as report_file:
    report_file.write(f'Best set of hyperparameters: {best_params_}\n')

    train_report = make_classification_report(y_train, y_predict_train)
    report_file.write(f'Train report: {train_report}\n')

    test_report = make_classification_report(y_test, y_predict_test)
    report_file.write(f'Test report: {test_report}\n')

In [78]:
# Generic alias for the fitted calibrated classifier. Note that this rebinds `model`,
# which the grid-search cell assigned to a plain RandomForestClassifier.
model = calibrated_model

In [79]:
from sklearn import metrics 
from sklearn.metrics import ConfusionMatrixDisplay

def plot_metrics(y_test, y_pred_proba, y_predict):
    """Plot ROC and precision-recall curves next to a confusion matrix.

    Args:
        y_test: True binary labels.
        y_pred_proba: Predicted scores/probabilities for the positive class.
        y_predict: Predicted class labels, used for the confusion matrix.

    Returns:
        Tuple ``(fpr, tpr, thresholds)`` as returned by ``metrics.roc_curve``.
    """
    # Curve data
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    precision, recall, _ = metrics.precision_recall_curve(y_test, y_pred_proba)

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    # Both curves share the left panel: x is FPR (ROC) or recall (PR),
    # y is TPR (ROC) or precision (PR).
    axs[0].plot(fpr, tpr, label="AUC="+str(auc))
    # Recall goes on x and precision on y so the curve matches the axis labels.
    axs[0].plot(recall, precision, label="Precision/Recall")
    axs[0].set_ylabel('True Positive Rate/Precision')
    axs[0].set_xlabel('False Positive Rate/Recall')
    axs[0].legend(loc=4)

    # Build the matrix from the predictions passed in rather than from the notebook-level
    # `model` / `X_test`, so the plot always corresponds to the function's arguments.
    disp = ConfusionMatrixDisplay.from_predictions(y_test, y_predict, ax=axs[1], colorbar=False)
    disp.ax_.set_title("Calibrated Random Forest")
    plt.show()
    return fpr, tpr, thresholds

NameError: name 'plt' is not defined

In [None]:
# `y_pred_proba` is not defined in any of the cells shown above, so compute it here:
# the probability of the second class in `model.classes_`
# (assumed to be the positive "Burnt" class — TODO confirm).
y_pred_proba = model.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = plot_metrics(y_test, y_pred_proba, y_predict_test)

In [None]:
# Pick the ROC threshold that maximises TPR - FPR (Youden's J statistic).
# NOTE(review): the threshold is selected on the test set, so the report below is optimistic.
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)

# print() the report: as a bare last expression the string would be displayed as its repr
# (quoted, with escaped newlines), which is hard to read.
print(make_classification_report(y_test, y_pred_proba, optimal_threshold))