In [None]:
%matplotlib inline

import json
import os
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump
from odc.io.cgroups import get_cpu_quota
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, balanced_accuracy_score, f1_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_score,
    cross_validate,
)
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

## Read in training data and label dictionary

### Define the data and label paths

In [None]:
# Class-label dictionary (JSON) written by the previous step
labels_path = "results/class_labels.json"

# Training data table (text file) written by the previous step
data_path = "results/training_data_multipixel.txt"

### Load the data and identify the feature columns for the model

In [None]:
# Load the numeric training table (np.loadtxt ignores the "#"-prefixed header)
model_input = np.loadtxt(data_path)

# Read the header line separately to recover the column names
with open(data_path, "r") as file:
    header = file.readline()

# Remove the comment symbol from the header, then extract label and feature
# names. Stripping the marker itself (rather than discarding the first
# whitespace-separated token) handles both "# label f1 ..." and "#label f1 ...",
# so the first column name is never lost.
column_names = header.strip().lstrip("#").split()

# The first column holds the class label; every other column is a feature
label_col = column_names[0]
feature_cols = column_names[1:]

print(f"Label column:\n{label_col}\n")
print(f"Feature columns:\n{feature_cols}\n")

# Positions of the feature columns within the full training table
model_col_indices = [column_names.index(var_name) for var_name in feature_cols]

# Read the class label dictionary
with open(labels_path, "r") as json_file:
    labels_dict = json.load(json_file)

In [None]:
# Display the class-label dictionary loaded from labels_path
labels_dict

## Configure settings for model experiment

In [None]:
# Name of this experiment; used as a prefix for the files written to results/
experiment_name = "exp_multipixel_allfeatures_removecorrfeaturesgt0p9"

# Decide which features to use
use_all_features = True

if use_all_features:
    columns_to_use = feature_cols  # everything
else:
    # Explicit feature list: comment out any non-desired features
    columns_to_use = [
        "blue_s2_Q3_2021",
        "green_s2_Q3_2021",
        "red_s2_Q3_2021",
        "nir_s2_Q3_2021",
        "swir_1_s2_Q3_2021",
        "swir_2_s2_Q3_2021",
        "red_edge_1_s2_Q3_2021",
        "red_edge_2_s2_Q3_2021",
        "red_edge_3_s2_Q3_2021",
        "NDVI_s2_Q3_2021",
        "LAI_s2_Q3_2021",
        "SAVI_s2_Q3_2021",
        "MSAVI_s2_Q3_2021",
        "MNDWI_s2_Q3_2021",
        "blue_s2_Q4_2021",
        "green_s2_Q4_2021",
        "red_s2_Q4_2021",
        "nir_s2_Q4_2021",
        "swir_1_s2_Q4_2021",
        "swir_2_s2_Q4_2021",
        "red_edge_1_s2_Q4_2021",
        "red_edge_2_s2_Q4_2021",
        "red_edge_3_s2_Q4_2021",
        "NDVI_s2_Q4_2021",
        "LAI_s2_Q4_2021",
        "SAVI_s2_Q4_2021",
        "MSAVI_s2_Q4_2021",
        "MNDWI_s2_Q4_2021",
        "blue_s2_Q1_2022",
        "green_s2_Q1_2022",
        "red_s2_Q1_2022",
        "nir_s2_Q1_2022",
        "swir_1_s2_Q1_2022",
        "swir_2_s2_Q1_2022",
        "red_edge_1_s2_Q1_2022",
        "red_edge_2_s2_Q1_2022",
        "red_edge_3_s2_Q1_2022",
        "NDVI_s2_Q1_2022",
        "LAI_s2_Q1_2022",
        "SAVI_s2_Q1_2022",
        "MSAVI_s2_Q1_2022",
        "MNDWI_s2_Q1_2022",
        "blue_s2_annual_2021",
        "green_s2_annual_2021",
        "red_s2_annual_2021",
        "nir_s2_annual_2021",
        "swir_1_s2_annual_2021",
        "swir_2_s2_annual_2021",
        "red_edge_1_s2_annual_2021",
        "red_edge_2_s2_annual_2021",
        "red_edge_3_s2_annual_2021",
        "smad_s2_annual_2021",
        "emad_s2_annual_2021",
        "bcmad_s2_annual_2021",
        "NDVI_s2_annual_2021",
        "LAI_s2_annual_2021",
        "SAVI_s2_annual_2021",
        "MSAVI_s2_annual_2021",
        "MNDWI_s2_annual_2021",
        "blue_s2_semiannual_2021_01",
        "green_s2_semiannual_2021_01",
        "red_s2_semiannual_2021_01",
        "nir_s2_semiannual_2021_01",
        "swir_1_s2_semiannual_2021_01",
        "swir_2_s2_semiannual_2021_01",
        "red_edge_1_s2_semiannual_2021_01",
        "red_edge_2_s2_semiannual_2021_01",
        "red_edge_3_s2_semiannual_2021_01",
        "smad_s2_semiannual_2021_01",
        "emad_s2_semiannual_2021_01",
        "bcmad_s2_semiannual_2021_01",
        "NDVI_s2_semiannual_2021_01",
        "LAI_s2_semiannual_2021_01",
        "SAVI_s2_semiannual_2021_01",
        "MSAVI_s2_semiannual_2021_01",
        "MNDWI_s2_semiannual_2021_01",
        "blue_s2_semiannual_2021_06",
        "green_s2_semiannual_2021_06",
        "red_s2_semiannual_2021_06",
        "nir_s2_semiannual_2021_06",
        "swir_1_s2_semiannual_2021_06",
        "swir_2_s2_semiannual_2021_06",
        "red_edge_1_s2_semiannual_2021_06",
        "red_edge_2_s2_semiannual_2021_06",
        "red_edge_3_s2_semiannual_2021_06",
        "smad_s2_semiannual_2021_06",
        "emad_s2_semiannual_2021_06",
        "bcmad_s2_semiannual_2021_06",
        "NDVI_s2_semiannual_2021_06",
        "LAI_s2_semiannual_2021_06",
        "SAVI_s2_semiannual_2021_06",
        "MSAVI_s2_semiannual_2021_06",
        "MNDWI_s2_semiannual_2021_06",
        "vv_s1_xrgm_Q3_2021",
        "vh_s1_xrgm_Q3_2021",
        "vv_s1_xrgm_Q4_2021",
        "vh_s1_xrgm_Q4_2021",
        "bs_median_Q3_2021",
        "pv_median_Q3_2021",
        "npv_median_Q3_2021",
        "ue_median_Q3_2021",
        "bs_median_Q4_2021",
        "pv_median_Q4_2021",
        "npv_median_Q4_2021",
        "ue_median_Q4_2021",
        "bs_median_Q1_2022",
        "pv_median_Q1_2022",
        "npv_median_Q1_2022",
        "ue_median_Q1_2022",
        "rainfall_mean_Q3_2021",
        "rainfall_mean_Q4_2021",
        "rainfall_mean_Q1_2022",
        "slope",
    ]

# Flag for removing correlated features: the features to drop are identified on
# the training data, and the same features are then removed from the test data
# for evaluation.
remove_correlated_features = True
# Absolute correlation above which a feature is dropped.
# NOTE(review): experiment_name hard-codes "gt0p9"; keep the two in sync.
removal_threshold = 0.9

## Convert model input into sklearn format

In [None]:
# Wrap the raw array in a labelled table so columns can be selected by name
model_input_df = pd.DataFrame(model_input, columns=column_names)

# Feature matrix: drop the label column, then keep only the selected features
X = model_input_df.drop(columns=[label_col]).loc[:, columns_to_use].to_numpy()

# Label vector, flattened to one dimension
y = model_input_df.loc[:, [label_col]].to_numpy().ravel()

# Number of samples available for each class
model_input_df[label_col].value_counts()

In [None]:
# Total number of rows (training samples) in the input table
len(model_input_df)

## Fit, tune and evaluate multiple models using nested cross-validation

This step allows us to train and tune multiple models on fixed subsets of our data.

When performing cross validation, data is split into `n` folds. One fold is kept aside as test data, and the rest is used to train a model. This step is repeated until each fold has been used as a test set, with the model trained on the remaining `n - 1` folds each time. From each fold, we get an estimate of the performance, which can be averaged to understand expected performance of a model on unseen data.

Nested cross-validation introduces an additional step. Each set of training data is split into `m` further folds, and one is kept aside as test data specifically for fitting hyperparameters. The best parameters identified across the `m` folds are then passed to the performance estimation folds.

These steps are shown in the image below, with the larger green folds showing the performance step with `n=3` folds, and the hyperparameter tuning step with `m=4` folds.

<img align="center" src="../../../Supplementary_data/Scalable_machine_learning/nested_CV.png" width="500">


### Get number of cpus available for nested cross-validation

In [None]:
# get_cpu_quota() may return None when no CPU limit is configured for this
# environment (see the odc.io.cgroups documentation); round(None) would raise a
# TypeError, so fall back to the machine's CPU count in that case.
cpu_quota = get_cpu_quota()
if cpu_quota is None:
    ncpus = os.cpu_count() or 1
else:
    # Never report fewer than one CPU (a fractional quota could round to 0)
    ncpus = max(1, round(cpu_quota))
print("ncpus = " + str(ncpus))

### Construct the models and their parameter grids for tuning

In [None]:
# Create a list to store (name, estimator, parameter grid) tuples. Grid keys are
# prefixed with "model__" because the estimator is later placed in a Pipeline
# under the step name "model".
models = []


# Random forest grid and model
model_name = "RandomForest"

rf_param_grid = {
    "model__class_weight": ["balanced", None],
    "model__max_features": ["sqrt", "log2", None],
    "model__n_estimators": [200, 300, 400],
    "model__criterion": ["gini", "entropy"],
}

models.append((model_name, RandomForestClassifier(n_jobs=1), rf_param_grid))

# Ada Boost grid and model
model_name = "AdaBoost"

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` parameter to `estimator`
# and 1.4 removed the old name. Pick whichever the installed version exposes so
# that the grid is valid on both old and new releases.
ab_estimator_param = (
    "estimator"
    if "estimator" in AdaBoostClassifier().get_params(deep=False)
    else "base_estimator"
)

ab_param_grid = {
    f"model__{ab_estimator_param}": [
        DecisionTreeClassifier(max_depth=i) for i in [1, 3, 10]
    ],
    "model__n_estimators": [10, 100, 1000],
    "model__learning_rate": [0.01, 0.1, 1],
}

models.append((model_name, AdaBoostClassifier(), ab_param_grid))

### Create pipeline elements

In [None]:
## Transformer for removing correlated variables


class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):
    """Transformer that removes feature columns highly correlated with others.

    ``fit`` labels the incoming array with ``cols``, computes the absolute
    pairwise correlation matrix and records every column whose correlation
    with any *earlier* column exceeds ``removal_threshold``. ``transform``
    drops those recorded columns and returns the remaining values.

    Parameters
    ----------
    cols
        Column labels to assign to the input array, in column order.
    removal_threshold
        Absolute correlation above which a column is marked for removal.
    to_drop
        Labels of the columns to remove. Overwritten by ``fit``.
    """

    def __init__(self, cols, removal_threshold=0.9, to_drop=None):
        self.cols = cols
        self.removal_threshold = removal_threshold
        self.to_drop = to_drop

    def fit(self, X, y=None):
        """Identify the columns of ``X`` to drop; ``y`` is ignored. Returns ``self``."""
        frame = pd.DataFrame(X.copy(), columns=self.cols)
        abs_corr = frame.corr().abs()

        # Keep only the strict upper triangle so each pair is considered once
        # and a column is only ever compared against the columns before it.
        upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
        upper_tri = abs_corr.where(upper_mask)

        flagged = []
        for column in upper_tri.columns:
            if any(upper_tri[column] > self.removal_threshold):
                flagged.append(column)

        self.to_drop = flagged
        return self

    def transform(self, X, y=None):
        """Return the values of ``X`` without the columns listed in ``to_drop``."""
        frame = pd.DataFrame(X.copy(), columns=self.cols)
        reduced = frame.drop(self.to_drop, axis=1)
        return reduced.values

### Perform nested cross-validation

In [None]:
# Create empty dictionaries to store outputs, keyed by model name
results = {}
outer_cv_test_pairs = {}
pipelines = {}

# Only run a single trial for each algorithm, so set a single seed to use for
# selecting folds and a single seed for the estimators' own randomness
cv_seed = 13
model_seed = 32

# Set number of splits to do
inner_cv_splits = 3
outer_cv_splits = 3

# Number of jobs to pass to the inner cross validation loop. The outer loop
# below runs serially; n_jobs_outer CPUs are held back from the grid search.
# Clamp to at least 1: when ncpus <= n_jobs_outer the difference is zero or
# negative, and joblib rejects n_jobs=0.
n_jobs_outer = 3
n_jobs_inner = max(1, ncpus - n_jobs_outer)

for name, model, p_grid in models:
    print(f"Running {name}")

    # Seed the estimator with model_seed so that repeated runs fit identical
    # models. Note this updates the estimator object stored in `models` in place.
    if "random_state" in model.get_params(deep=False):
        model.set_params(random_state=model_seed)

    # Build the pipeline: optionally drop correlated features before the model
    steps = []
    if remove_correlated_features:
        steps.append(
            (
                "drop_corr_features",
                DropCorrelatedFeatures(
                    columns_to_use, removal_threshold=removal_threshold
                ),
            )
        )
    steps.append(("model", model))

    pipeline = Pipeline(steps=steps)
    pipelines[name] = pipeline

    # Create the outer_cv for each model so that the same data is fitted
    outer_cv = StratifiedKFold(
        n_splits=outer_cv_splits, shuffle=True, random_state=cv_seed
    )

    # Create dictionaries to store testing arrays and estimators for each split
    model_cv_test_pairs = {}
    model_best_estimators = {}

    # Loop over the outer split
    for outer_split_number, (train_index, test_index) in enumerate(
        outer_cv.split(X, y)
    ):
        print(f"running Outer Split {outer_split_number}")

        X_train, X_test = X[train_index, :], X[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        # Create inner cv for each outer cv
        inner_cv = StratifiedKFold(
            n_splits=inner_cv_splits, shuffle=True, random_state=cv_seed
        )

        # Create grid search over the pipeline's hyperparameters
        clf = GridSearchCV(
            pipeline,
            param_grid=p_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=n_jobs_inner,
        )

        print("    fitting inner CV loop")
        # Fit to training data
        clf.fit(X_train, y_train)

        # Predict the held-out outer fold with the best pipeline found
        best_model = clf.best_estimator_
        print("performing prediction")
        y_pred = best_model.predict(X_test)

        # Calculate metrics
        test_f1_macro = f1_score(y_test, y_pred, average="macro")

        # Store the results
        model_best_estimators[f"split_{outer_split_number}"] = {
            "best_estimator": best_model,
            "f1_macro_score": test_f1_macro,
        }

        # Store the true and predicted arrays
        model_cv_test_pairs[f"split_{outer_split_number}"] = (y_test, y_pred)

    # Capture results out
    outer_cv_test_pairs[name] = model_cv_test_pairs
    results[name] = model_best_estimators

### Display average scores over outer folds

In [None]:
# Summarise the outer-fold macro F1 scores (mean and standard deviation) per model
for key, split_results in results.items():
    scores = np.array(
        [entry["f1_macro_score"] for entry in split_results.values()]
    )
    mean = scores.mean()
    std = scores.std()

    print(f"Average F1 Macro for {key} = {mean:.2f} with std. dev. of {std:.2f}.")

## Investigate results

One of the most useful ways to get insight into the performance of machine learning classifiers is to view the confusion matrix, which counts the number of points in each predicted class as a function of the true class.

In [None]:
# Tick labels shared by every confusion matrix.
# NOTE(review): assumes the key order of labels_dict matches the sorted class
# values present in y — confirm against the previous notebook.
display_labels = list(labels_dict.keys())

for name, model, p_grid in models:

    val_file = f"results/{experiment_name}_{name}_confusionmat_values.png"
    norm_file = f"results/{experiment_name}_{name}_confusionmat_normalised.png"

    # One row of panels per figure, one panel per outer fold
    val_fig, val_ax = plt.subplots(1, outer_cv_splits, figsize=(8 * outer_cv_splits, 7))
    val_fig.suptitle(f"{name} Confusion Matrix (Value)", fontsize=16)
    norm_fig, norm_ax = plt.subplots(
        1, outer_cv_splits, figsize=(8 * outer_cv_splits, 7)
    )
    norm_fig.suptitle(f"{name} Confusion Matrix (Row Normalised)", fontsize=16)

    for i in range(outer_cv_splits):
        y_true, y_pred = outer_cv_test_pairs[name][f"split_{i}"]

        # Plot unnormalised confusion matrix (raw counts)
        val_ax[i].set_title(f"Fold {i}")
        ConfusionMatrixDisplay.from_predictions(
            y_true,
            y_pred,
            normalize=None,
            ax=val_ax[i],
            colorbar=False,
            display_labels=display_labels,
            xticks_rotation="vertical",
        )

        # Plot normalised confusion matrix (each row sums to 1)
        norm_ax[i].set_title(f"Fold {i}")
        ConfusionMatrixDisplay.from_predictions(
            y_true,
            y_pred,
            normalize="true",
            ax=norm_ax[i],
            colorbar=False,
            display_labels=display_labels,
            xticks_rotation="vertical",
        )

    # Save each figure once, after every fold's panel has been drawn, instead
    # of rewriting the same 300-dpi file on every loop iteration
    val_fig.savefig(val_file, dpi=300, bbox_inches="tight", facecolor="white")
    norm_fig.savefig(norm_file, dpi=300, bbox_inches="tight", facecolor="white")

## Hyperparameter tuning with outer folds

After running nested cross-validation, we have selected the best model and understood the performance we can expect to see on new data.

Once the model is decided, we can run hyperparameter tuning using the outer folds only, allowing us to tune the model with additional data.

In [None]:
# Display the (name, estimator, parameter grid) tuples that were evaluated
models

In [None]:
# Tune the hyperparameters of the chosen model on the outer folds

metric = "f1_macro"
name, model, p_grid = models[0]  # Use random forest

# Same outer splitter configuration as in the nested cross-validation
outer_cv = StratifiedKFold(n_splits=outer_cv_splits, shuffle=True, random_state=cv_seed)

# Instantiate a GridSearchCV that uses the outer cross-validation folds
clf = GridSearchCV(
    estimator=pipelines[name],
    param_grid=p_grid,
    scoring=metric,
    cv=outer_cv.split(X, y),
    n_jobs=ncpus,
    verbose=1,
)

# Run the search over all of the data
clf.fit(X, y)

best_score_rounded = round(clf.best_score_, 2)

print("The most accurate combination of tested parameters is: ")
pprint(clf.best_params_)
print("\n")
print("The " + metric + " score using these parameters is: ")
print(best_score_rounded)

In [None]:
# Features kept by the tuned pipeline: every selected column unless the
# correlation filter is active, in which case its dropped columns are excluded
remaining_cols = columns_to_use
if remove_correlated_features:
    removed_cols = clf.best_estimator_["drop_corr_features"].to_drop
    remaining_cols = [col for col in columns_to_use if col not in removed_cols]

remaining_cols

## Final model fit

The cross-validation steps have allowed us to pick the best performing model on unseen data and further tune that model. The final step is to fit the model to all of the data, using the best parameters.

In [None]:
# Transform data and fit new model. The "drop_corr_features" step only exists
# in the pipeline when remove_correlated_features is set; indexing it
# unconditionally would raise a KeyError otherwise, so fall back to the
# untransformed features (matching how remaining_cols is derived).
if remove_correlated_features:
    X_transformed = clf.best_estimator_["drop_corr_features"].transform(X)
else:
    X_transformed = X

# Take the tuned classifier out of the best pipeline and fit it on all the data
new_model = clf.best_estimator_["model"]
new_model.fit(X_transformed, y)

# Create results directory if it doesn't exist
os.makedirs("results", exist_ok=True)

# Export the final model for use in following notebooks
dump(new_model, f"results/{experiment_name}_{name}.joblib")

# Export the columns to use in the final model
with open(
    f"results/{experiment_name}_{name}_features.json", "w", encoding="utf-8"
) as f:
    json.dump({"features": remaining_cols}, f, ensure_ascii=False, indent=4)