In [None]:
%matplotlib inline

import json
import os
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump
from odc.io.cgroups import get_cpu_quota
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, balanced_accuracy_score, f1_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_score,
    cross_validate,
)
from sklearn.tree import DecisionTreeClassifier

## Read in training data and label dictionary

### Define the data and label paths

In [None]:
# Class label dictionary produced by the previous step
labels_path = "results/class_labels.json"

# Training data table produced by the previous step
data_path = "results/training_data_mlfeatureeng.txt"

### Load the data and identify the feature columns for the model

In [None]:
# Load the numeric training table. np.loadtxt skips the "#"-prefixed header line;
# ndmin=2 keeps the array two-dimensional even if the file holds a single sample.
model_input = np.loadtxt(data_path, ndmin=2)

# The column names live in the commented header on the first line of the file
with open(data_path, "r") as file:
    header = file.readline()

# Strip the comment marker whether or not it is followed by a space
# (both "# label f1 f2" and "#label f1 f2" parse), then split into names
column_names = header.strip().lstrip("#").split()

# Fail early, with a clear message, if the header does not describe the data
if len(column_names) != model_input.shape[1]:
    raise ValueError(
        f"Header of {data_path} names {len(column_names)} columns "
        f"but the data has {model_input.shape[1]}"
    )

# The first column holds the class label; the remaining columns are features
label_col = column_names[0]
feature_cols = column_names[1:]

print(f"Label column:\n{label_col}\n")
print(f"Feature columns:\n{feature_cols}\n")

# Positions of the feature columns within the training array
model_col_indices = [column_names.index(var_name) for var_name in feature_cols]

# Read the class label dictionary
with open(labels_path, "r") as json_file:
    labels_dict = json.load(json_file)

In [None]:
# Display the class label dictionary that was loaded from labels_path
labels_dict

## Convert model input into sklearn format

In [None]:
# Wrap the training array in a DataFrame so columns can be addressed by name
model_input_df = pd.DataFrame(data=model_input, columns=column_names)

# Features: every column except the label, as a 2-D array
X = model_input_df.drop(columns=label_col).to_numpy()

# Labels: the label column flattened to a 1-D array
y = model_input_df[label_col].to_numpy().ravel()

# Show how many samples there are for each class
model_input_df[label_col].value_counts()

## Fit, tune and evaluate multiple models using nested cross-validation

This step allows us to train and tune multiple models on fixed subsets of our data.

When performing cross validation, data is split into `n` folds. One fold is kept aside as test data, and the rest is used to train a model. This step is repeated until each fold has been used as a test set, with the model trained on the remaining `n-1` folds each time. From each fold, we get an estimate of the performance, and these estimates can be averaged to understand the expected performance of a model on unseen data.

Nested cross-validation introduces an additional step. Each set of training data is split into `m` further folds, and one is kept aside as test data specifically for fitting hyperparameters. The best parameters identified across the `m` folds are then passed to the performance estimation folds.

These steps are shown in the image below: the larger green folds show the performance estimation step with `n=3` folds, and the hyperparameter tuning step uses `m=4` folds.

<img align="center" src="../../../Supplementary_data/Scalable_machine_learning/nested_CV.png" width="500">


### Get number of cpus available for nested cross-validation

In [None]:
# get_cpu_quota() is documented as returning None when no CPU quota is set or
# it cannot be read, which would make round() raise a TypeError. Defensively,
# a non-positive value is also treated as "no usable quota". In both cases
# fall back to the CPU count reported by the operating system.
cpu_quota = get_cpu_quota()
if cpu_quota is None or cpu_quota <= 0:
    ncpus = os.cpu_count() or 1
else:
    # A fractional quota below 0.5 would round to zero workers; keep at least one
    ncpus = max(1, round(cpu_quota))
print("ncpus = " + str(ncpus))

### Construct the models and their parameter grids for tuning

In [None]:
# Create a list to store the candidate models as
# (name, unfitted estimator, hyperparameter grid) tuples
models = []


# Random forest grid and model
model_name = "RandomForest"

rf_param_grid = {
    "class_weight": ["balanced", None],
    # "sqrt" is what "auto" meant for classifiers. "auto" was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it makes every candidate that
    # uses it fail to fit.
    "max_features": ["sqrt", "log2", None],
    "n_estimators": [200, 300, 400],
    "criterion": ["gini", "entropy"],
}

# n_jobs=1: parallelism is applied by the cross-validation loops, not per forest
models.append((model_name, RandomForestClassifier(n_jobs=1), rf_param_grid))

# Ada Boost grid and model
model_name = "AdaBoost"

# The weak-learner parameter was renamed from "base_estimator" to "estimator"
# in scikit-learn 1.2 and the old name was removed in 1.4. Pick whichever name
# the installed version exposes so the grid is valid on both sides of the rename.
ab_estimator_param = (
    "estimator"
    if "estimator" in AdaBoostClassifier().get_params()
    else "base_estimator"
)

ab_param_grid = {
    ab_estimator_param: [DecisionTreeClassifier(max_depth=i) for i in [1, 3, 10]],
    "n_estimators": [10, 100, 1000],
    "learning_rate": [0.01, 0.1, 1],
}

models.append((model_name, AdaBoostClassifier(), ab_param_grid))

### Perform nested cross-validation

In [None]:
# Scorer names used to report performance on each outer fold
metrics = [
    "balanced_accuracy",
    "f1_macro",
]

In [None]:
# Container for the per-model outputs of nested cross-validation
results = {}

# Only run a single trial for each algorithm, so use one fixed seed for
# selecting folds and one for the randomness inside the models themselves
cv_seed = 13
model_seed = 32

# Set number of splits to do
inner_cv_splits = 3
outer_cv_splits = 3

# Number of parallel jobs for the outer loop (cross_validate) and for the inner
# loop (GridSearchCV). The inner count is clamped to at least 1: joblib raises
# an error for n_jobs=0 and interprets negative values as "all CPUs but k",
# which is not what the subtraction is meant to express on small machines.
n_jobs_outer = 3
n_jobs_inner = max(1, ncpus - n_jobs_outer)

# Construct the outer cross validation strategy. With an integer random_state
# every call to split() yields the same folds, so it is built once and shared
# by all models.
outer_cv = StratifiedKFold(
    n_splits=outer_cv_splits, shuffle=True, random_state=cv_seed
)

# Store the held-out samples of each OUTER fold. The estimator returned for
# outer fold i never saw these samples, so they are the ones to evaluate it on
# later. (The inner folds are subsets of each outer training set and cannot be
# reproduced by splitting the full X, y.)
test_indices = [test_idx for _, test_idx in outer_cv.split(X, y)]

for name, model, p_grid in models:
    print(f"Running {name}")

    # Seed the model's own randomness so repeated runs give the same result
    if "random_state" in model.get_params():
        model.set_params(random_state=model_seed)

    # Construct the inner cross validation strategy and activity (GridSearchCV)
    inner_cv = StratifiedKFold(
        n_splits=inner_cv_splits, shuffle=True, random_state=cv_seed
    )

    # NOTE(review): the inner search optimises "f1_weighted" while the outer
    # loop reports the scorers in `metrics` - confirm this is intended.
    clf = GridSearchCV(
        estimator=model,
        param_grid=p_grid,
        scoring="f1_weighted",
        cv=inner_cv,
        n_jobs=n_jobs_inner,
    )

    # Run the outer cross validation activity (cross_validate)
    # Request that cross_validate returns the fitted GridSearchCV of each fold
    scores_array = cross_validate(
        clf,
        X=X,
        y=y,
        cv=outer_cv,
        scoring=metrics,
        return_estimator=True,
        n_jobs=n_jobs_outer,
    )

    for metric in metrics:
        mean = scores_array[f"test_{metric}"].mean()
        std = scores_array[f"test_{metric}"].std()

        print(f"Average {metric} = {mean:.2f} with std. dev. of {std:.2f}.")
    print("\n")

    # Add the best model and best parameters for each outer split
    results[f"{name}_best_estimators"] = [
        fold_search.best_estimator_ for fold_search in scores_array["estimator"]
    ]
    results[f"{name}_best_parameters"] = [
        fold_search.best_params_ for fold_search in scores_array["estimator"]
    ]

## Investigate results

One of the most useful ways to get insight into the performance of machine learning classifiers is to view the confusion matrix, which counts the number of points in each predicted class as a function of the true class.

In [None]:
# For every model, draw one confusion matrix per outer fold (raw counts and
# row-normalised) and save each set of panels as a single PNG.
for name, model, p_grid in models:

    clf_list = results[f"{name}_best_estimators"]
    # One panel per returned estimator, i.e. per outer cross-validation fold
    n_folds = len(clf_list)
    val_file = f"results/{name}_confusionmat_values_mlfeatureeng.png"
    norm_file = f"results/{name}_confusionmat_normalised_mlfeatureeng.png"

    # squeeze=False always returns an array of axes (also for a single fold);
    # ravel() then gives a flat list of panels to index by fold number
    val_fig, val_ax = plt.subplots(
        1, n_folds, figsize=(8 * n_folds, 7), squeeze=False
    )
    val_ax = val_ax.ravel()
    val_fig.suptitle(f"{name} Confusion Matrix (Value)", fontsize=16)
    norm_fig, norm_ax = plt.subplots(
        1, n_folds, figsize=(8 * n_folds, 7), squeeze=False
    )
    norm_ax = norm_ax.ravel()
    norm_fig.suptitle(f"{name} Confusion Matrix (Row Normalised)", fontsize=16)

    for i, clf in enumerate(clf_list):
        # Assumes test_indices[i] holds the samples held out from the fold that
        # clf_list[i] was fitted on; otherwise the matrix is over-optimistic
        test_idx = test_indices[i]
        X_test = X[test_idx, :]
        y_test = y[test_idx]

        # NOTE(review): display_labels relies on the key order of labels_dict
        # matching the sorted class values seen by the estimator - confirm.

        # Plot unnormalised confusion matrix
        val_ax[i].set_title(f"Fold {i}")
        ConfusionMatrixDisplay.from_estimator(
            clf,
            X_test,
            y_test,
            normalize=None,
            ax=val_ax[i],
            colorbar=False,
            display_labels=list(labels_dict.keys()),
            xticks_rotation="vertical",
        )

        # Plot normalised confusion matrix
        norm_ax[i].set_title(f"Fold {i}")
        ConfusionMatrixDisplay.from_estimator(
            clf,
            X_test,
            y_test,
            normalize="true",
            ax=norm_ax[i],
            colorbar=False,
            display_labels=list(labels_dict.keys()),
            xticks_rotation="vertical",
        )

    # Save each figure once, after all of its panels have been drawn
    val_fig.savefig(val_file, dpi=300, bbox_inches="tight", facecolor="white")
    norm_fig.savefig(norm_file, dpi=300, bbox_inches="tight", facecolor="white")

## Hyperparameter tuning with outer folds

After running nested cross-validation, we have selected the best model and understood the performance we can expect to see on new data.

Once the model is decided, we can run hyperparameter tuning using the outer folds only, allowing us to tune the model with additional data.

In [None]:
# Get best estimated params for RF model

# Same splitter settings as the outer loop of the nested cross-validation
outer_cv = StratifiedKFold(n_splits=outer_cv_splits, shuffle=True, random_state=cv_seed)

metric = "f1_macro"

# Instantiate a GridSearchCV using the outer cross-validation folds.
# The forest is seeded so the selected parameters are reproducible, and the
# splitter object is passed directly (with an integer random_state it yields
# the same folds on every call) rather than a one-shot generator.
clf = GridSearchCV(
    RandomForestClassifier(n_jobs=1, random_state=model_seed),
    rf_param_grid,
    scoring=metric,
    verbose=1,
    cv=outer_cv,
    n_jobs=ncpus,
)

# Fit the gridsearch on outer cross-validation folds
clf.fit(X, y)

print("The most accurate combination of tested parameters is: ")
pprint(clf.best_params_)
print("\n")
print("The " + metric + " score using these parameters is: ")
print(round(clf.best_score_, 2))

## Final model fit

The cross-validation steps have allowed us to pick the best performing model on unseen data and further tune that model. The final step is to fit the model to all of the data, using the best parameters.

In [None]:
# Create the results directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race, and doing this first means a path problem surfaces
# before the model fit rather than after it.
os.makedirs("results", exist_ok=True)

# Create a new model with the best parameters found by the grid search and
# fit it to all of the data
new_model = RandomForestClassifier(**clf.best_params_, random_state=1, n_jobs=ncpus)
new_model.fit(X, y)

# Export the final model for use in following notebooks
dump(new_model, "results/randomforest_model.joblib")