## For each of our four classification models, we created an ensemble by **combining the predictions of all 25 task-specific classifiers**. This resulted in four model-specific ensembles, each integrating information from all handwriting tasks within a single classification framework.

## Let's do this for **Random Forest**

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under UCI repository id 732
darwin = fetch_ucirepo(id=732)

# Feature table and target table (pandas objects)
X = darwin.data.features
y = darwin.data.targets

# Remove the 'ID' column when the feature table carries one
X = X.drop(columns=['ID']) if 'ID' in X.columns else X

# Work with the targets as a Series: keep the first column of a DataFrame
y = y.iloc[:, 0] if isinstance(y, pd.DataFrame) else y

# Experiment layout: each task owns a contiguous group of feature columns
num_attributes_per_task = 18
num_tasks = 25

# How many independent shuffle/split/train/evaluate repetitions to perform
n_runs = 20

# One accumulator per reported metric; each receives one value per run
accuracies, precisions, recalls, f1_scores = [], [], [], []
specificities, sensitivities = [], []

# Two-way lookup between class labels and integer codes.  Codes follow the
# order in which the labels first appear in y.
unique_labels = y.unique()
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
reverse_class_mapping = dict(enumerate(unique_labels))

# Hyper-parameters shared by every per-task Random Forest
rf_params = dict(
    bootstrap=True,
    criterion='gini',
    max_depth=2,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=50,
)

# Every split keeps all columns of X, so the feature-count requirement is the
# same for each run: validate it once up front instead of once per run.
if X.shape[1] < num_attributes_per_task * num_tasks:
    raise ValueError("Not enough features in X_train for the specified number of tasks and attributes per task.")

# Column names belonging to each task: task i owns the i-th contiguous group
# of num_attributes_per_task columns.  These do not change between runs.
task_column_groups = [
    X.columns[i * num_attributes_per_task:(i + 1) * num_attributes_per_task]
    for i in range(num_tasks)
]

# Class labels in integer-code order; index 1 is treated as the positive class
# by both the sklearn scorers and the confusion-matrix indexing below.
# NOTE(review): class_mapping is built from y.unique(), i.e. in order of first
# appearance in y, so which class is "positive" depends on the row order of
# the dataset -- confirm that this is the intended positive class.
class_labels = list(class_mapping.keys())
positive_label = class_labels[1]

# Build every classifier straight from rf_params so that any hyper-parameter
# added to that dict is honoured; unseeded unless rf_params says otherwise.
clf_kwargs = {'random_state': None, **rf_params}

for run in range(n_runs):
    # Fresh stratified 80/20 split for this run (unseeded, so runs differ)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=None, stratify=y
    )

    # One row of integer-coded test predictions per task
    task_predictions = []

    # Train one Random Forest per task, using only that task's columns
    for task_columns in task_column_groups:
        clf = RandomForestClassifier(**clf_kwargs)
        clf.fit(X_train[task_columns], y_train)

        # Predict on the test set and convert to integer labels so the votes
        # can be counted with np.bincount
        task_pred = clf.predict(X_test[task_columns])
        task_pred_int = np.array([class_mapping[pred] for pred in task_pred])
        task_predictions.append(task_pred_int)

    # Shape: (number of tasks, number of test instances)
    task_predictions = np.array(task_predictions)

    # Majority voting: for each test instance (column), take the most common
    # integer code across all tasks; argmax resolves ties to the lowest code
    final_predictions_int = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=task_predictions)

    # Convert integer codes back to the original class labels
    final_predictions = np.array([reverse_class_mapping[pred] for pred in final_predictions_int])

    # Ensemble metrics for this run
    accuracies.append(accuracy_score(y_test, final_predictions))
    precisions.append(precision_score(y_test, final_predictions, pos_label=positive_label))
    recalls.append(recall_score(y_test, final_predictions, pos_label=positive_label))
    f1_scores.append(f1_score(y_test, final_predictions, pos_label=positive_label))

    # Confusion matrix with rows = true class, columns = predicted class,
    # both ordered as class_labels (index 1 = positive class)
    cm = confusion_matrix(y_test, final_predictions, labels=class_labels)

    # Extract confusion matrix components for the positive class
    TP = cm[1, 1]
    FN = cm[1, 0]
    FP = cm[0, 1]
    TN = cm[0, 0]

    # Sensitivity (true positive rate); 0 when the test set has no positives
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
    sensitivities.append(sensitivity)

    # Specificity (true negative rate); 0 when the test set has no negatives
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    specificities.append(specificity)

# Average each metric over all completed runs
mean_accuracy = np.mean(accuracies)
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)
mean_f1_score = np.mean(f1_scores)
mean_sensitivity = np.mean(sensitivities)
mean_specificity = np.mean(specificities)

# Report the averages.  The header states the number of runs that were
# actually averaged rather than a hard-coded figure, so it stays correct when
# the run count changes.
print(f"Average Metrics after {len(accuracies)} runs:")
print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f}")
print(f"Mean F1 Score: {mean_f1_score:.4f}")
print(f"Mean Sensitivity: {mean_sensitivity:.4f}")
print(f"Mean Specificity: {mean_specificity:.4f}")


Average Metrics after 20 runs:
Mean Accuracy: 0.8729
Mean Precision: 0.8384
Mean Recall: 0.9176
Mean F1 Score: 0.8747
Mean Sensitivity: 0.9176
Mean Specificity: 0.8306
