# Initial imports

In [1]:
import os
from pathlib import Path
import joblib
import numpy as np
from imblearn.metrics import geometric_mean_score

cwd = Path.cwd()
print(cwd)
prepped_data_dir = cwd / "PreppedData"

# If current path does not end in "PreppedData", move into it
if cwd.name != "PreppedData":
    if prepped_data_dir.exists():
        os.chdir(prepped_data_dir)
        print(f"Changed working directory to: {prepped_data_dir}")
    else:
        raise FileNotFoundError(f"'Annotations' directory not found at: {prepped_data_dir}")
else:
    print(f"Already in {cwd.name} directory.")

/home/fs1620/MLBD_2024_25/ResearchProject/LiaDataAnalysis_final/FeatureSelection
Changed working directory to: /home/fs1620/MLBD_2024_25/ResearchProject/LiaDataAnalysis_final/FeatureSelection/PreppedData


# Checks for discrepancies in metric computations

I suspect there are some shenanigans with how `imblearn`'s `geometric_mean_score` computes results with `average='macro'` or `'weighted'`. There are some slight numerical discrepancies as can be seen below. We compare the results given by the theoretical formulae for macro and weighted G-mean:

---

### Macro G-mean

For $K$ classes with per-class recalls (sensitivities) $r_i$:

$$\text{G-mean}_{\text{macro}} = \left( \prod_{i=1}^{K} r_i \right)^{1/K}$$

Each class contributes equally, regardless of its support.

---

### Weighted G-mean

For class $i$ with support (number of true instances) $n_i$:

$$\text{G-mean}_{\text{weighted}} = \left( \prod_{i=1}^{K} r_i^{\, n_i} \right)^{1 / \sum_{i=1}^{K} n_i}$$

This is a **weighted geometric mean**, where classes with more samples have a larger influence on the final value.

We can only guess that the discrepancies arise due to some internal numerical smoothing applied by `imblearn` (at least that's what the [source code suggests](https://github.com/scikit-learn-contrib/imbalanced-learn/blob/master/imblearn/metrics/_classification.py#L548)).

In [2]:
# --- Multiclass case ---

# --- Load CV results ---
config = 'config2'
model_name = 'lda'
model_results = joblib.load(Path.cwd()/f'ResultsCV_{config}'/f'{model_name}_results_{config}.pkl')
all_y_true, all_y_pred = model_results['all_y_true'], model_results['all_y_pred']
print('CV results loaded.')

# --- Unique classes and counts ---
classes, counts = np.unique(all_y_true, return_counts=True)

print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")
print("\n")

# --- Per-class G-mean using imblearn (smoothed internally) ---
gmean_none = geometric_mean_score(all_y_true, all_y_pred, average=None)
print("--- Per-class Sensitivities (G-mean none) ---")
print(gmean_none)
print("\n")

# --- Macro and weighted G-mean using imblearn ---
gmean_macro = geometric_mean_score(all_y_true, all_y_pred, average='macro')
gmean_weighted = geometric_mean_score(all_y_true, all_y_pred, average='weighted')

# --- Manual calculation matching imblearn ---
# Macro G-mean: geometric mean of per-class sensitivities (from gmean_none)
manual_macro = gmean_none.prod()**(1/len(gmean_none))

# Weighted G-mean: weighted geometric mean using class supports
weights = counts / counts.sum()
manual_weighted = np.prod(gmean_none ** weights)

print("--- G-mean Macro ---")
print("Manual macro G-mean (matches imblearn):", manual_macro)
print("Imbalanced-learn macro G-mean:", gmean_macro)
print("\n")

print("--- G-mean Weighted ---")
print("Manual weighted G-mean (matches imblearn):", manual_weighted)
print("Imbalanced-learn weighted G-mean:", gmean_weighted)

CV results loaded.

Class supports (number of true instances per class):
Class 0: 1734
Class 1: 895
Class 2: 436


--- Per-class Sensitivities (G-mean none) ---
[0.86153508 0.84048379 0.91152513]


--- G-mean Macro ---
Manual macro G-mean (matches imblearn): 0.870676824713551
Imbalanced-learn macro G-mean: 0.8741303181152683


--- G-mean Weighted ---
Manual weighted G-mean (matches imblearn): 0.862224378826243
Imbalanced-learn weighted G-mean: 0.8659670467311678


In [3]:
# --- Multiclass case ---

# --- Load CV results ---
config = 'config3'
model_name = 'lda'
model_results = joblib.load(Path.cwd()/f'ResultsCV_{config}'/f'{model_name}_results_{config}.pkl')
all_y_true, all_y_pred = model_results['all_y_true'], model_results['all_y_pred']
print('CV results loaded.')

# --- Unique classes and counts ---
classes, counts = np.unique(all_y_true, return_counts=True)

print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")
print("\n")

# --- Per-class G-mean using imblearn (smoothed internally) ---
gmean_none = geometric_mean_score(all_y_true, all_y_pred, average=None)
print("--- Per-class Sensitivities (G-mean none) ---")
print(gmean_none)
print("\n")

# --- Macro and weighted G-mean using imblearn ---
gmean_macro = geometric_mean_score(all_y_true, all_y_pred, average='macro')
gmean_weighted = geometric_mean_score(all_y_true, all_y_pred, average='weighted')

# --- Manual calculation matching imblearn ---
# Macro G-mean: geometric mean of per-class sensitivities (from gmean_none)
manual_macro = gmean_none.prod()**(1/len(gmean_none))

# Weighted G-mean: weighted geometric mean using class supports
weights = counts / counts.sum()
manual_weighted = np.prod(gmean_none ** weights)

print("--- G-mean Macro ---")
print("Manual macro G-mean (matches imblearn):", manual_macro)
print("Imbalanced-learn macro G-mean:", gmean_macro)
print("\n")

print("--- G-mean Weighted ---")
print("Manual weighted G-mean (matches imblearn):", manual_weighted)
print("Imbalanced-learn weighted G-mean:", gmean_weighted)

CV results loaded.

Class supports (number of true instances per class):
Class 0: 1637
Class 1: 895
Class 2: 363
Class 3: 73
Class 4: 97


--- Per-class Sensitivities (G-mean none) ---
[0.86811752 0.85238776 0.9396231  0.94061502 0.94032714]


--- G-mean Macro ---
Manual macro G-mean (matches imblearn): 0.9073441789132661
Imbalanced-learn macro G-mean: 0.9091047626813133


--- G-mean Weighted ---
Manual weighted G-mean (matches imblearn): 0.875504965616811
Imbalanced-learn weighted G-mean: 0.8777141804894034


In [4]:
# --- Binary case ---

# --- Load CV results ---
config = 'config1'
model_name = 'lda'
model_results = joblib.load(Path.cwd()/f'ResultsCV_{config}'/f'{model_name}_results_{config}.pkl')
all_y_true, all_y_pred = model_results['all_y_true'], model_results['all_y_pred']
print('CV results loaded.')

# --- Unique classes and counts ---
classes, counts = np.unique(all_y_true, return_counts=True)

print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")
print("\n")

# --- Per-class G-mean using imblearn (smoothed internally) ---
gmean_none = geometric_mean_score(all_y_true, all_y_pred, average=None)
print("--- Per-class Sensitivities (G-mean none) ---")
print(gmean_none)
print("\n")

# --- Macro and weighted G-mean using imblearn ---
gmean_macro = geometric_mean_score(all_y_true, all_y_pred, average='macro')
gmean_weighted = geometric_mean_score(all_y_true, all_y_pred, average='weighted')

# --- Manual calculation matching imblearn ---
# Macro G-mean: geometric mean of per-class sensitivities (from gmean_none)
manual_macro = gmean_none.prod()**(1/len(gmean_none))

# Weighted G-mean: weighted geometric mean using class supports
weights = counts / counts.sum()
manual_weighted = np.prod(gmean_none ** weights)

print("--- G-mean Macro ---")
print("Manual macro G-mean (matches imblearn):", manual_macro)
print("Imbalanced-learn macro G-mean:", gmean_macro)
print("\n")

print("--- G-mean Weighted ---")
print("Manual weighted G-mean (matches imblearn):", manual_weighted)
print("Imbalanced-learn weighted G-mean:", gmean_weighted)

CV results loaded.

Class supports (number of true instances per class):
Class 0: 1734
Class 1: 1331


--- Per-class Sensitivities (G-mean none) ---
[0.86436415 0.86436415]


--- G-mean Macro ---
Manual macro G-mean (matches imblearn): 0.8643641454747898
Imbalanced-learn macro G-mean: 0.8652696284241367


--- G-mean Weighted ---
Manual weighted G-mean (matches imblearn): 0.8643641454747899
Imbalanced-learn weighted G-mean: 0.865253982326384


In [3]:
from sklearn.metrics import recall_score

# --- Per-class sensitivity (recall) ---
per_class_sens = recall_score(all_y_true, all_y_pred, average=None)
print("-"*3 + " Per-class Sensitivity (Recall) " + "-"*3)
for i, sens in enumerate(per_class_sens):
    print(f"Class {i}: {sens:.4f}")
print("\n")

# --- Macro sensitivity (unweighted mean of per-class recalls) ---
macro_sens_sk =  recall_score(all_y_true, all_y_pred, average='macro')
macro_sens = per_class_sens.mean()
print("-"*3 + " Macro Sensitivity " + "-"*3)
print("Macro sensitivity from sklearn:", macro_sens_sk)
print("Macro sensitivity (average of per-class recalls):", macro_sens)
print("\n")

# --- Weighted sensitivity (weighted mean by class support) ---
classes, counts = np.unique(all_y_true, return_counts=True)
weighted_sens = np.sum(per_class_sens * counts) / counts.sum()
weighted_sens_sk =  recall_score(all_y_true, all_y_pred, average='weighted')
print("-"*3 + " Weighted Sensitivity " + "-"*3)
print("Weighted sensitivity:", weighted_sens)
print("Weighted sensitivity:", weighted_sens_sk)

# --- Optional: per-class support counts ---
print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")

--- Per-class Sensitivity (Recall) ---
Class 0: 0.9268
Class 1: 0.7441
Class 2: 0.8463


--- Macro Sensitivity ---
Macro sensitivity from sklearn: 0.839074430770438
Macro sensitivity (average of per-class recalls): 0.839074430770438


--- Weighted Sensitivity ---
Weighted sensitivity: 0.8619902120717782
Weighted sensitivity: 0.8619902120717782

Class supports (number of true instances per class):
Class 0: 1734
Class 1: 895
Class 2: 436


In [4]:
from sklearn.metrics import precision_score

# --- Per-class precision ---
per_class_prec = precision_score(all_y_true, all_y_pred, average=None)
print("-"*3 + " Per-class Precision " + "-"*3)
for i, prec in enumerate(per_class_prec):
    print(f"Class {i}: {prec:.4f}")
print("\n")

# --- Macro precision (unweighted mean of per-class precision) ---
macro_prec_sk = precision_score(all_y_true, all_y_pred, average='macro')
macro_prec = per_class_prec.mean()
print("-"*3 + " Macro Precision " + "-"*3)
print("Macro precision from sklearn:", macro_prec_sk)
print("Macro precision (average of per-class precision):", macro_prec)
print("\n")

# --- Weighted precision (weighted mean by class support) ---
classes, counts = np.unique(all_y_true, return_counts=True)
weighted_prec = np.sum(per_class_prec * counts) / counts.sum()
weighted_prec_sk = precision_score(all_y_true, all_y_pred, average='weighted')
print("-"*3 + " Weighted Precision " + "-"*3)
print("Weighted precision (manual):", weighted_prec)
print("Weighted precision from sklearn:", weighted_prec_sk)

# --- Optional: per-class support counts ---
print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")


--- Per-class Precision ---
Class 0: 0.8584
Class 1: 0.8582
Class 2: 0.8849


--- Macro Precision ---
Macro precision from sklearn: 0.8671932266505061
Macro precision (average of per-class precision): 0.8671932266505061


--- Weighted Precision ---
Weighted precision (manual): 0.8621467045186014
Weighted precision from sklearn: 0.8621467045186014

Class supports (number of true instances per class):
Class 0: 1734
Class 1: 895
Class 2: 436


In [5]:
import numpy as np
from sklearn.metrics import confusion_matrix

# --- Compute per-class specificity ---
classes = np.unique(all_y_true)
per_class_spec = []

cm = confusion_matrix(all_y_true, all_y_pred, labels=classes)

for i, cls in enumerate(classes):
    TP = cm[i, i]
    FP = cm[:, i].sum() - TP
    FN = cm[i, :].sum() - TP
    TN = cm.sum() - (TP + FP + FN)
    spec = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    per_class_spec.append(spec)

per_class_spec = np.array(per_class_spec)

print("-"*3 + " Per-class Specificity " + "-"*3)
for cls, spec in zip(classes, per_class_spec):
    print(f"Class {cls}: {spec:.4f}")
print("\n")

# --- Macro specificity (unweighted mean of per-class specificity) ---
macro_spec = per_class_spec.mean()
print("-"*3 + " Macro Specificity " + "-"*3)
print("Macro specificity (average of per-class specificity):", macro_spec)
print("\n")

# --- Weighted specificity (weighted by class support) ---
classes, counts = np.unique(all_y_true, return_counts=True)
weighted_spec = np.sum(per_class_spec * counts) / counts.sum()
print("-"*3 + " Weighted Specificity " + "-"*3)
print("Weighted specificity:", weighted_spec)

# --- Optional: per-class support counts ---
print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")


--- Per-class Specificity ---
Class 0: 0.8009
Class 1: 0.9493
Class 2: 0.9817


--- Macro Specificity ---
Macro specificity (average of per-class specificity): 0.9106508135955235


--- Weighted Specificity ---
Weighted specificity: 0.8699622287147921

Class supports (number of true instances per class):
Class 0: 1734
Class 1: 895
Class 2: 436


In [6]:
import numpy as np
from sklearn.metrics import recall_score, balanced_accuracy_score

# --- Per-class recalls (sensitivities) ---
per_class_recalls = recall_score(all_y_true, all_y_pred, average=None)
print("-"*3 + " Per-class Recalls (Sensitivity) " + "-"*3)
for i, r in enumerate(per_class_recalls):
    print(f"Class {i}: {r:.4f}")
print("\n")

# --- Macro balanced accuracy (average of per-class recalls) ---
macro_bal_acc_theoretical = per_class_recalls.mean()
macro_bal_acc_sklearn = balanced_accuracy_score(all_y_true, all_y_pred, adjusted=False)
print("-"*3 + " Macro Balanced Accuracy " + "-"*3)
print("Theoretical macro balanced accuracy:", macro_bal_acc_theoretical)
print("Sklearn balanced_accuracy_score (macro):", macro_bal_acc_sklearn)
print("\n")

# --- Optional: per-class support counts ---
print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")


--- Per-class Recalls (Sensitivity) ---
Class 0: 0.9268
Class 1: 0.7441
Class 2: 0.8463


--- Macro Balanced Accuracy ---
Theoretical macro balanced accuracy: 0.839074430770438
Sklearn balanced_accuracy_score (macro): 0.839074430770438



Class supports (number of true instances per class):
Class 0: 1734
Class 1: 895
Class 2: 436


In [7]:
from sklearn.metrics import f1_score

# --- Per-class F1-score ---
per_class_f1 = f1_score(all_y_true, all_y_pred, average=None)
print("-"*3 + " Per-class F1-score " + "-"*3)
for i, f1 in enumerate(per_class_f1):
    print(f"Class {i}: {f1:.4f}")
print("\n")

# --- Macro F1-score (unweighted mean of per-class F1) ---
macro_f1_sk = f1_score(all_y_true, all_y_pred, average='macro')
macro_f1 = per_class_f1.mean()
print("-"*3 + " Macro F1-score " + "-"*3)
print("Macro F1-score from sklearn:", macro_f1_sk)
print("Macro F1-score (average of per-class F1):", macro_f1)
print("\n")

# --- Weighted F1-score (weighted mean by class support) ---
classes, counts = np.unique(all_y_true, return_counts=True)
weighted_f1 = np.sum(per_class_f1 * counts) / counts.sum()
weighted_f1_sk = f1_score(all_y_true, all_y_pred, average='weighted')
print("-"*3 + " Weighted F1-score " + "-"*3)
print("Weighted F1-score (manual):", weighted_f1)
print("Weighted F1-score from sklearn:", weighted_f1_sk)

# --- Optional: per-class support counts ---
print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")


--- Per-class F1-score ---
Class 0: 0.8913
Class 1: 0.7971
Class 2: 0.8652


--- Macro F1-score ---
Macro F1-score from sklearn: 0.8512004902715056
Macro F1-score (average of per-class F1): 0.8512004902715056


--- Weighted F1-score ---
Weighted F1-score (manual): 0.860081286325489
Weighted F1-score from sklearn: 0.860081286325489

Class supports (number of true instances per class):
Class 0: 1734
Class 1: 895
Class 2: 436


This proves that there are some discrepancies with how the geometric mean is computed by `imblearn`, compared to what the theoretical formula would suggest. We have verified that this is only the case in the multiclass case.

In [8]:
# --- Load CV results ---
config = 'config1'
model_name = 'lda'
lda_results = joblib.load(Path.cwd()/f'ResultsCV_{config}'/f'{model_name}_results_{config}.pkl')
all_y_true, all_y_pred = lda_results['all_y_true'], lda_results['all_y_pred']
print('CV results loaded.')

CV results loaded.


In [9]:
import numpy as np
import joblib
from pathlib import Path
from imblearn.metrics import geometric_mean_score

classes, counts = np.unique(all_y_true, return_counts=True)
# --- Optional: per-class support counts ---
print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")

print("\n")

# --- Per-class G-mean (no averaging) ---
gmean_none = geometric_mean_score(all_y_true, all_y_pred, average=None)
print("-"*3+" Per-class Sensitivities "+"-"*3)
print("G-mean with average = 'none' (per-class sensitivities):", gmean_none)
print("\n")

# --- Macro and weighted G-mean using imbalanced-learn ---
gmean_macro = geometric_mean_score(all_y_true, all_y_pred, average='macro')
gmean_weighted = geometric_mean_score(all_y_true, all_y_pred, average='weighted')

# --- Manual calculation of theoretical macro G-mean ---
theoretical_macro = gmean_none.prod()**(1/len(gmean_none))

# --- Manual calculation of theoretical weighted G-mean ---
weighted_exponent = counts / counts.sum()
theoretical_weighted = np.prod(gmean_none ** weighted_exponent)

print("-"*3+" G-mean Macro "+"-"*3)
print("Theoretical macro G-mean using per-class recalls:", theoretical_macro)
print("Imbalanced-learn's macro G-mean:", gmean_macro)
print("\n")

print("-"*3+" G-mean weighted "+"-"*3)
print("Theoretical weighted G-mean using per-class recalls and supports:", theoretical_weighted)
print("Imbalanced-learn's weighted G-mean:", gmean_weighted)


Class supports (number of true instances per class):
Class 0: 1734
Class 1: 1331


--- Per-class Sensitivities ---
G-mean with average = 'none' (per-class sensitivities): [0.86436415 0.86436415]


--- G-mean Macro ---
Theoretical macro G-mean using per-class recalls: 0.8643641454747898
Imbalanced-learn's macro G-mean: 0.8652696284241367


--- G-mean weighted ---
Theoretical weighted G-mean using per-class recalls and supports: 0.8643641454747899
Imbalanced-learn's weighted G-mean: 0.865253982326384


In [10]:
import numpy as np
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import confusion_matrix

# --- Unique classes and counts ---
classes, counts = np.unique(all_y_true, return_counts=True)

print("\nClass supports (number of true instances per class):")
for cls, count in zip(classes, counts):
    print(f"Class {cls}: {count}")
print("\n")

# --- Per-class G-mean using imblearn ---
gmean_none = geometric_mean_score(all_y_true, all_y_pred, average=None)
print("--- Per-class Sensitivities ---")
print("G-mean with average='none' (per-class sensitivities):", gmean_none)
print("\n")

# --- Macro and weighted G-mean using imblearn ---
gmean_macro = geometric_mean_score(all_y_true, all_y_pred, average='macro')
gmean_weighted = geometric_mean_score(all_y_true, all_y_pred, average='weighted')

# --- Manual theoretical calculation using per-class recalls from imblearn ---
# Macro G-mean (geometric mean of per-class recalls)
theoretical_macro = gmean_none.prod()**(1/len(gmean_none))

# Weighted G-mean (weighted geometric mean using support counts)
weighted_exponent = counts / counts.sum()
theoretical_weighted = np.prod(gmean_none ** weighted_exponent)

print("--- G-mean Macro ---")
print("Theoretical macro G-mean using per-class recalls:", theoretical_macro)
print("Imbalanced-learn's macro G-mean:", gmean_macro)
print("\n")

print("--- G-mean Weighted ---")
print("Theoretical weighted G-mean using per-class recalls and supports:", theoretical_weighted)
print("Imbalanced-learn's weighted G-mean:", gmean_weighted)
print("\n")

# --- Exact reproduction using confusion matrix (matches imblearn exactly) ---
cm = confusion_matrix(all_y_true, all_y_pred, labels=classes)
per_class_recall = np.array([cm[i,i]/cm[i].sum() if cm[i].sum() > 0 else np.nan 
                             for i in range(len(classes))])

# Macro G-mean
macro_gmean_exact = np.nanprod(per_class_recall)**(1/np.sum(~np.isnan(per_class_recall)))
# Weighted G-mean
weighted_gmean_exact = np.prod(per_class_recall ** (counts/counts.sum()))

print("--- Exact reproduction using confusion matrix ---")
print("Per-class recalls:", per_class_recall)
print("Macro G-mean (exact):", macro_gmean_exact)
print("Weighted G-mean (exact):", weighted_gmean_exact)



Class supports (number of true instances per class):
Class 0: 1734
Class 1: 1331


--- Per-class Sensitivities ---
G-mean with average='none' (per-class sensitivities): [0.86436415 0.86436415]


--- G-mean Macro ---
Theoretical macro G-mean using per-class recalls: 0.8643641454747898
Imbalanced-learn's macro G-mean: 0.8652696284241367


--- G-mean Weighted ---
Theoretical weighted G-mean using per-class recalls and supports: 0.8643641454747899
Imbalanced-learn's weighted G-mean: 0.865253982326384


--- Exact reproduction using confusion matrix ---
Per-class recalls: [0.90484429 0.82569497]
Macro G-mean (exact): 0.8643641454747898
Weighted G-mean (exact): 0.8695814668708433
