# Setup & Load Data:

In [None]:
# ============================================================
# HYPERTHYROID DETECTION - MODEL TRAINING & EVALUATION
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (classification_report, confusion_matrix, 
                            accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, matthews_corrcoef)
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier

import time
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including scikit-learn deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Plot styling shared by all figures in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("="*70, "HYPERTHYROID DETECTION - MODEL TRAINING & EVALUATION", "="*70, sep="\n")

# Load the three pre-split datasets (train / validation / test).
train_df, val_df, test_df = map(
    pd.read_csv,
    ('../data/train_set.csv', '../data/val_set.csv', '../data/test_set.csv'),
)

print(f"\n‚úÖ Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

# Target is the 'hyperlabel' column; everything else is a candidate feature.
y_train, y_val, y_test = train_df['hyperlabel'], val_df['hyperlabel'], test_df['hyperlabel']
X_train = train_df.drop(columns='hyperlabel')
X_val = val_df.drop(columns='hyperlabel')
X_test = test_df.drop(columns='hyperlabel')

# Drop object-dtype columns. The list is derived from the training split only
# and the same names are then removed from the validation and test splits.
object_cols = list(X_train.select_dtypes(include=['object']).columns)
if object_cols:
    print(f"\nüîß Removing {len(object_cols)} non-numeric columns: {object_cols}")
    X_train, X_val, X_test = (
        frame.drop(columns=object_cols) for frame in (X_train, X_val, X_test)
    )

print(f"\nüìä Features: {X_train.shape[1]}, Classes: {y_train.nunique()}")

# Standardise features. The scaler is fitted on the training split only and
# then applied unchanged to validation and test, so no statistics leak from
# the held-out splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Scaling complete!\n" + "="*70)

# Registries filled by the model cells below, all keyed by model display name:
#   models         -> fitted estimator
#   predictions    -> {'val', 'val_proba', 'test', 'test_proba'}
#   training_times -> wall-clock fit time in seconds
models, predictions, training_times = {}, {}, {}


HYPERTHYROID DETECTION - MODEL TRAINING & EVALUATION

‚úÖ Train: (6342, 27), Val: (1359, 27), Test: (1359, 27)

üìä Features: 26, Classes: 3
‚úÖ Scaling complete!


# Model 1: Random Forest

In [20]:
# ============================================================
# MODEL 1: RANDOM FOREST
# ============================================================
# Fits a class-weighted random forest on the *unscaled* features, prints a
# quick validation summary, and registers the model and its validation/test
# predictions under the 'Random Forest' key for the evaluation cells.
print(f"\n{'=' * 70}")
print("üå≤ MODEL 1: RANDOM FOREST")
print("=" * 70)

rf_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

start = time.time()
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
training_times['Random Forest'] = time.time() - start

# Hard labels and class probabilities on the validation split.
y_val_pred_rf, y_val_proba_rf = rf_model.predict(X_val), rf_model.predict_proba(X_val)

# Headline validation metrics (macro F1 weights the three classes equally).
val_acc = accuracy_score(y_val, y_val_pred_rf)
val_f1 = f1_score(y_val, y_val_pred_rf, average='macro')

print(
    f"\n‚úÖ Training Time: {training_times['Random Forest']:.2f}s",
    f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)",
    f"üìä Validation F1-Score: {val_f1:.4f}",
    sep="\n",
)

print("\nüìà Classification Report:")
rf_report = classification_report(
    y_val, y_val_pred_rf, target_names=['Normal', 'Subclinical', 'Overt']
)
print(rf_report)

# Register the fitted model together with validation and test predictions.
models['Random Forest'] = rf_model
predictions['Random Forest'] = dict(
    val=y_val_pred_rf,
    val_proba=y_val_proba_rf,
    test=rf_model.predict(X_test),
    test_proba=rf_model.predict_proba(X_test),
)

print(f"\n{'=' * 70}")



üå≤ MODEL 1: RANDOM FOREST

‚úÖ Training Time: 0.14s
üìä Validation Accuracy: 1.0000 (100.00%)
üìä Validation F1-Score: 1.0000

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00      1074
 Subclinical       1.00      1.00      1.00        75
       Overt       1.00      1.00      1.00       210

    accuracy                           1.00      1359
   macro avg       1.00      1.00      1.00      1359
weighted avg       1.00      1.00      1.00      1359




# Model 2: Gradient Boosting

In [21]:
# ============================================================
# MODEL 2: GRADIENT BOOSTING
# ============================================================
# Fits gradient-boosted trees on the *unscaled* features, prints a quick
# validation summary, and registers the model and its validation/test
# predictions under the 'Gradient Boosting' key.
print(f"\n{'=' * 70}")
print("üöÄ MODEL 2: GRADIENT BOOSTING")
print("=" * 70)

gb_params = dict(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,   # each stage sees a random 80% of the training rows
    random_state=42,
)

start = time.time()
gb_model = GradientBoostingClassifier(**gb_params)
gb_model.fit(X_train, y_train)
training_times['Gradient Boosting'] = time.time() - start

# Hard labels and class probabilities on the validation split.
y_val_pred_gb, y_val_proba_gb = gb_model.predict(X_val), gb_model.predict_proba(X_val)

# Headline validation metrics.
val_acc = accuracy_score(y_val, y_val_pred_gb)
val_f1 = f1_score(y_val, y_val_pred_gb, average='macro')

print(
    f"\n‚úÖ Training Time: {training_times['Gradient Boosting']:.2f}s",
    f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)",
    f"üìä Validation F1-Score: {val_f1:.4f}",
    sep="\n",
)

print("\nüìà Classification Report:")
gb_report = classification_report(
    y_val, y_val_pred_gb, target_names=['Normal', 'Subclinical', 'Overt']
)
print(gb_report)

# Register the fitted model together with validation and test predictions.
models['Gradient Boosting'] = gb_model
predictions['Gradient Boosting'] = dict(
    val=y_val_pred_gb,
    val_proba=y_val_proba_gb,
    test=gb_model.predict(X_test),
    test_proba=gb_model.predict_proba(X_test),
)

print(f"\n{'=' * 70}")



üöÄ MODEL 2: GRADIENT BOOSTING

‚úÖ Training Time: 1.02s
üìä Validation Accuracy: 1.0000 (100.00%)
üìä Validation F1-Score: 1.0000

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00      1074
 Subclinical       1.00      1.00      1.00        75
       Overt       1.00      1.00      1.00       210

    accuracy                           1.00      1359
   macro avg       1.00      1.00      1.00      1359
weighted avg       1.00      1.00      1.00      1359




# Model 3: Logistic Regression


In [22]:
# ============================================================
# MODEL 3: LOGISTIC REGRESSION
# ============================================================
# Fits a class-weighted multinomial logistic regression on the standardised
# features, prints a quick validation summary, and registers the model and
# its validation/test predictions under the 'Logistic Regression' key.
print("\n" + "="*70)
print("üìà MODEL 3: LOGISTIC REGRESSION")
print("="*70)

start = time.time()

# `multi_class='multinomial'` is intentionally not passed: the parameter is
# deprecated in recent scikit-learn releases and lbfgs already fits a
# multinomial model when the target has more than two classes.
# `n_jobs` is not passed either: it only parallelises one-vs-rest fits, so it
# had no effect on this multinomial fit.
lr_model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=42
)

lr_model.fit(X_train_scaled, y_train)
training_times['Logistic Regression'] = time.time() - start

# Predictions on the (scaled) validation split
y_val_pred_lr = lr_model.predict(X_val_scaled)
y_val_proba_lr = lr_model.predict_proba(X_val_scaled)

# Quick evaluation (macro F1 weights the three classes equally)
val_acc = accuracy_score(y_val, y_val_pred_lr)
val_f1 = f1_score(y_val, y_val_pred_lr, average='macro')

print(f"\n‚úÖ Training Time: {training_times['Logistic Regression']:.2f}s")
print(f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
print(f"üìä Validation F1-Score: {val_f1:.4f}")

print("\nüìà Classification Report:")
print(classification_report(y_val, y_val_pred_lr, 
                          target_names=['Normal', 'Subclinical', 'Overt']))

# Store the fitted model with validation and test predictions
models['Logistic Regression'] = lr_model
predictions['Logistic Regression'] = {
    'val': y_val_pred_lr,
    'val_proba': y_val_proba_lr,
    'test': lr_model.predict(X_test_scaled),
    'test_proba': lr_model.predict_proba(X_test_scaled)
}

print("\n" + "="*70)



üìà MODEL 3: LOGISTIC REGRESSION

‚úÖ Training Time: 2.21s
üìä Validation Accuracy: 0.8263 (82.63%)
üìä Validation F1-Score: 0.7177

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      0.80      0.89      1074
 Subclinical       0.32      0.96      0.48        75
       Overt       0.70      0.89      0.78       210

    accuracy                           0.83      1359
   macro avg       0.67      0.88      0.72      1359
weighted avg       0.91      0.83      0.85      1359




# Model 4: SVM

In [23]:
# ============================================================
# MODEL 4: SVM (SUPPORT VECTOR MACHINE) - NEW!
# ============================================================
# Fits a class-weighted RBF-kernel SVM on the standardised features, prints a
# quick validation summary, and registers the model and its validation/test
# predictions under the 'SVM' key.
print(f"\n{'=' * 70}")
print("‚ö° MODEL 4: SVM (Support Vector Machine)")
print("=" * 70)

svm_params = dict(
    kernel='rbf',             # radial basis function kernel
    C=10,                     # regularisation strength (larger = less regularised)
    gamma='scale',            # kernel coefficient chosen from the data
    class_weight='balanced',  # compensate for class imbalance
    probability=True,         # required for predict_proba below
    random_state=42,
)

start = time.time()
svm_model = SVC(**svm_params)
svm_model.fit(X_train_scaled, y_train)
training_times['SVM'] = time.time() - start

# Hard labels and class probabilities on the scaled validation split.
y_val_pred_svm, y_val_proba_svm = (
    svm_model.predict(X_val_scaled),
    svm_model.predict_proba(X_val_scaled),
)

# Headline validation metrics.
val_acc = accuracy_score(y_val, y_val_pred_svm)
val_f1 = f1_score(y_val, y_val_pred_svm, average='macro')

print(
    f"\n‚úÖ Training Time: {training_times['SVM']:.2f}s",
    f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)",
    f"üìä Validation F1-Score: {val_f1:.4f}",
    sep="\n",
)

print("\nüìà Classification Report:")
svm_report = classification_report(
    y_val, y_val_pred_svm, target_names=['Normal', 'Subclinical', 'Overt']
)
print(svm_report)

# Register the fitted model together with validation and test predictions.
models['SVM'] = svm_model
predictions['SVM'] = dict(
    val=y_val_pred_svm,
    val_proba=y_val_proba_svm,
    test=svm_model.predict(X_test_scaled),
    test_proba=svm_model.predict_proba(X_test_scaled),
)

print(f"\n{'=' * 70}")




‚ö° MODEL 4: SVM (Support Vector Machine)

‚úÖ Training Time: 6.27s
üìä Validation Accuracy: 0.7682 (76.82%)
üìä Validation F1-Score: 0.6520

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       0.96      0.75      0.84      1074
 Subclinical       0.29      0.84      0.43        75
       Overt       0.59      0.83      0.69       210

    accuracy                           0.77      1359
   macro avg       0.61      0.81      0.65      1359
weighted avg       0.87      0.77      0.80      1359




# Model 5: MLP (Neural Network)

In [24]:
# ============================================================
# MODEL 5: MLP (MULTI-LAYER PERCEPTRON)
# ============================================================
# Fits a three-hidden-layer neural network on the standardised features,
# prints a quick validation summary, and registers the model and its
# validation/test predictions under the 'MLP' key.
print(f"\n{'=' * 70}")
print("üß† MODEL 5: MLP (Neural Network)")
print("=" * 70)

mlp_params = dict(
    hidden_layer_sizes=(128, 64, 32),  # 3 hidden layers
    activation='relu',
    solver='adam',
    alpha=0.001,
    batch_size=32,
    # NOTE(review): scikit-learn documents `learning_rate` as used only with
    # solver='sgd', so 'adaptive' is presumably inert here - confirm.
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=200,
    # Early stopping holds out `validation_fraction` of the *training* data
    # internally; it does not touch X_val.
    early_stopping=True,
    validation_fraction=0.1,
    random_state=42,
)

start = time.time()
mlp_model = MLPClassifier(**mlp_params)
mlp_model.fit(X_train_scaled, y_train)
training_times['MLP'] = time.time() - start

# Hard labels and class probabilities on the scaled validation split.
y_val_pred_mlp, y_val_proba_mlp = (
    mlp_model.predict(X_val_scaled),
    mlp_model.predict_proba(X_val_scaled),
)

# Headline validation metrics.
val_acc = accuracy_score(y_val, y_val_pred_mlp)
val_f1 = f1_score(y_val, y_val_pred_mlp, average='macro')

print(
    f"\n‚úÖ Training Time: {training_times['MLP']:.2f}s",
    f"   Iterations: {mlp_model.n_iter_}",
    f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)",
    f"üìä Validation F1-Score: {val_f1:.4f}",
    sep="\n",
)

print("\nüìà Classification Report:")
mlp_report = classification_report(
    y_val, y_val_pred_mlp, target_names=['Normal', 'Subclinical', 'Overt']
)
print(mlp_report)

# Register the fitted model together with validation and test predictions.
models['MLP'] = mlp_model
predictions['MLP'] = dict(
    val=y_val_pred_mlp,
    val_proba=y_val_proba_mlp,
    test=mlp_model.predict(X_test_scaled),
    test_proba=mlp_model.predict_proba(X_test_scaled),
)

print(f"\n{'=' * 70}")



üß† MODEL 5: MLP (Neural Network)

‚úÖ Training Time: 2.52s
   Iterations: 33
üìä Validation Accuracy: 0.9455 (94.55%)
üìä Validation F1-Score: 0.8727

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       0.96      0.97      0.97      1074
 Subclinical       0.76      0.75      0.75        75
       Overt       0.93      0.87      0.90       210

    accuracy                           0.95      1359
   macro avg       0.88      0.86      0.87      1359
weighted avg       0.95      0.95      0.95      1359




# Model 6: KNN (K-Nearest Neighbors)

In [25]:
# ============================================================
# MODEL 6: KNN (K-NEAREST NEIGHBORS) - NEW!
# ============================================================
# Fits a distance-weighted k-nearest-neighbours classifier on the
# standardised features, prints a quick validation summary, and registers the
# model and its validation/test predictions under the 'KNN' key.
# KNeighborsClassifier comes from the notebook's top import cell; the
# duplicate in-cell import was removed.
print("\n" + "="*70)
print("üë• MODEL 6: KNN (K-Nearest Neighbors)")
print("="*70)

start = time.time()

knn_model = KNeighborsClassifier(
    n_neighbors=11,            # Number of neighbors
    weights='distance',        # Weight by inverse distance
    metric='minkowski',        # Distance metric
    p=2,                       # p=2 means Euclidean distance
    n_jobs=-1
)

# NOTE(review): only fit() is timed. KNN presumably does most of its work at
# prediction time, so this figure is not comparable with the other models'.
knn_model.fit(X_train_scaled, y_train)
training_times['KNN'] = time.time() - start

# Predictions on the (scaled) validation split
y_val_pred_knn = knn_model.predict(X_val_scaled)
y_val_proba_knn = knn_model.predict_proba(X_val_scaled)

# Quick evaluation (macro F1 weights the three classes equally)
val_acc = accuracy_score(y_val, y_val_pred_knn)
val_f1 = f1_score(y_val, y_val_pred_knn, average='macro')

print(f"\n‚úÖ Training Time: {training_times['KNN']:.2f}s")
print(f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
print(f"üìä Validation F1-Score: {val_f1:.4f}")

print("\nüìà Classification Report:")
print(classification_report(y_val, y_val_pred_knn, 
                          target_names=['Normal', 'Subclinical', 'Overt']))

# Store the fitted model with validation and test predictions
models['KNN'] = knn_model
predictions['KNN'] = {
    'val': y_val_pred_knn,
    'val_proba': y_val_proba_knn,
    'test': knn_model.predict(X_test_scaled),
    'test_proba': knn_model.predict_proba(X_test_scaled)
}

print("\n" + "="*70)



üë• MODEL 6: KNN (K-Nearest Neighbors)

‚úÖ Training Time: 0.00s
üìä Validation Accuracy: 0.8263 (82.63%)
üìä Validation F1-Score: 0.4847

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       0.84      0.97      0.90      1074
 Subclinical       0.33      0.03      0.05        75
       Overt       0.68      0.40      0.50       210

    accuracy                           0.83      1359
   macro avg       0.62      0.46      0.48      1359
weighted avg       0.79      0.83      0.79      1359




# Model 7: Decision Tree

In [27]:
# ============================================================
# MODEL 7: DECISION TREE - NEW!
# ============================================================
# Fits a class-weighted decision tree on the *unscaled* features, prints a
# quick validation summary plus the fitted tree's size, and registers the
# model and its validation/test predictions under the 'Decision Tree' key.
# DecisionTreeClassifier comes from the notebook's top import cell; the
# duplicate in-cell import was removed.
#
# NOTE(review): the recorded run reaches 100% validation accuracy with a
# depth-4, 6-leaf tree. That usually means one or two features encode the
# label almost directly - check the feature set for target leakage before
# trusting these scores.
print("\n" + "="*70)
print("üå≥ MODEL 7: DECISION TREE")
print("="*70)

start = time.time()

dt_model = DecisionTreeClassifier(
    max_depth=10,              # Maximum depth of tree
    min_samples_split=20,      # Min samples to split node
    min_samples_leaf=10,       # Min samples in leaf
    class_weight='balanced',   # Handle imbalanced classes
    random_state=42
)

dt_model.fit(X_train, y_train)
training_times['Decision Tree'] = time.time() - start

# Predictions on the validation split
y_val_pred_dt = dt_model.predict(X_val)
y_val_proba_dt = dt_model.predict_proba(X_val)

# Quick evaluation (macro F1 weights the three classes equally)
val_acc = accuracy_score(y_val, y_val_pred_dt)
val_f1 = f1_score(y_val, y_val_pred_dt, average='macro')

print(f"\n‚úÖ Training Time: {training_times['Decision Tree']:.2f}s")
print(f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
print(f"üìä Validation F1-Score: {val_f1:.4f}")
print(f"üå≥ Tree Depth: {dt_model.get_depth()}")
print(f"üçÉ Number of Leaves: {dt_model.get_n_leaves()}")

print("\nüìà Classification Report:")
print(classification_report(y_val, y_val_pred_dt, 
                          target_names=['Normal', 'Subclinical', 'Overt']))

# Store the fitted model with validation and test predictions
models['Decision Tree'] = dt_model
predictions['Decision Tree'] = {
    'val': y_val_pred_dt,
    'val_proba': y_val_proba_dt,
    'test': dt_model.predict(X_test),
    'test_proba': dt_model.predict_proba(X_test)
}

print("\n" + "="*70)



üå≥ MODEL 7: DECISION TREE

‚úÖ Training Time: 0.00s
üìä Validation Accuracy: 1.0000 (100.00%)
üìä Validation F1-Score: 1.0000
üå≥ Tree Depth: 4
üçÉ Number of Leaves: 6

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00      1074
 Subclinical       1.00      1.00      1.00        75
       Overt       1.00      1.00      1.00       210

    accuracy                           1.00      1359
   macro avg       1.00      1.00      1.00      1359
weighted avg       1.00      1.00      1.00      1359




# Model 8: Naive Bayes

In [28]:
# ============================================================
# MODEL 8: NAIVE BAYES (GAUSSIAN) - NEW!
# ============================================================
# Fits a Gaussian naive Bayes classifier (default settings) on the
# standardised features, prints a quick validation summary, and registers the
# model and its validation/test predictions under the 'Naive Bayes' key.
# GaussianNB comes from the notebook's top import cell; the duplicate in-cell
# import was removed.
print("\n" + "="*70)
print("üìä MODEL 8: NAIVE BAYES (Gaussian)")
print("="*70)

start = time.time()

nb_model = GaussianNB()

nb_model.fit(X_train_scaled, y_train)
training_times['Naive Bayes'] = time.time() - start

# Predictions on the (scaled) validation split
y_val_pred_nb = nb_model.predict(X_val_scaled)
y_val_proba_nb = nb_model.predict_proba(X_val_scaled)

# Quick evaluation (macro F1 weights the three classes equally)
val_acc = accuracy_score(y_val, y_val_pred_nb)
val_f1 = f1_score(y_val, y_val_pred_nb, average='macro')

print(f"\n‚úÖ Training Time: {training_times['Naive Bayes']:.2f}s")
print(f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
print(f"üìä Validation F1-Score: {val_f1:.4f}")

print("\nüìà Classification Report:")
print(classification_report(y_val, y_val_pred_nb, 
                          target_names=['Normal', 'Subclinical', 'Overt']))

# Store the fitted model with validation and test predictions
models['Naive Bayes'] = nb_model
predictions['Naive Bayes'] = {
    'val': y_val_pred_nb,
    'val_proba': y_val_proba_nb,
    'test': nb_model.predict(X_test_scaled),
    'test_proba': nb_model.predict_proba(X_test_scaled)
}

print("\n" + "="*70)



üìä MODEL 8: NAIVE BAYES (Gaussian)

‚úÖ Training Time: 0.00s
üìä Validation Accuracy: 0.8536 (85.36%)
üìä Validation F1-Score: 0.6568

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      0.93      0.96      1074
 Subclinical       0.27      0.89      0.41        75
       Overt       0.86      0.45      0.59       210

    accuracy                           0.85      1359
   macro avg       0.71      0.76      0.66      1359
weighted avg       0.94      0.85      0.87      1359




# Model 9: Extra Trees

In [29]:
# ============================================================
# MODEL 9: EXTRA TREES (Extremely Randomized Trees) - NEW!
# ============================================================
# Fits a class-weighted extra-trees ensemble on the *unscaled* features,
# prints a quick validation summary, and registers the model and its
# validation/test predictions under the 'Extra Trees' key.
# ExtraTreesClassifier comes from the notebook's top import cell; the
# duplicate in-cell import was removed.
print("\n" + "="*70)
print("üå≤üå≤ MODEL 9: EXTRA TREES")
print("="*70)

start = time.time()

# Same hyper-parameters as the Random Forest cell, for a like-for-like
# comparison.
et_model = ExtraTreesClassifier(
    n_estimators=100,          # Number of trees
    max_depth=15,              # Maximum depth
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

et_model.fit(X_train, y_train)
training_times['Extra Trees'] = time.time() - start

# Predictions on the validation split
y_val_pred_et = et_model.predict(X_val)
y_val_proba_et = et_model.predict_proba(X_val)

# Quick evaluation (macro F1 weights the three classes equally)
val_acc = accuracy_score(y_val, y_val_pred_et)
val_f1 = f1_score(y_val, y_val_pred_et, average='macro')

print(f"\n‚úÖ Training Time: {training_times['Extra Trees']:.2f}s")
print(f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
print(f"üìä Validation F1-Score: {val_f1:.4f}")

print("\nüìà Classification Report:")
print(classification_report(y_val, y_val_pred_et, 
                          target_names=['Normal', 'Subclinical', 'Overt']))

# Store the fitted model with validation and test predictions
models['Extra Trees'] = et_model
predictions['Extra Trees'] = {
    'val': y_val_pred_et,
    'val_proba': y_val_proba_et,
    'test': et_model.predict(X_test),
    'test_proba': et_model.predict_proba(X_test)
}

print("\n" + "="*70)



üå≤üå≤ MODEL 9: EXTRA TREES

‚úÖ Training Time: 0.11s
üìä Validation Accuracy: 0.6652 (66.52%)
üìä Validation F1-Score: 0.5619

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       0.97      0.63      0.76      1074
 Subclinical       0.17      0.79      0.28        75
       Overt       0.54      0.80      0.64       210

    accuracy                           0.67      1359
   macro avg       0.56      0.74      0.56      1359
weighted avg       0.86      0.67      0.72      1359




# Model 10: AdaBoost

In [30]:
# ============================================================
# MODEL 10: ADABOOST (Adaptive Boosting) - NEW!
# ============================================================
# Fits an AdaBoost ensemble (library-default base estimator) on the
# *unscaled* features, prints a quick validation summary, and registers the
# model and its validation/test predictions under the 'AdaBoost' key.
# AdaBoostClassifier comes from the notebook's top import cell; the duplicate
# in-cell import was removed.
print("\n" + "="*70)
print("üéØ MODEL 10: ADABOOST")
print("="*70)

start = time.time()

# Unlike the other tree models in this notebook, no class weighting is
# applied here.
ada_model = AdaBoostClassifier(
    n_estimators=100,          # Number of boosting stages
    learning_rate=0.5,         # Learning rate
    random_state=42
)

ada_model.fit(X_train, y_train)
training_times['AdaBoost'] = time.time() - start

# Predictions on the validation split
y_val_pred_ada = ada_model.predict(X_val)
y_val_proba_ada = ada_model.predict_proba(X_val)

# Quick evaluation (macro F1 weights the three classes equally)
val_acc = accuracy_score(y_val, y_val_pred_ada)
val_f1 = f1_score(y_val, y_val_pred_ada, average='macro')

print(f"\n‚úÖ Training Time: {training_times['AdaBoost']:.2f}s")
print(f"üìä Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
print(f"üìä Validation F1-Score: {val_f1:.4f}")

print("\nüìà Classification Report:")
print(classification_report(y_val, y_val_pred_ada, 
                          target_names=['Normal', 'Subclinical', 'Overt']))

# Store the fitted model with validation and test predictions
models['AdaBoost'] = ada_model
predictions['AdaBoost'] = {
    'val': y_val_pred_ada,
    'val_proba': y_val_proba_ada,
    'test': ada_model.predict(X_test),
    'test_proba': ada_model.predict_proba(X_test)
}

print("\n" + "="*70)



üéØ MODEL 10: ADABOOST

‚úÖ Training Time: 0.32s
üìä Validation Accuracy: 1.0000 (100.00%)
üìä Validation F1-Score: 1.0000

üìà Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00      1074
 Subclinical       1.00      1.00      1.00        75
       Overt       1.00      1.00      1.00       210

    accuracy                           1.00      1359
   macro avg       1.00      1.00      1.00      1359
weighted avg       1.00      1.00      1.00      1359




# COMPREHENSIVE EVALUATION

In [37]:
# ============================================================
# 6. COMPREHENSIVE MODEL EVALUATION
# ============================================================
# For every registered model, compute accuracy, macro precision / recall / F1,
# macro one-vs-rest AUROC and MCC on both the validation and the test split,
# and collect one row per model in `results_df`.
# ============================================================

print("\n" + "="*80)
print(" " * 20 + "COMPREHENSIVE MODEL EVALUATION")
print("="*80)

# ============================================================
# PART 1: Calculate All Metrics for Each Model
# ============================================================
print("\nüìä Calculating comprehensive metrics for all models...")

evaluation_results = []

for model_name in models.keys():
    print(f"‚öôÔ∏è Evaluating {model_name}...")
    
    # Cached predictions produced by the model cells
    y_val_pred = predictions[model_name]['val']
    y_val_proba = predictions[model_name]['val_proba']
    y_test_pred = predictions[model_name]['test']
    y_test_proba = predictions[model_name]['test_proba']
    
    # predict_proba columns follow the estimator's `classes_` order, so the
    # ground truth is binarised with that same ordering instead of a
    # hard-coded label list.
    class_labels = models[model_name].classes_
    
    # ========================================
    # VALIDATION SET METRICS
    # ========================================
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred, average='macro', zero_division=0)
    val_recall = recall_score(y_val, y_val_pred, average='macro', zero_division=0)
    val_f1 = f1_score(y_val, y_val_pred, average='macro', zero_division=0)
    val_mcc = matthews_corrcoef(y_val, y_val_pred)
    
    # AUROC (One-vs-Rest for multiclass). Kept best-effort: a failure is
    # reported and recorded as NaN instead of aborting the whole evaluation,
    # but KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        y_val_bin = label_binarize(y_val, classes=class_labels)
        val_auroc = roc_auc_score(y_val_bin, y_val_proba, average='macro', multi_class='ovr')
    except Exception as exc:
        print(f"   AUROC (validation) unavailable for {model_name}: {exc}")
        val_auroc = np.nan
    
    # ========================================
    # TEST SET METRICS
    # ========================================
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='macro', zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, average='macro', zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, average='macro', zero_division=0)
    test_mcc = matthews_corrcoef(y_test, y_test_pred)
    
    # AUROC for test set (same best-effort handling as above)
    try:
        y_test_bin = label_binarize(y_test, classes=class_labels)
        test_auroc = roc_auc_score(y_test_bin, y_test_proba, average='macro', multi_class='ovr')
    except Exception as exc:
        print(f"   AUROC (test) unavailable for {model_name}: {exc}")
        test_auroc = np.nan
    
    # One row per model; key order defines the column order of results_df
    evaluation_results.append({
        'Model': model_name,
        'Val_Accuracy': val_accuracy,
        'Val_Precision': val_precision,
        'Val_Recall': val_recall,
        'Val_F1': val_f1,
        'Val_AUROC': val_auroc,
        'Val_MCC': val_mcc,
        'Test_Accuracy': test_accuracy,
        'Test_Precision': test_precision,
        'Test_Recall': test_recall,
        'Test_F1': test_f1,
        'Test_AUROC': test_auroc,
        'Test_MCC': test_mcc,
        'Training_Time': training_times[model_name]
    })

# Create results DataFrame (built once from the list of row dicts)
results_df = pd.DataFrame(evaluation_results)





                    COMPREHENSIVE MODEL EVALUATION

üìä Calculating comprehensive metrics for all models...
‚öôÔ∏è Evaluating Random Forest...
‚öôÔ∏è Evaluating Gradient Boosting...
‚öôÔ∏è Evaluating Logistic Regression...
‚öôÔ∏è Evaluating SVM...
‚öôÔ∏è Evaluating MLP...
‚öôÔ∏è Evaluating KNN...
‚öôÔ∏è Evaluating Decision Tree...
‚öôÔ∏è Evaluating Naive Bayes...
‚öôÔ∏è Evaluating Extra Trees...
‚öôÔ∏è Evaluating AdaBoost...


In [38]:
# ============================================================
# PART 2: Display Results Tables
# ============================================================
# Prints two plain-text tables from `results_df`: validation metrics (with
# training time) and test metrics, both with 4-decimal floats.


def _fmt4(x):
    """Render a float with four decimal places for the text tables."""
    return f'{x:.4f}'


print("\n" + "="*80)
print("VALIDATION SET RESULTS")
print("="*80)

# Validation view: select the Val_* columns and give them display names.
val_metrics = results_df[
    ['Model', 'Val_Accuracy', 'Val_Precision', 'Val_Recall',
     'Val_F1', 'Val_AUROC', 'Val_MCC', 'Training_Time']
].rename(columns={
    'Val_Accuracy': 'Accuracy',
    'Val_Precision': 'Precision',
    'Val_Recall': 'Recall',
    'Val_F1': 'F1-Score',
    'Val_AUROC': 'AUROC',
    'Val_MCC': 'MCC',
    'Training_Time': 'Time (s)',
})

print("\n")
print(val_metrics.to_string(index=False, float_format=_fmt4))

print("\n" + "="*80)
print("TEST SET RESULTS (GENERALIZATION)")
print("="*80)

# Test view: same layout, without the timing column.
test_metrics = results_df[
    ['Model', 'Test_Accuracy', 'Test_Precision', 'Test_Recall',
     'Test_F1', 'Test_AUROC', 'Test_MCC']
].rename(columns={
    'Test_Accuracy': 'Accuracy',
    'Test_Precision': 'Precision',
    'Test_Recall': 'Recall',
    'Test_F1': 'F1-Score',
    'Test_AUROC': 'AUROC',
    'Test_MCC': 'MCC',
})

print("\n")
print(test_metrics.to_string(index=False, float_format=_fmt4))






VALIDATION SET RESULTS


              Model  Accuracy  Precision  Recall  F1-Score  AUROC    MCC  Time (s)
      Random Forest    1.0000     1.0000  1.0000    1.0000 1.0000 1.0000    0.1408
  Gradient Boosting    1.0000     1.0000  1.0000    1.0000 1.0000 1.0000    1.0234
Logistic Regression    0.8263     0.6718  0.8850    0.7177 0.9585 0.6603    2.2052
                SVM    0.7682     0.6103  0.8079    0.6520 0.9322 0.5451    6.2668
                MLP    0.9455     0.8835  0.8627    0.8727 0.9888 0.8408    2.5152
                KNN    0.8263     0.6198  0.4641    0.4847 0.7614 0.3948    0.0010
      Decision Tree    1.0000     1.0000  1.0000    1.0000 1.0000 1.0000    0.0047
        Naive Bayes    0.8536     0.7103  0.7583    0.6568 0.9712 0.6524    0.0025
        Extra Trees    0.6652     0.5574  0.7390    0.5619 0.8927 0.4485    0.1091
           AdaBoost    1.0000     1.0000  1.0000    1.0000 1.0000 1.0000    0.3197

TEST SET RESULTS (GENERALIZATION)


              Model  Acc

In [39]:
# ============================================================
# PART 3: Identify Best Models (CORRECTED)
# ============================================================
# Reports, per metric, the model with the highest *test-set* score, then
# names an overall winner by test macro F1. Ties go to the first model in
# `results_df` order (idxmax returns the first maximum).
#
# NOTE(review): choosing the winner on the test set makes the reported test
# scores optimistic; selecting on validation and only reporting test metrics
# would be the cleaner protocol - confirm intent.
print("\n" + "="*80)
print("BEST MODELS BY METRIC")
print("="*80)

# Display name -> results_df column holding the test-set score.
metric_columns = {
    'Accuracy': 'Test_Accuracy',
    'F1-Score': 'Test_F1',
    'AUROC': 'Test_AUROC',
    'MCC': 'Test_MCC',
    'Precision': 'Test_Precision',
    'Recall': 'Test_Recall'
}

# Display name -> name of the top-scoring model for that metric.
best_models = {
    metric: results_df.at[results_df[col_name].idxmax(), 'Model']
    for metric, col_name in metric_columns.items()
}

print("\nüèÜ Best Models (Test Set Performance):")
for metric, model in best_models.items():
    col_name = metric_columns[metric]
    # Model names are unique (they are dict keys), so the first match is the
    # row that won this metric.
    value = results_df.loc[results_df['Model'] == model, col_name].iloc[0]
    print(f"   {metric:12s}: {model:20s} ({value:.4f})")

# Overall best model (based on F1-Score)
best_idx = results_df['Test_F1'].idxmax()
best_model_name = results_df.at[best_idx, 'Model']
best_f1 = results_df['Test_F1'].max()
best_auroc = results_df.at[best_idx, 'Test_AUROC']
best_acc = results_df.at[best_idx, 'Test_Accuracy']
best_mcc = results_df.at[best_idx, 'Test_MCC']

print(
    f"\nüéØ OVERALL BEST MODEL: {best_model_name}",
    f"   Test Accuracy:  {best_acc:.4f} ({best_acc*100:.2f}%)",
    f"   Test F1-Score:  {best_f1:.4f}",
    f"   Test AUROC:     {best_auroc:.4f}",
    f"   Test MCC:       {best_mcc:.4f}",
    sep="\n",
)


BEST MODELS BY METRIC

üèÜ Best Models (Test Set Performance):
   Accuracy    : Gradient Boosting    (1.0000)
   F1-Score    : Gradient Boosting    (1.0000)
   AUROC       : Random Forest        (1.0000)
   MCC         : Gradient Boosting    (1.0000)
   Precision   : Gradient Boosting    (1.0000)
   Recall      : Gradient Boosting    (1.0000)

üéØ OVERALL BEST MODEL: Gradient Boosting
   Test Accuracy:  1.0000 (100.00%)
   Test F1-Score:  1.0000
   Test AUROC:     1.0000
   Test MCC:       1.0000


In [40]:
# ============================================================
# PART 4: Generalization Analysis
# ============================================================
# Adds validation-minus-test gaps for macro F1 and accuracy to `results_df`,
# prints them per model, names the model with the smallest absolute F1 gap,
# and writes the full results table (including the new gap columns) to CSV.
print("\n" + "="*80)
print("GENERALIZATION ANALYSIS (Val vs Test)")
print("="*80)

# Positive values mean the model scored lower on test than on validation.
results_df['F1_Diff'] = results_df['Val_F1'].sub(results_df['Test_F1'])
results_df['Acc_Diff'] = results_df['Val_Accuracy'].sub(results_df['Test_Accuracy'])

print("\nüìâ Performance Drop (Validation ‚Üí Test):")
gen_analysis = results_df[
    ['Model', 'Val_F1', 'Test_F1', 'F1_Diff',
     'Val_Accuracy', 'Test_Accuracy', 'Acc_Diff']
].rename(columns={
    'Val_F1': 'Val F1',
    'Test_F1': 'Test F1',
    'F1_Diff': 'F1 Drop',
    'Val_Accuracy': 'Val Acc',
    'Test_Accuracy': 'Test Acc',
    'Acc_Diff': 'Acc Drop',
})
print("\n")
print(gen_analysis.to_string(index=False, float_format=lambda x: f'{x:.4f}'))

# Smallest absolute F1 gap wins; on ties idxmin returns the first model in
# table order.
most_stable = results_df.at[results_df['F1_Diff'].abs().idxmin(), 'Model']
print(f"\nüéØ Most Stable Model (best generalization): {most_stable}")

# ============================================================
# PART 5: Save Results
# ============================================================
results_df.to_csv('../data/model_evaluation_results.csv', index=False)
print(f"\nüíæ Results saved: ../data/model_evaluation_results.csv")

print("\n" + "="*80)
print("‚úÖ EVALUATION COMPLETE!")
print("="*80)


GENERALIZATION ANALYSIS (Val vs Test)

üìâ Performance Drop (Validation ‚Üí Test):


              Model  Val F1  Test F1  F1 Drop  Val Acc  Test Acc  Acc Drop
      Random Forest  1.0000   0.9990   0.0010   1.0000    0.9993    0.0007
  Gradient Boosting  1.0000   1.0000   0.0000   1.0000    1.0000    0.0000
Logistic Regression  0.7177   0.7066   0.0111   0.8263    0.8249    0.0015
                SVM  0.6520   0.6323   0.0196   0.7682    0.7653    0.0029
                MLP  0.8727   0.8522   0.0205   0.9455    0.9338    0.0118
                KNN  0.4847   0.4559   0.0288   0.8263    0.8102    0.0162
      Decision Tree  1.0000   1.0000   0.0000   1.0000    1.0000    0.0000
        Naive Bayes  0.6568   0.6363   0.0205   0.8536    0.8344    0.0191
        Extra Trees  0.5619   0.5530   0.0089   0.6652    0.6630    0.0022
           AdaBoost  1.0000   1.0000   0.0000   1.0000    1.0000    0.0000

üéØ Most Stable Model (best generalization): Gradient Boosting

üíæ Results saved: ..