# Rigorous Evaluation for Research Paper

This notebook addresses reviewer feedback by providing:
1. **Real vs Synthetic evaluation** - Separate performance on base data vs augmented data
2. **5-Fold Cross-Validation** - Statistical validity of results
3. **Ablation Studies** - Contribution of each component
4. **Error Analysis** - Confusion matrices and failure patterns

**Update**: Evaluating using the custom **Hierarchical Symptom-to-Disease** architecture (Category -> Disease) instead of flat classification.

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json, joblib, warnings, sys, os, time, re
from typing import Dict, List, Any, Optional

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

project_root = Path(os.getcwd()).parent
sys.path.insert(0, str(project_root))

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (classification_report, accuracy_score, top_k_accuracy_score,
    confusion_matrix, f1_score, precision_score, recall_score)
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Import custom hierarchical models
from models.architectures.symptom_classifier import SymptomCategoryClassifier, SymptomDiseaseClassifier

print(f"Project root: {project_root}")

Project root: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis


---
# Part 1: Load All Data Versions

In [3]:
# Data paths
base_data_path = project_root / "data" / "raw" / "symptoms" / "Disease and symptoms dataset Base.csv"
augmented_no_demo_path = project_root / "data" / "processed" / "symptoms" / "symptoms_augmented_no_demographics.csv"
augmented_with_demo_path = project_root / "data" / "processed" / "symptoms" / "symptoms_augmented_with_demographics.csv"

# Load symptom vocabulary.
# The encoding is passed explicitly so the read does not depend on the platform
# default (which is a legacy code page on Windows rather than UTF-8).
with open(project_root / "data" / "symptom_vocabulary.json", encoding="utf-8") as f:
    symptom_cols = json.load(f)

print(f"Base data exists: {base_data_path.exists()}")
print(f"Augmented (no demo) exists: {augmented_no_demo_path.exists()}")
print(f"Augmented (with demo) exists: {augmented_with_demo_path.exists()}")

Base data exists: True
Augmented (no demo) exists: True
Augmented (with demo) exists: True


In [4]:
# Load the three dataset variants: the raw base data, and the augmented data
# without and with demographic columns. Row/disease counts are printed as a sanity check.
df_base = pd.read_csv(base_data_path)
print(f"Base dataset: {len(df_base):,} rows, {df_base['diseases'].nunique()} diseases")

df_augmented = pd.read_csv(augmented_no_demo_path)
print(f"Augmented dataset: {len(df_augmented):,} rows, {df_augmented['diseases'].nunique()} diseases")

df_demo = pd.read_csv(augmented_with_demo_path)
print(f"Augmented + Demographics: {len(df_demo):,} rows")

Base dataset: 246,945 rows, 773 diseases
Augmented dataset: 207,421 rows, 630 diseases
Augmented + Demographics: 207,421 rows


In [11]:
# FIX: Base dataset likely missing 'disease_category'. Map it using the official mapping file.
mapping_path = project_root / "data" / "disease_mapping.json"
if mapping_path.exists():
    # Explicit UTF-8 so disease names decode the same way on every platform.
    with open(mapping_path, 'r', encoding='utf-8') as f:
        disease_mapping = json.load(f)

    # Invert mapping: the file maps category -> list of diseases, we need disease -> category.
    # If a disease is listed under several categories, the last one listed wins.
    disease_to_category = {
        disease: category
        for category, diseases in disease_mapping.items()
        for disease in diseases
    }

    print(f"Loaded disease mapping for {len(disease_to_category)} diseases.")
else:
    print("Warning: disease_mapping.json not found. Falling back to augmented data for mapping.")
    # Fallback: reuse the disease -> category pairs present in the augmented dataset.
    disease_to_category = df_augmented.set_index('diseases')['disease_category'].to_dict()

if 'disease_category' not in df_base.columns:
    print("\nAdding missing 'disease_category' column to base dataset...")
    # Diseases without a mapping entry are grouped under the "Unknown Type" placeholder.
    df_base['disease_category'] = df_base['diseases'].map(disease_to_category).fillna("Unknown Type")

    # Check for unmapped
    unknown_count = (df_base['disease_category'] == "Unknown Type").sum()
    if unknown_count > 0:
        print(f"Note: {unknown_count} samples in base data could not be mapped to a category. Assigned to 'Unknown Type'.")
    print(f"Base data now has {len(df_base)} rows with categories.")

# FIX: LightGBM constraint on special characters in feature names
def sanitize_column_names(df):
    """Make column names safe for LightGBM by replacing special characters.

    Every run of characters outside ``[A-Za-z0-9_]`` is collapsed into a single
    underscore. Distinct original names can collapse to the same sanitized name
    (e.g. ``"a b"`` and ``"a-b"``); such collisions get a numeric suffix
    (``_1``, ``_2``, ...) so the resulting names stay unique.

    Args:
        df: DataFrame whose columns should be renamed. It is modified in place.

    Returns:
        The same DataFrame object, now with sanitized, unique column names.
    """
    new_columns = []
    seen = set()
    for col in df.columns:
        # str() lets non-string labels (e.g. integer column names) go through re.sub.
        base = re.sub(r'[^A-Za-z0-9_]+', '_', str(col))
        candidate = base
        suffix = 0
        # Bump the suffix until the name is unused, so no two columns share a name.
        while candidate in seen:
            suffix += 1
            candidate = f"{base}_{suffix}"
        seen.add(candidate)
        new_columns.append(candidate)
    df.columns = new_columns
    return df

# Apply the same column-name sanitization to all three dataset variants.
print("\nSanitizing feature names for LightGBM compatibility...")
df_base = sanitize_column_names(df_base)
df_augmented = sanitize_column_names(df_augmented)
df_demo = sanitize_column_names(df_demo)

Loaded disease mapping for 745 diseases.

Adding missing 'disease_category' column to base dataset...
Note: 7424 samples in base data could not be mapped to a category. Assigned to 'Unknown Type'.
Base data now has 246945 rows with categories.

Sanitizing feature names for LightGBM compatibility...


In [5]:
def filter_min_samples(df, min_samples=2):
    """Drop every row whose disease occurs fewer than ``min_samples`` times.

    A before/after summary (row counts and number of distinct diseases) is printed.

    Args:
        df: DataFrame with a 'diseases' column.
        min_samples: Minimum number of rows a disease needs in order to be kept.

    Returns:
        A filtered copy of ``df``; the input frame is not modified.
    """
    per_disease = df['diseases'].value_counts()
    frequent_enough = (per_disease >= min_samples).to_numpy()
    kept_diseases = per_disease.index[frequent_enough]
    kept_rows = df.loc[df['diseases'].isin(kept_diseases)].copy()

    n_before = df['diseases'].nunique()
    n_after = kept_rows['diseases'].nunique()
    print(f"Original: {len(df):,} rows, {n_before} diseases")
    print(f"Filtered: {len(kept_rows):,} rows, {n_after} diseases")
    print(f"Removed: {n_before - n_after} diseases with <{min_samples} samples")

    return kept_rows

# Filter for stable evaluation: diseases with fewer than 5 rows are dropped
# from each dataset variant before any splitting.
df_base_filtered = filter_min_samples(df_base, min_samples=5)
df_augmented_filtered = filter_min_samples(df_augmented, min_samples=5)
df_demo_filtered = filter_min_samples(df_demo, min_samples=5)

Original: 246,945 rows, 773 diseases
Filtered: 246,823 rows, 721 diseases
Removed: 52 diseases with <5 samples
Original: 207,421 rows, 630 diseases
Filtered: 207,387 rows, 615 diseases
Removed: 15 diseases with <5 samples
Original: 207,421 rows, 630 diseases
Filtered: 207,387 rows, 615 diseases
Removed: 15 diseases with <5 samples


---
# Part 2: Define Hierarchical Model Utilities

We need to replicate the 2-stage inference process:
1. **Category Classifier** identifies likely disease categories
2. **Specialist Classifiers** predict specific diseases within those categories

In [6]:
def prepare_features(df, feature_cols):
    """Split a dataframe into a feature frame and a disease-label array.

    Args:
        df: DataFrame holding the feature columns and a 'diseases' column.
        feature_cols: Candidate feature names; names absent from ``df`` are skipped.

    Returns:
        Tuple ``(X, y_disease, available_cols)``. ``X`` is ``df`` restricted to the
        available columns (kept as a DataFrame so column names are preserved),
        ``y_disease`` holds the values of the 'diseases' column, and
        ``available_cols`` lists the feature names actually used, in the order
        they appear in ``feature_cols``.
    """
    columns = df.columns
    available_cols = [name for name in feature_cols if name in columns]
    return df[available_cols], df['diseases'].values, available_cols

def train_hierarchical_model(X_train, y_train, df_train_full):
    """Fit the two-stage model: a category classifier plus per-category specialists.

    Args:
        X_train: Training feature DataFrame.
        y_train: Array of disease labels, indexed with the same boolean masks as
            ``X_train`` (so it must be row-aligned with it).
        df_train_full: Training DataFrame with a 'disease_category' column,
            row-aligned with ``X_train``.

    Returns:
        Tuple ``(cat_clf, specialist_models)``. ``cat_clf`` is the fitted
        SymptomCategoryClassifier with the fitted LabelEncoder attached as
        ``encoder_``. ``specialist_models`` maps each category name to a fitted
        SymptomDiseaseClassifier; categories with fewer than 5 training rows
        get no specialist.
    """
    # Stage 1: category classifier, trained on integer-encoded category labels.
    print("  Training Category Classifier...")
    category_labels = df_train_full['disease_category'].values

    cat_encoder = LabelEncoder()
    encoded_categories = cat_encoder.fit_transform(category_labels)

    cat_clf = SymptomCategoryClassifier(n_estimators=100)
    cat_clf.fit(X_train, encoded_categories)
    # Keep the encoder on the classifier so prediction code can decode category ids.
    cat_clf.encoder_ = cat_encoder

    # Stage 2: one specialist per category, trained only on that category's rows.
    category_names = np.unique(category_labels)
    print(f"  Training {len(category_names)} Specialist Models...")

    specialist_models = {}
    for category in category_names:
        in_category = category_labels == category
        X_cat, y_cat = X_train[in_category], y_train[in_category]

        if len(y_cat) >= 5:
            specialist = SymptomDiseaseClassifier(category=category, n_estimators=100)
            specialist.fit(X_cat, y_cat)
            specialist_models[category] = specialist

    return cat_clf, specialist_models

def predict_hierarchical(X_test, cat_clf, specialist_models, all_possible_diseases):
    """Combine category and specialist predictions into disease-level scores.

    For every entry of ``cat_clf.categories`` that has a specialist model, the
    specialist's per-disease probabilities are multiplied by the predicted
    probability of that category and added to the output column of the
    matching disease. Column ``i`` of ``cat_clf.predict_proba`` is used as the
    probability of ``cat_clf.categories[i]``.

    Args:
        X_test: Feature rows to score.
        cat_clf: Fitted category classifier exposing ``predict_proba``,
            ``categories`` and the attached ``encoder_``.
        specialist_models: Mapping of category name to a specialist exposing
            ``predict_proba`` and ``diseases``.
        all_possible_diseases: Sequence of disease names defining the output columns.

    Returns:
        Array of shape ``(len(X_test), len(all_possible_diseases))``. Columns of
        diseases that no specialist covers stay at zero; specialist diseases
        missing from ``all_possible_diseases`` are ignored.
    """
    category_probs = cat_clf.predict_proba(X_test)
    encoder = cat_clf.encoder_

    final_probs = np.zeros((len(X_test), len(all_possible_diseases)))
    column_of = {disease: col for col, disease in enumerate(all_possible_diseases)}

    for position, encoded_category in enumerate(cat_clf.categories):
        # Specialists are keyed by the category's string name, so decode the id first.
        category_name = encoder.inverse_transform([encoded_category])[0]
        if category_name not in specialist_models:
            continue

        specialist = specialist_models[category_name]
        disease_probs = specialist.predict_proba(X_test)
        category_weight = category_probs[:, position]

        # Accumulate P(category) * P(disease | category) into the global column.
        for local_col, disease in enumerate(specialist.diseases):
            target_col = column_of.get(disease)
            if target_col is not None:
                final_probs[:, target_col] += category_weight * disease_probs[:, local_col]

    return final_probs

def evaluate_hierarchical(X_train, X_test, y_train, y_test, df_train_full, all_disease_classes):
    """Train the hierarchical model and score it on a held-out set.

    Args:
        X_train, X_test: Training and test feature frames.
        y_train, y_test: Disease labels for the training and test rows.
        df_train_full: Training frame passed on to ``train_hierarchical_model``.
        all_disease_classes: Disease names defining the class index space.

    Returns:
        Tuple ``(results, cat_clf, specialist_models, y_proba)`` where ``results``
        maps 'Top-1', 'Top-3', 'Top-5' and 'Macro-F1' to their scores. Test rows
        whose label is not in ``all_disease_classes`` are excluded from scoring.
    """
    cat_clf, specialist_models = train_hierarchical_model(X_train, y_train, df_train_full)
    y_proba = predict_hierarchical(X_test, cat_clf, specialist_models, all_disease_classes)

    # Translate predictions and true labels into positions within all_disease_classes;
    # labels outside that list become -1 and are masked out below.
    y_pred_idx = np.argmax(y_proba, axis=1)
    class_index = {name: pos for pos, name in enumerate(all_disease_classes)}
    y_test_idx = np.array([class_index.get(label, -1) for label in y_test])

    known = y_test_idx != -1
    true_idx = y_test_idx[known]
    pred_idx = y_pred_idx[known]
    scores = y_proba[known]

    all_labels = np.arange(len(all_disease_classes))

    results = {'Top-1': accuracy_score(true_idx, pred_idx)}
    for k in (3, 5):
        results[f'Top-{k}'] = top_k_accuracy_score(true_idx, scores, k=k, labels=all_labels)
    results['Macro-F1'] = f1_score(true_idx, pred_idx, average='macro')

    return results, cat_clf, specialist_models, y_proba

In [7]:
# Define the feature columns of the base dataset: every column that is not a
# label, raw-text or demographic field. The augmented and demographics
# evaluations build their own feature lists from their own frames.
non_feature_cols = ['diseases', 'disease_category', 'symptoms', 'age', 'sex', 'age_normalized', 'sex_encoded']
feature_cols_base = [c for c in df_base.columns if c not in non_feature_cols]
print(f"Feature columns: {len(feature_cols_base)}")

Feature columns: 377


---
# Part 3: Evaluation on Base/Real Data Only

In [8]:
print("="*70)
print("EVALUATION ON BASE/REAL DATA ONLY (Hierarchical)")
print("="*70)

X_base, y_base, cols_base = prepare_features(df_base_filtered, feature_cols_base)
# The class index space is the sorted set of diseases present in the filtered base data.
all_diseases_base = np.unique(y_base)

# Split indices
# Positional indices are split (90% train / 10% test, stratified by disease) so the
# feature frame, the label array and the full frame can all be sliced consistently.
indices = np.arange(len(df_base_filtered))
train_idx, test_idx = train_test_split(
    indices, test_size=0.1, random_state=42, stratify=y_base
)

X_train_base = X_base.iloc[train_idx]
X_test_base = X_base.iloc[test_idx]
y_train_base = y_base[train_idx]
y_test_base = y_base[test_idx]
# Full training rows, needed for the 'disease_category' column during training.
df_train_base = df_base_filtered.iloc[train_idx]

results_base, cat_clf_base, _, _ = evaluate_hierarchical(
    X_train_base, X_test_base, y_train_base, y_test_base, df_train_base, all_diseases_base
)

print(f"\nResults on Real Data Only:")
for metric, value in results_base.items():
    print(f"  {metric}: {value*100:.2f}%")

EVALUATION ON BASE/REAL DATA ONLY (Hierarchical)
  Training Category Classifier...
DEBUG: Training SymptomCategoryClassifier with shape (222140, 377)
  Training 17 Specialist Models...
DEBUG: Training SymptomDiseaseClassifier for category 'Allergy and Immunology' with shape (2486, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (16687, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Dermatological' with shape (20589, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (6949, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (22510, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Genetic and Congenital Disorders' with shape (139, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (24768, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Hematology and Onc

---
# Part 4: Evaluation on Augmented Data

In [9]:
print("="*70)
print("EVALUATION ON AUGMENTED DATA (Hierarchical)")
print("="*70)

# Feature list is rebuilt from the augmented frame's own columns.
feature_cols_aug = [c for c in df_augmented_filtered.columns if c not in non_feature_cols]
X_aug, y_aug, cols_aug = prepare_features(df_augmented_filtered, feature_cols_aug)
all_diseases_aug = np.unique(y_aug)

# Same split protocol as for the base data: 90/10, stratified by disease, seed 42.
indices_aug = np.arange(len(df_augmented_filtered))
train_idx_aug, test_idx_aug = train_test_split(
    indices_aug, test_size=0.1, random_state=42, stratify=y_aug
)

X_train_aug = X_aug.iloc[train_idx_aug]
X_test_aug = X_aug.iloc[test_idx_aug]
y_train_aug = y_aug[train_idx_aug]
y_test_aug = y_aug[test_idx_aug]
# Full training rows, needed for the 'disease_category' column during training.
df_train_aug = df_augmented_filtered.iloc[train_idx_aug]

results_aug, cat_clf_aug, _, _ = evaluate_hierarchical(
    X_train_aug, X_test_aug, y_train_aug, y_test_aug, df_train_aug, all_diseases_aug
)

print(f"\nResults on Augmented Data:")
for metric, value in results_aug.items():
    print(f"  {metric}: {value*100:.2f}%")

EVALUATION ON AUGMENTED DATA (Hierarchical)
  Training Category Classifier...
DEBUG: Training SymptomCategoryClassifier with shape (186648, 375)
  Training 14 Specialist Models...
DEBUG: Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (15056, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Dermatological' with shape (13819, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (6798, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (19178, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (23909, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Hematology and Oncology' with shape (4010, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Infectious Diseases' with shape (5438, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Mental and Behavioral Health' with

In [10]:
# Comparison summary
# Side-by-side table of the single-split metrics. 'Difference' is the augmented
# score minus the real-data score, expressed in percentage points.
print("\n" + "="*70)
print("COMPARISON: Real Data vs Augmented Data (Hierarchical)")
print("="*70)

comparison_df = pd.DataFrame({
    'Metric': list(results_base.keys()),
    'Real Only': [f"{v*100:.2f}%" for v in results_base.values()],
    'Augmented': [f"{v*100:.2f}%" for v in results_aug.values()],
    'Difference': [f"{(results_aug[k] - results_base[k])*100:+.2f}%" for k in results_base.keys()]
})

print(comparison_df.to_string(index=False))


COMPARISON: Real Data vs Augmented Data (Hierarchical)
  Metric Real Only Augmented Difference
   Top-1    80.78%    80.46%     -0.32%
   Top-3    93.30%    93.52%     +0.22%
   Top-5    95.96%    96.27%     +0.31%
Macro-F1    67.97%    66.76%     -1.21%


---
# Part 5: 5-Fold Cross-Validation

In [11]:
def cross_validate(X, y, df_full, disease_classes, n_folds=5):
    """Run stratified k-fold cross-validation of the hierarchical model.

    Each fold trains and scores a fresh model via ``evaluate_hierarchical`` and
    prints its Top-1 accuracy.

    Args:
        X: Feature DataFrame (sliced positionally with ``iloc``).
        y: Array of disease labels, used for stratification.
        df_full: Full DataFrame row-aligned with ``X``; its training slice is
            handed to ``evaluate_hierarchical``.
        disease_classes: Disease names defining the class index space.
        n_folds: Number of folds; splits are shuffled with a fixed seed of 42.

    Returns:
        Dict mapping each metric name to ``{'mean': ..., 'std': ...}`` across
        folds, where 'std' is ``np.std`` with its default settings.
    """
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    row_positions = np.arange(len(y))

    fold_results = []
    for fold_no, (train_pos, test_pos) in enumerate(splitter.split(row_positions, y)):
        fold_metrics, _, _, _ = evaluate_hierarchical(
            X.iloc[train_pos],
            X.iloc[test_pos],
            y[train_pos],
            y[test_pos],
            df_full.iloc[train_pos],
            disease_classes,
        )
        fold_results.append(fold_metrics)

        print(f"  Fold {fold_no+1}: Top-1={fold_metrics['Top-1']*100:.2f}%")

    # Collapse the per-fold dicts into mean/std per metric.
    summary = {}
    for metric in fold_results[0]:
        per_fold = [metrics[metric] for metrics in fold_results]
        summary[metric] = {'mean': np.mean(per_fold), 'std': np.std(per_fold)}

    return summary

In [12]:
# 5-Fold CV on BASE data
print("="*70)
print("5-FOLD CV ON BASE/REAL DATA")
print("="*70)

cv_results_base = cross_validate(X_base, y_base, df_base_filtered, all_diseases_base)

# Report mean ± standard deviation across folds, in percent.
# (The separator was previously the mis-encoded sequence "Â±".)
print("\nCross-Validation Results (Real Data):")
for metric, stats in cv_results_base.items():
    print(f"  {metric}: {stats['mean']*100:.2f}% ± {stats['std']*100:.2f}%")

5-FOLD CV ON BASE/REAL DATA
  Training Category Classifier...
DEBUG: Training SymptomCategoryClassifier with shape (197458, 377)
  Training 17 Specialist Models...
DEBUG: Training SymptomDiseaseClassifier for category 'Allergy and Immunology' with shape (2210, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (14837, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Dermatological' with shape (18305, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (6180, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (20006, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Genetic and Congenital Disorders' with shape (123, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (22013, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Hematology and Oncology' with shape (35

In [13]:
# 5-Fold CV on AUGMENTED data
print("="*70)
print("5-FOLD CV ON AUGMENTED DATA")
print("="*70)

cv_results_aug = cross_validate(X_aug, y_aug, df_augmented_filtered, all_diseases_aug)

# Report mean ± standard deviation across folds, in percent.
# (The separator was previously the mis-encoded sequence "Â±".)
print("\nCross-Validation Results (Augmented):")
for metric, stats in cv_results_aug.items():
    print(f"  {metric}: {stats['mean']*100:.2f}% ± {stats['std']*100:.2f}%")

5-FOLD CV ON AUGMENTED DATA
  Training Category Classifier...
DEBUG: Training SymptomCategoryClassifier with shape (165909, 375)
  Training 14 Specialist Models...
DEBUG: Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (13384, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Dermatological' with shape (12283, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (6042, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (17047, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (21255, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Hematology and Oncology' with shape (3563, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Infectious Diseases' with shape (4827, 375)
DEBUG: Training SymptomDiseaseClassifier for category 'Mental and Behavioral Health' with shape (12874, 3

---
# Part 6: Ablation (Demographics)

In [14]:
# Ablation: With vs Without Demographics
print("="*70)
print("ABLATION: WITH VS WITHOUT DEMOGRAPHICS")
print("="*70)

# Prepare demo data
# NOTE(review): every 'sex' value other than the exact string 'M' is encoded as 0 —
# confirm the value coding used in the demographics CSV.
df_demo_filtered['sex_encoded'] = (df_demo_filtered['sex'] == 'M').astype(int)
# Age is scaled by a fixed factor of 100.
df_demo_filtered['age_normalized'] = df_demo_filtered['age'] / 100.0

# Same symptom features as the augmented run, plus the two derived demographic columns.
feature_cols_demo = [c for c in df_augmented_filtered.columns if c not in non_feature_cols] + ['age_normalized', 'sex_encoded']

# Ensure columns exist
feature_cols_demo = [c for c in feature_cols_demo if c in df_demo_filtered.columns]

X_demo = df_demo_filtered[feature_cols_demo]
y_demo = df_demo_filtered['diseases'].values
all_diseases_demo = np.unique(y_demo)

# Same split protocol as the other single-split evaluations: 90/10, stratified, seed 42.
indices_demo = np.arange(len(df_demo_filtered))
train_idx_demo, test_idx_demo = train_test_split(
    indices_demo, test_size=0.1, random_state=42, stratify=y_demo
)

X_train_demo = X_demo.iloc[train_idx_demo]
X_test_demo = X_demo.iloc[test_idx_demo]
y_train_demo = y_demo[train_idx_demo]
y_test_demo = y_demo[test_idx_demo]
df_train_demo = df_demo_filtered.iloc[train_idx_demo]

results_demo, _, _, _ = evaluate_hierarchical(
    X_train_demo, X_test_demo, y_train_demo, y_test_demo, df_train_demo, all_diseases_demo
)

# The "without demographics" baseline is the augmented-data result computed earlier.
print(f"\nWith Demographics: Top-1={results_demo['Top-1']*100:.2f}%")
print(f"Without Demographics: Top-1={results_aug['Top-1']*100:.2f}%")
print(f"Demographics Contribution: {(results_demo['Top-1'] - results_aug['Top-1'])*100:+.2f}%")

ABLATION: WITH VS WITHOUT DEMOGRAPHICS
  Training Category Classifier...
DEBUG: Training SymptomCategoryClassifier with shape (186648, 377)
  Training 14 Specialist Models...
DEBUG: Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (15056, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Dermatological' with shape (13819, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (6798, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (19178, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (23909, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Hematology and Oncology' with shape (4010, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Infectious Diseases' with shape (5438, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Mental and Behavioral Health' with shap

In [15]:
# Summary: Ablation Results
# Collects Top-1 / Top-5 (in percent) of the three single-split evaluations.
# NOTE(review): the row labels describe the augmented files as "Real + Synthetic";
# the loaded augmented set has fewer rows and diseases than the base set, so
# confirm that these labels match how the files were produced.
print("\n" + "="*70)
print("ABLATION STUDY SUMMARY")
print("="*70)

ablation_df = pd.DataFrame({
    'Configuration': ['Real Data Only', 'Real + Synthetic', 'Real + Synthetic + Demographics'],
    'Top-1': [results_base['Top-1']*100, results_aug['Top-1']*100, results_demo['Top-1']*100],
    'Top-5': [results_base['Top-5']*100, results_aug['Top-5']*100, results_demo['Top-5']*100]
})

print(ablation_df.to_string(index=False))


ABLATION STUDY SUMMARY
                  Configuration     Top-1     Top-5
                 Real Data Only 80.780294 95.964834
               Real + Synthetic 80.457110 96.272723
Real + Synthetic + Demographics 82.424418 97.159940


---
# Part 7: Error Analysis

In [16]:
# Re-run prediction to get confusion matrix from Hierarchical model (Base Data)
print("Generating confusion matrix for Real Data...")

cat_clf_err, specialist_err = train_hierarchical_model(X_train_base, y_train_base, df_train_base)
y_proba_err = predict_hierarchical(X_test_base, cat_clf_err, specialist_err, all_diseases_base)

# Map to indices (positions within all_diseases_base)
y_pred_err_idx = np.argmax(y_proba_err, axis=1)
disease_to_idx = {d: i for i, d in enumerate(all_diseases_base)}
y_test_err_idx = np.array([disease_to_idx.get(d, -1) for d in y_test_base])

# Keep only test rows whose label is a known class
valid = y_test_err_idx != -1

# Pass the full label set explicitly. Without `labels`, confusion_matrix builds its
# rows/columns from the labels that occur in y_true or y_pred only; any disease that
# is absent from both would be dropped, shifting every later row so that cm[i, j]
# no longer corresponds to all_diseases_base[i] / all_diseases_base[j].
cm = confusion_matrix(
    y_test_err_idx[valid],
    y_pred_err_idx[valid],
    labels=np.arange(len(all_diseases_base)),
)
assert cm.shape == (len(all_diseases_base), len(all_diseases_base))

print(f"Confusion matrix shape: {cm.shape}")

Generating confusion matrix for Real Data...
  Training Category Classifier...
DEBUG: Training SymptomCategoryClassifier with shape (222140, 377)
  Training 17 Specialist Models...
DEBUG: Training SymptomDiseaseClassifier for category 'Allergy and Immunology' with shape (2486, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (16687, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Dermatological' with shape (20589, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (6949, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (22510, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Genetic and Congenital Disorders' with shape (139, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (24768, 377)
DEBUG: Training SymptomDiseaseClassifier for category 'Hematology and Oncolog

In [17]:
# Find most confused pairs
def find_confusion_pairs(cm, disease_names, top_k=10):
    """Return the most frequent misclassifications found in a confusion matrix.

    Args:
        cm: Square 2-D array where ``cm[i, j]`` counts samples of true class ``i``
            predicted as class ``j``.
        disease_names: Class names indexed by the same positions as ``cm``.
        top_k: Maximum number of pairs to return.

    Returns:
        List of dicts with keys 'true', 'pred' and 'count', ordered by
        descending count; ties keep row-major order. Diagonal cells and zero
        cells are never reported.
    """
    n = cm.shape[0]
    # argwhere walks the matrix in row-major order, the same order a nested i/j loop would.
    nonzero_cells = np.argwhere(cm[:n, :n] > 0)
    misclassified = [
        {'true': disease_names[i], 'pred': disease_names[j], 'count': cm[i, j]}
        for i, j in nonzero_cells
        if i != j
    ]
    # A stable sort keeps the row-major order among pairs with equal counts.
    ranked = sorted(misclassified, key=lambda pair: pair['count'], reverse=True)
    return ranked[:top_k]

confused_pairs = find_confusion_pairs(cm, all_diseases_base, top_k=15)

print("Top 15 Most Confused Disease Pairs:")
print("-" * 80)
for p in confused_pairs:
    # Names are cut to 35 characters; the true label is padded so the arrows line up.
    # (The arrow was previously the mis-encoded sequence "â†’".)
    print(f"  {p['count']:3d}x: {p['true'][:35]:35s} → {p['pred'][:35]}")

Top 15 Most Confused Disease Pairs:
--------------------------------------------------------------------------------
   40x: neurosis                            → induced abortion
   27x: cholecystitis                       → galactorrhea of unknown cause
   23x: uterine cancer                      → spondylitis
   22x: induced abortion                    → neurosis
   21x: dental caries                       → polycystic ovarian syndrome (pcos)
   21x: guillain barre syndrome             → tonsillar hypertrophy
   21x: skin cancer                         → sinus bradycardia
   20x: pseudotumor cerebri                 → scabies
   19x: plantar fasciitis                   → acute bronchospasm
   19x: scabies                             → peritonitis
   18x: male genitalia infection            → peritonitis
   17x: blepharitis                         → cornea infection
   17x: syringomyelia                       → bladder disorder
   16x: cysticercosis          

In [18]:
# Per-class accuracy
def per_class_accuracy(y_true_idx, y_pred_idx, disease_names, top_worst=10):
    """Rank classes by the share of their samples that were predicted correctly.

    Args:
        y_true_idx: Array of true class indices.
        y_pred_idx: Array of predicted class indices, aligned with ``y_true_idx``.
        disease_names: Class names; position ``i`` names class index ``i``.
        top_worst: Number of lowest-accuracy classes to return.

    Returns:
        List of ``(name, accuracy, n_samples)`` tuples sorted by ascending
        accuracy (ties keep class-index order), truncated to ``top_worst``.
        Classes with no true samples are skipped.
    """
    scored = []
    for class_idx, name in enumerate(disease_names):
        is_class = y_true_idx == class_idx
        support = is_class.sum()
        if support > 0:
            hit_rate = (y_pred_idx[is_class] == class_idx).mean()
            scored.append((name, hit_rate, support))

    return sorted(scored, key=lambda entry: entry[1])[:top_worst]

# Rank diseases by per-class accuracy on the held-out base test set and show the 15 worst.
worst_classes = per_class_accuracy(y_test_err_idx[valid], y_pred_err_idx[valid], all_diseases_base, top_worst=15)

print("\nWorst Performing Diseases (Lowest Accuracy):")
print("-" * 60)
for cls, acc, count in worst_classes:
    # Accuracy in percent, followed by the number of test samples of that disease.
    print(f"  {acc*100:5.1f}% ({count:3d} samples): {cls}")


Worst Performing Diseases (Lowest Accuracy):
------------------------------------------------------------
    0.0% (  1 samples): acute fatty liver of pregnancy (aflp)
    0.0% (  4 samples): anemia of chronic disease
    0.0% (  1 samples): aphakia
    0.0% (  1 samples): birth trauma
    0.0% (  1 samples): blepharospasm
    0.0% (  2 samples): breast cyst
    0.0% (  1 samples): chronic kidney disease
    0.0% (  1 samples): cryptococcosis
    0.0% (  1 samples): cushing syndrome
    0.0% (  1 samples): cystic fibrosis
    0.0% (  1 samples): decubitus ulcer
    0.0% (  1 samples): diabetes insipidus
    0.0% (  2 samples): dislocation of the ankle
    0.0% (  1 samples): edward syndrome
    0.0% (  1 samples): empyema


---
# Part 8: Save Final Results

In [19]:
# Save results to JSON
final_results = {
    'real_only': results_base,
    'augmented': results_aug,
    'augmented_demo': results_demo,
    'cv_real': {k: {'mean': v['mean'], 'std': v['std']} for k, v in cv_results_base.items()},
    'cv_augmented': {k: {'mean': v['mean'], 'std': v['std']} for k, v in cv_results_aug.items()}
}

results_path = project_root / 'notebooks' / 'figures' / 'rigorous_eval_results.json'
# Create the output folder first; open(..., 'w') raises FileNotFoundError when the
# parent directory does not exist (e.g. on a fresh checkout).
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, 'w', encoding='utf-8') as f:
    # default=float converts numeric scalars the json module cannot serialize natively.
    json.dump(final_results, f, indent=2, default=float)

print("Results saved to figures/rigorous_eval_results.json")

# Print final summary table
print("\n" + "="*70)
print("FINAL RESULTS SUMMARY (Hierarchical Model)")
print("="*70)
print(ablation_df.to_string(index=False))
# Mean ± standard deviation of Top-1 across the CV folds
# (the separator was previously the mis-encoded sequence "Â±").
print("\nCross-Validation (Real): ", f"{cv_results_base['Top-1']['mean']*100:.2f}% ± {cv_results_base['Top-1']['std']*100:.2f}%")

Results saved to figures/rigorous_eval_results.json

FINAL RESULTS SUMMARY (Hierarchical Model)
                  Configuration     Top-1     Top-5
                 Real Data Only 80.780294 95.964834
               Real + Synthetic 80.457110 96.272723
Real + Synthetic + Demographics 82.424418 97.159940

Cross-Validation (Real):  80.55% ± 0.07%


---
# Part 9: Semantic Encoder Evaluation

Evaluates how well the `SemanticSymptomEncoder` converts raw symptom text into the 377-dim evidence vector.

**Metrics:**
- **Precision@K**: Of top-K activated symptoms, how many are ground truth?
- **Recall@K**: Of ground truth symptoms, how many are in top-K?
- **MRR**: Mean Reciprocal Rank of ground truth symptoms


In [None]:
def evaluate_encoder(encoder, df: pd.DataFrame, symptom_cols: List[str], n_samples: int = 5000) -> Dict[str, float]:
    """
    Score the semantic encoder with retrieval-style metrics.

    For each sampled row, the names of its active symptoms (value > 0.5) are
    joined into one comma-separated string and fed to the encoder; the
    encoder's top-10 symptoms are then compared with the symptoms that were
    used to build the text.

    Args:
        encoder: Object exposing ``symptoms``, ``encode_symptoms(text)`` (returning
            a dict with a 'symptom_vector' entry) and
            ``get_top_symptoms(vector, top_k, threshold)`` (returning
            ``(name, score)`` pairs).
        df: DataFrame with one column per symptom.
        symptom_cols: Symptom column names to match against the encoder vocabulary.
        n_samples: Maximum number of rows to sample (fixed seed of 42).

    Returns:
        Dict with the mean 'Precision@5', 'Precision@10', 'Recall@5',
        'Recall@10' and 'MRR' over the scored rows, plus 'n_samples', the
        number of rows scored. Rows without any active matched symptom are
        skipped; every mean is 0.0 when no row was scored.
    """
    sample_df = df.sample(n=min(n_samples, len(df)), random_state=42)

    # Match dataset columns to encoder vocabulary entries. Spellings are tried in
    # priority order: lower case with spaces, the name as-is, plain lower case.
    known_symptoms = set(encoder.symptoms)
    col_to_encoder = {}
    for col in symptom_cols:
        for candidate in (col.lower().replace('_', ' '), col, col.lower()):
            if candidate in known_symptoms:
                col_to_encoder[col] = candidate
                break

    print(f"  Matched {len(col_to_encoder)} symptoms between dataset and encoder")

    per_row = {'Precision@5': [], 'Precision@10': [], 'Recall@5': [], 'Recall@10': [], 'MRR': []}

    for _, row in sample_df.iterrows():
        active_symptom_names = [
            enc_name
            for col, enc_name in col_to_encoder.items()
            if col in row and row[col] > 0.5
        ]
        gt_symptoms = set(active_symptom_names)
        if not gt_symptoms:
            continue

        # Synthesize the input text and read back the encoder's ranked symptoms.
        encoded = encoder.encode_symptoms(", ".join(active_symptom_names))
        top_10 = encoder.get_top_symptoms(encoded['symptom_vector'], top_k=10, threshold=0.0)
        top_10_names = [entry[0] for entry in top_10]

        hits_5 = len(gt_symptoms.intersection(top_10_names[:5]))
        hits_10 = len(gt_symptoms.intersection(top_10_names))

        # Precision always divides by K, even when fewer than K symptoms came back.
        per_row['Precision@5'].append(hits_5 / 5)
        per_row['Precision@10'].append(hits_10 / 10)
        per_row['Recall@5'].append(hits_5 / len(gt_symptoms))
        per_row['Recall@10'].append(hits_10 / len(gt_symptoms))

        # Reciprocal rank of the first ground-truth symptom in the top 10 (0 if none).
        reciprocal_rank = 0.0
        for rank, (name, _) in enumerate(top_10, start=1):
            if name in gt_symptoms:
                reciprocal_rank = 1.0 / rank
                break
        per_row['MRR'].append(reciprocal_rank)

    summary = {
        metric: (np.mean(values) if values else 0.0)
        for metric, values in per_row.items()
    }
    summary['n_samples'] = len(per_row['Precision@5'])
    return summary


def evaluate_pipeline(
    encoder, cat_clf, specialist_models: Dict, predict_fn,
    df: pd.DataFrame, symptom_cols: List[str], all_diseases: List[str],
    n_samples: int = 5000
) -> Dict[str, float]:
    """
    Evaluate the full end-to-end pipeline: synthesized text -> encoder -> classifier.

    For every sampled row, the encoder names of its active symptoms (value > 0.5)
    are joined into a comma-separated string and passed to
    ``encoder.encode_symptoms``. The returned ``'symptom_vector'`` entries are
    copied into a feature row laid out like ``symptom_cols`` (unmapped features
    stay 0), and ``predict_fn`` scores that single row.

    Args:
        encoder: Object exposing ``symptoms`` (iterable of symptom names),
            ``symptom_to_idx`` (name -> position mapping) and
            ``encode_symptoms(text)`` returning a mapping with a
            ``'symptom_vector'`` entry indexable by those positions.
        cat_clf: Forwarded unchanged to ``predict_fn``.
        specialist_models: Forwarded unchanged to ``predict_fn``.
        predict_fn: Called as
            ``predict_fn(X, cat_clf, specialist_models, all_diseases)`` with a
            one-row DataFrame; element ``[0]`` of its result is taken as the
            score vector, whose positions are compared against indices into
            ``all_diseases``.
        df: Frame holding the symptom columns and a ``'diseases'`` label column.
        symptom_cols: Classifier feature names, in feature order.
        all_diseases: Disease names; list position is the class index.
        n_samples: Upper bound on the number of rows sampled (``random_state=42``).

    Returns:
        Dict with ``'Pipeline Top-1'``, ``'Pipeline Top-3'``, ``'Pipeline Top-5'``
        and ``'n_samples'``. Rows without any mapped active symptom are skipped,
        and rows whose label is missing from ``all_diseases`` are excluded from
        the metrics; if nothing remains, all values are 0.
    """
    sample_df = df.sample(n=min(n_samples, len(df)), random_state=42)

    # Map encoder symptom names to classifier feature names: exact match first,
    # then the same name with spaces replaced by underscores.
    symptom_col_set = set(symptom_cols)
    encoder_to_col = {}
    for s in encoder.symptoms:
        if s in symptom_col_set:
            encoder_to_col[s] = s
        else:
            normalized = s.replace(' ', '_')
            if normalized in symptom_col_set:
                encoder_to_col[s] = normalized

    col_to_encoder = {v: k for k, v in encoder_to_col.items()}
    print(f"  Mapped {len(encoder_to_col)} encoder symptoms to classifier features")

    # Everything below depends only on the column mapping, not on the row, so it
    # is computed once instead of once per sampled row.
    text_cols = [col for col in symptom_cols
                 if col in col_to_encoder and col in sample_df.columns]
    feature_slots = []  # (position in feature row, position in encoder vector)
    for i, col in enumerate(symptom_cols):
        if col in col_to_encoder:
            enc_idx = encoder.symptom_to_idx.get(col_to_encoder[col], -1)
            if enc_idx >= 0:
                feature_slots.append((i, enc_idx))

    y_true, y_proba_all = [], []

    for idx, row in sample_df.iterrows():
        # Synthesize text from active symptoms
        active_symptoms = [col_to_encoder[col] for col in text_cols if row[col] > 0.5]

        if len(active_symptoms) == 0:
            continue

        synthesized_text = ", ".join(active_symptoms)
        result = encoder.encode_symptoms(synthesized_text)
        symptom_vector = result['symptom_vector']

        # Build classifier feature vector from the encoder scores
        features = np.zeros(len(symptom_cols))
        for feat_pos, enc_idx in feature_slots:
            features[feat_pos] = symptom_vector[enc_idx]

        X_test_row = pd.DataFrame([features], columns=symptom_cols)
        y_proba = predict_fn(X_test_row, cat_clf, specialist_models, all_diseases)

        y_true.append(row['diseases'])
        y_proba_all.append(y_proba[0])

    if len(y_proba_all) == 0:
        return {'Pipeline Top-1': 0.0, 'Pipeline Top-3': 0.0, 'Pipeline Top-5': 0.0, 'n_samples': 0}

    y_proba_all = np.array(y_proba_all)
    y_true = np.array(y_true)

    # Labels that are not part of all_diseases get index -1 and are masked out.
    disease_to_idx = {d: i for i, d in enumerate(all_diseases)}
    y_true_idx = np.array([disease_to_idx.get(d, -1) for d in y_true])
    valid = y_true_idx != -1

    if valid.sum() == 0:
        return {'Pipeline Top-1': 0.0, 'Pipeline Top-3': 0.0, 'Pipeline Top-5': 0.0, 'n_samples': 0}

    y_pred_idx = np.argmax(y_proba_all, axis=1)
    # Pass the full label set explicitly: the sample may not contain every class.
    all_labels = np.arange(len(all_diseases))

    return {
        'Pipeline Top-1': accuracy_score(y_true_idx[valid], y_pred_idx[valid]),
        'Pipeline Top-3': top_k_accuracy_score(y_true_idx[valid], y_proba_all[valid], k=3, labels=all_labels),
        'Pipeline Top-5': top_k_accuracy_score(y_true_idx[valid], y_proba_all[valid], k=5, labels=all_labels),
        'n_samples': int(valid.sum())
    }


In [None]:
# NOTE(review): project-local import placed mid-notebook; the other imports live in the
# first code cell — consider moving this one there so dependencies are visible up front.
from models.architectures.semantic_symptom_encoder import SemanticSymptomEncoder

# Instantiate the encoder on CPU. The evaluation cells below read the global `encoder`.
print("Loading SemanticSymptomEncoder...")
encoder = SemanticSymptomEncoder(device='cpu')
print(f"Encoder ready with {len(encoder.symptoms)} symptoms")


Loading SemanticSymptomEncoder...
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 458 symptoms
Encoder ready with 458 symptoms


In [None]:
print("="*70)
print("SEMANTIC ENCODER EVALUATION")
print("="*70)


def _report_encoder_metrics(metrics: Dict[str, float]) -> None:
    """Print the encoder retrieval metrics of one dataset in the report layout used below."""
    print(f"  Precision@5:  {metrics['Precision@5']*100:.2f}%")
    print(f"  Precision@10: {metrics['Precision@10']*100:.2f}%")
    print(f"  Recall@10:    {metrics['Recall@10']*100:.2f}%")
    print(f"  MRR:          {metrics['MRR']:.4f}")
    print(f"  Samples:      {metrics['n_samples']}")


# Get symptom columns (excluding non-feature columns)
symptom_cols_eval = [c for c in df_base_filtered.columns if c not in non_feature_cols]

print("\n📊 Encoder on BASE (Real) Data:")
encoder_metrics_base = evaluate_encoder(encoder, df_base_filtered, symptom_cols_eval, n_samples=5000)
_report_encoder_metrics(encoder_metrics_base)

print("\n📊 Encoder on AUGMENTED Data:")
symptom_cols_aug_eval = [c for c in df_augmented_filtered.columns if c not in non_feature_cols]
encoder_metrics_aug = evaluate_encoder(encoder, df_augmented_filtered, symptom_cols_aug_eval, n_samples=5000)
_report_encoder_metrics(encoder_metrics_aug)

# Comparison: augmented minus base, so a positive value means the encoder scores
# higher on the augmented data.
print("\n📈 Base vs Augmented Comparison:")
print(f"  Precision@5 diff:  {(encoder_metrics_aug['Precision@5'] - encoder_metrics_base['Precision@5'])*100:+.2f}%")
print(f"  Recall@10 diff:    {(encoder_metrics_aug['Recall@10'] - encoder_metrics_base['Recall@10'])*100:+.2f}%")

SEMANTIC ENCODER EVALUATION
Matched 361 symptoms between dataset and encoder

Encoder Evaluation Results (5000 samples):
  Precision@5:  74.70%
  Precision@10: 50.92%
  Recall@5:     78.04%
  Recall@10:    99.62%
  MRR:          0.8315


---
# Part 10: End-to-End Pipeline Evaluation
 
Tests the full pipeline: **Symptom text → Encoder → Hierarchical Classifier → Predictions**
 
Compares how using the encoder (vs pre-computed binary features) affects classification accuracy.

In [None]:
print("\n" + "="*70)
print("END-TO-END PIPELINE EVALUATION")
print("="*70)

# NOTE(review): the saved output of this cell is
# `NameError: name 'cat_clf_base' is not defined`. The classifier objects passed below
# must already exist in the kernel when this cell runs. `cat_clf_base` is paired with
# `specialist_err` here — confirm both names refer to models from the same training run
# and that the cell defining them executes earlier under Restart & Run All.
print("\n🔗 Pipeline on BASE Data:")
pipeline_metrics_base = evaluate_pipeline(
    encoder, cat_clf_base, specialist_err,
    predict_hierarchical,
    df_base_filtered, cols_base, all_diseases_base,
    n_samples=5000
)
print(f"  Pipeline Top-1: {pipeline_metrics_base['Pipeline Top-1']*100:.2f}%")
print(f"  Pipeline Top-3: {pipeline_metrics_base['Pipeline Top-3']*100:.2f}%")
print(f"  Pipeline Top-5: {pipeline_metrics_base['Pipeline Top-5']*100:.2f}%")
print(f"  Samples:        {pipeline_metrics_base['n_samples']}")

print("\n📊 Comparison to Pre-computed Binary Features:")
print(f"  Pre-computed Top-1: {results_base['Top-1']*100:.2f}%")
print(f"  Pipeline Top-1:     {pipeline_metrics_base['Pipeline Top-1']*100:.2f}%")
# `degradation` is binary-feature Top-1 minus pipeline Top-1, in percentage points.
# It is printed negated, so a negative "Encoder Effect" means the encoder path scores lower.
# The summary cell further down reads `degradation` as well.
degradation = (results_base['Top-1'] - pipeline_metrics_base['Pipeline Top-1']) * 100
print(f"  Encoder Effect:     {-degradation:+.2f}%")

END-TO-END PIPELINE EVALUATION


NameError: name 'cat_clf_base' is not defined

In [None]:
# Attach the encoder and pipeline metrics to the results collected by earlier cells.
final_results['encoder_base'] = encoder_metrics_base
final_results['encoder_augmented'] = encoder_metrics_aug
final_results['pipeline_base'] = pipeline_metrics_base

# Make sure the output folder exists so a fresh checkout does not fail on the write.
eval_results_path = project_root / 'notebooks' / 'figures' / 'rigorous_eval_results.json'
eval_results_path.parent.mkdir(parents=True, exist_ok=True)
with open(eval_results_path, 'w', encoding='utf-8') as f:
    # default=float converts values json cannot serialize natively by calling float() on them.
    json.dump(final_results, f, indent=2, default=float)

print("\n✅ Updated results saved to figures/rigorous_eval_results.json")

# Final summary table
print("\n" + "="*70)
print("COMPLETE EVALUATION SUMMARY")
print("="*70)
print("\n📊 Hierarchical Classifier (binary features):")
print(f"   Base Top-1: {results_base['Top-1']*100:.2f}% | Augmented Top-1: {results_aug['Top-1']*100:.2f}%")
print("\n🔤 Semantic Encoder:")
print(f"   Base P@5:  {encoder_metrics_base['Precision@5']*100:.1f}% | Aug P@5:  {encoder_metrics_aug['Precision@5']*100:.1f}%")
print(f"   Base MRR:  {encoder_metrics_base['MRR']:.3f}       | Aug MRR:  {encoder_metrics_aug['MRR']:.3f}")
print("\n🔗 End-to-End Pipeline:")
print(f"   Pipeline Top-1: {pipeline_metrics_base['Pipeline Top-1']*100:.2f}%")
# `degradation` comes from the pipeline evaluation cell above (binary minus pipeline Top-1).
print(f"   Effect of using encoder: {-degradation:+.2f}% vs binary features")
