In [10]:
# ==== Resume / Checkpoint Utilities (Fixed) ====
import os, pickle
from pathlib import Path

CKPT_DIR = Path('checkpoints')
CKPT_DIR.mkdir(exist_ok=True)

def get_cv_state_file(cv_name: str) -> Path:
    """Get state file path for a specific CV run."""
    return CKPT_DIR / f'cv_state_{cv_name}.pkl'

def load_cv_state(cv_name: str) -> dict:
    """Load checkpoint state for a CV run."""
    state_file = get_cv_state_file(cv_name)
    if state_file.exists():
        with open(state_file, 'rb') as f:
            state = pickle.load(f)
            print(f"  Loaded checkpoint for '{cv_name}': {len(state.get('fold_results', {}))} folds completed")
            return state
    return {'fold_results': {}}  # fold_idx -> results dict

def save_cv_state(cv_name: str, state: dict):
    """Save checkpoint state for a CV run."""
    state_file = get_cv_state_file(cv_name)
    with open(state_file, 'wb') as f:
        pickle.dump(state, f)

def clear_cv_state(cv_name: str):
    """Clear checkpoint for a CV run (use when starting fresh)."""
    state_file = get_cv_state_file(cv_name)
    if state_file.exists():
        os.remove(state_file)
        print(f"  Cleared checkpoint for '{cv_name}'")

print('Resume utilities loaded. Checkpoints stored in:', CKPT_DIR.absolute())


Resume utilities loaded. Checkpoints stored in: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis\notebooks\checkpoints


# Rigorous Evaluation for Research Paper

This notebook addresses reviewer feedback by providing:
1. **Real vs Synthetic evaluation** - Separate performance on base data vs augmented data
2. **5-Fold Cross-Validation** - Statistical validity of results
3. **Ablation Studies** - Contribution of each component
4. **Error Analysis** - Confusion matrices and failure patterns

**Update**: Evaluating using the custom **Hierarchical Symptom-to-Disease** architecture (Category -> Disease) instead of flat classification.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json, joblib, warnings, sys, os, time, re
from typing import Dict, List, Any, Optional

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

project_root = Path(os.getcwd()).parent
sys.path.insert(0, str(project_root))

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (classification_report, accuracy_score, top_k_accuracy_score,
    confusion_matrix, f1_score, precision_score, recall_score)
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Import custom hierarchical models
from models.architectures.symptom_classifier import SymptomCategoryClassifier, SymptomDiseaseClassifier

print(f"Project root: {project_root}")



Project root: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis


---
# Part 1: Load All Data Versions

In [2]:
# Data paths
base_data_path = project_root / "data" / "processed" / "symptoms" / "symptoms_to_disease_cleaned.csv"
augmented_no_demo_path = project_root / "data" / "processed" / "symptoms" / "symptoms_augmented_no_demographics.csv"
augmented_with_demo_path = project_root / "data" / "processed" / "symptoms" / "symptoms_augmented_with_demographics.csv"

# Load symptom vocabulary
with open(project_root / "data" / "symptom_vocabulary.json") as f:
    symptom_cols = json.load(f)

print(f"Base data exists: {base_data_path.exists()}")
print(f"Augmented (no demo) exists: {augmented_no_demo_path.exists()}")
print(f"Augmented (with demo) exists: {augmented_with_demo_path.exists()}")

Base data exists: True
Augmented (no demo) exists: True
Augmented (with demo) exists: True


In [3]:
# Load data 
df_base = pd.read_csv(base_data_path)
print(f"Base dataset: {len(df_base):,} rows, {df_base['diseases'].nunique()} diseases")

df_augmented = pd.read_csv(augmented_no_demo_path)
print(f"Augmented dataset: {len(df_augmented):,} rows, {df_augmented['diseases'].nunique()} diseases")

df_demo = pd.read_csv(augmented_with_demo_path)
print(f"Augmented + Demographics: {len(df_demo):,} rows, {df_demo['diseases'].nunique()} diseases")

Base dataset: 206,267 rows, 627 diseases
Augmented dataset: 207,518 rows, 627 diseases
Augmented + Demographics: 207,518 rows, 627 diseases


In [15]:
# REMOVED: sanitize_column_names function
# LightGBM works fine with spaces in column names.
# This function was overly aggressive and broke symptom mapping.
print("Sanitization disabled - using natural symptom names with spaces")


Sanitization disabled - using natural symptom names with spaces


In [4]:
def filter_min_samples(df, min_samples=2):
    """Filter to keep only diseases with at least min_samples."""
    disease_counts = df['diseases'].value_counts()
    valid_diseases = disease_counts[disease_counts >= min_samples].index
    df_filtered = df[df['diseases'].isin(valid_diseases)].copy()
    
    print(f"Original: {len(df):,} rows, {df['diseases'].nunique()} diseases")
    print(f"Filtered: {len(df_filtered):,} rows, {df_filtered['diseases'].nunique()} diseases")
    print(f"Removed: {df['diseases'].nunique() - df_filtered['diseases'].nunique()} diseases with <{min_samples} samples")
    
    return df_filtered

# Filter for stable evaluation
df_base_filtered = filter_min_samples(df_base, min_samples=5)
df_augmented_filtered = filter_min_samples(df_augmented, min_samples=5)
df_demo_filtered = filter_min_samples(df_demo, min_samples=5)

Original: 206,267 rows, 627 diseases
Filtered: 206,173 rows, 588 diseases
Removed: 39 diseases with <5 samples
Original: 207,518 rows, 627 diseases
Filtered: 207,492 rows, 616 diseases
Removed: 11 diseases with <5 samples
Original: 207,518 rows, 627 diseases
Filtered: 207,492 rows, 616 diseases
Removed: 11 diseases with <5 samples


---
# Part 2: Define Hierarchical Model Utilities

We need to replicate the 2-stage inference process:
1. **Category Classifier** identifies likely disease categories
2. **Specialist Classifiers** predict specific diseases within those categories

In [8]:
def prepare_features(df, feature_cols):
    """Prepare features and labels from dataframe."""
    available_cols = [c for c in feature_cols if c in df.columns]
    # Return DataFrame to preserve column names for feature importance/LGBM
    X = df[available_cols]
    y_disease = df['diseases'].values
    return X, y_disease, available_cols

from joblib import Parallel, delayed
import multiprocessing

def _train_specialist(category, X_train, y_train, y_categories):
    mask = y_categories == category
    X_cat = X_train[mask]
    y_cat = y_train[mask]

    if len(y_cat) < 5:
        return None, None

    model = SymptomDiseaseClassifier(category=category, n_estimators=100)
    model.fit(X_cat, y_cat)
    return category, model


def train_hierarchical_model(X_train, y_train, df_train_full, n_jobs=None):
    """Train hierarchical model (Category -> Specialist Disease Classifiers)."""

    if n_jobs is None:
        n_jobs = max(1, multiprocessing.cpu_count() - 1)

    # 1. Train Category Classifier (keep serial)
    print("  Training Category Classifier...")
    y_categories = df_train_full['disease_category'].values

    cat_encoder = LabelEncoder()
    y_cat_encoded = cat_encoder.fit_transform(y_categories)

    cat_clf = SymptomCategoryClassifier(n_estimators=100)
    cat_clf.fit(X_train, y_cat_encoded)
    cat_clf.encoder_ = cat_encoder

    # 2. Train specialist models (PARALLEL)
    unique_categories = np.unique(y_categories)
    print(f"  Training {len(unique_categories)} Specialist Models (parallel, n_jobs={n_jobs})...")

    results = Parallel(n_jobs=n_jobs, backend="loky", verbose=0)(
        delayed(_train_specialist)(cat, X_train, y_train, y_categories)
        for cat in unique_categories
    )

    specialist_models = {
        cat: model for cat, model in results if model is not None
    }

    return cat_clf, specialist_models

def predict_hierarchical(X_test, cat_clf, specialist_models, all_possible_diseases):
    """Hierarchical prediction logic."""
    # Get category probabilities
    cat_probs = cat_clf.predict_proba(X_test)
    cat_encoder = cat_clf.encoder_
    
    # Initialize output matrix
    final_probs = np.zeros((len(X_test), len(all_possible_diseases)))
    disease_to_idx = {d: i for i, d in enumerate(all_possible_diseases)}
    
    # Route predictions
    # cat_clf.categories consists of the sorted integer labels from training
    for i, cat_idx in enumerate(cat_clf.categories):
        # Decode to string name to find correct specialist
        cat_name = cat_encoder.inverse_transform([cat_idx])[0]
        
        if cat_name not in specialist_models: continue
        
        model = specialist_models[cat_name]
        # Predict only for this category's diseases
        specialist_probs = model.predict_proba(X_test)
        
        # Weight by P(Category)
        weight = cat_probs[:, i][:, np.newaxis]
        
        # Add weighted probs to final matrix
        for local_idx, disease in enumerate(model.diseases):
            if disease in disease_to_idx:
                global_idx = disease_to_idx[disease]
                final_probs[:, global_idx] += weight.ravel() * specialist_probs[:, local_idx]
                
    return final_probs

def evaluate_hierarchical(X_train, X_test, y_train, y_test, df_train_full, all_disease_classes):
    """Train and evaluate using hierarchical approach."""
    # Train
    cat_clf, specialist_models = train_hierarchical_model(X_train, y_train, df_train_full)
    
    # Predict
    y_proba = predict_hierarchical(X_test, cat_clf, specialist_models, all_disease_classes)
    
    # Map true labels to indices
    y_pred_idx = np.argmax(y_proba, axis=1)
    # We need to map y_test strings to indices in all_disease_classes
    disease_to_idx = {d: i for i, d in enumerate(all_disease_classes)}
    y_test_idx = np.array([disease_to_idx.get(d, -1) for d in y_test])
    
    # Filter out valid labels (in case test set has disease not in training classes, though unlikely with split)
    valid_mask = y_test_idx != -1
    
    all_labels = np.arange(len(all_disease_classes))
    
    results = {
        'Top-1': accuracy_score(y_test_idx[valid_mask], y_pred_idx[valid_mask]),
        'Top-3': top_k_accuracy_score(y_test_idx[valid_mask], y_proba[valid_mask], k=3, labels=all_labels),
        'Top-5': top_k_accuracy_score(y_test_idx[valid_mask], y_proba[valid_mask], k=5, labels=all_labels),
        'Macro-F1': f1_score(y_test_idx[valid_mask], y_pred_idx[valid_mask], average='macro')
    }
    
    return results, cat_clf, specialist_models, y_proba



In [18]:
# Define feature columns (same for all)
non_feature_cols = ['diseases', 'disease_category', 'symptoms', 'age', 'sex', 'age_normalized', 'sex_encoded']
feature_cols_base = [c for c in df_base.columns if c not in non_feature_cols]
print(f"Feature columns: {len(feature_cols_base)}")

Feature columns: 375


---
# Part 3: Evaluation on Base/Real Data Only

In [19]:
print("="*70)
print("EVALUATION ON BASE/REAL DATA ONLY (Hierarchical)")
print("="*70)

X_base, y_base, cols_base = prepare_features(df_base_filtered, feature_cols_base)
all_diseases_base = np.unique(y_base)

# Split indices
indices = np.arange(len(df_base_filtered))
train_idx, test_idx = train_test_split(
    indices, test_size=0.1, random_state=42, stratify=y_base
)

X_train_base = X_base.iloc[train_idx]
X_test_base = X_base.iloc[test_idx]
y_train_base = y_base[train_idx]
y_test_base = y_base[test_idx]
df_train_base = df_base_filtered.iloc[train_idx]

results_base, cat_clf_base, _, _ = evaluate_hierarchical(
    X_train_base, X_test_base, y_train_base, y_test_base, df_train_base, all_diseases_base
)

print(f"\nResults on Real Data Only:")
for metric, value in results_base.items():
    print(f"  {metric}: {value*100:.2f}%")

EVALUATION ON BASE/REAL DATA ONLY (Hierarchical)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (185555, 375)
  Training 17 Specialist Models...
Training SymptomDiseaseClassifier for category 'Allergy and Immunology' with shape (2486, 375)
Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (16633, 375)
Training SymptomDiseaseClassifier for category 'Dermatological' with shape (15106, 375)
Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (6882, 375)
Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (22510, 375)
Training SymptomDiseaseClassifier for category 'Genetic and Congenital Disorders' with shape (139, 375)
Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (24769, 375)
Training SymptomDiseaseClassifier for category 'Hematology and Oncology' with shape (4021, 375)
Training SymptomDiseaseClassifier

---
# Part 4: Evaluation on Augmented Data

In [20]:
print("="*70)
print("EVALUATION ON AUGMENTED DATA (Hierarchical)")
print("="*70)

feature_cols_aug = [c for c in df_augmented_filtered.columns if c not in non_feature_cols]
X_aug, y_aug, cols_aug = prepare_features(df_augmented_filtered, feature_cols_aug)
all_diseases_aug = np.unique(y_aug)

indices_aug = np.arange(len(df_augmented_filtered))
train_idx_aug, test_idx_aug = train_test_split(
    indices_aug, test_size=0.1, random_state=42, stratify=y_aug
)

X_train_aug = X_aug.iloc[train_idx_aug]
X_test_aug = X_aug.iloc[test_idx_aug]
y_train_aug = y_aug[train_idx_aug]
y_test_aug = y_aug[test_idx_aug]
df_train_aug = df_augmented_filtered.iloc[train_idx_aug]

results_aug, cat_clf_aug, _, _ = evaluate_hierarchical(
    X_train_aug, X_test_aug, y_train_aug, y_test_aug, df_train_aug, all_diseases_aug
)

print(f"\nResults on Augmented Data:")
for metric, value in results_aug.items():
    print(f"  {metric}: {value*100:.2f}%")

EVALUATION ON AUGMENTED DATA (Hierarchical)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (186742, 456)
  Training 17 Specialist Models...
Training SymptomDiseaseClassifier for category 'Allergy and Immunology' with shape (2486, 456)
Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (16741, 456)
Training SymptomDiseaseClassifier for category 'Dermatological' with shape (15151, 456)
Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (7148, 456)
Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (22590, 456)
Training SymptomDiseaseClassifier for category 'Genetic and Congenital Disorders' with shape (139, 456)
Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (24814, 456)
Training SymptomDiseaseClassifier for category 'Hematology and Oncology' with shape (4180, 456)
Training SymptomDiseaseClassifier for 

In [21]:
print("="*70)
print("EVALUATION ON AUGMENTED DATA + Demographics (Hierarchical)")
print("="*70)

feature_cols_demo = [c for c in df_demo_filtered.columns if c not in non_feature_cols]
X_demo, y_demo, cols_demo = prepare_features(df_demo_filtered, feature_cols_demo)
all_diseases_demo = np.unique(y_demo)

indices_demo = np.arange(len(df_demo_filtered))
train_idx_demo, test_idx_demo = train_test_split(
    indices_demo, test_size=0.1, random_state=42, stratify=y_demo
)

X_train_demo = X_demo.iloc[train_idx_demo]
X_test_demo = X_demo.iloc[test_idx_demo]
y_train_demo = y_demo[train_idx_demo]
y_test_demo = y_demo[test_idx_demo]
df_train_demo = df_demo_filtered.iloc[train_idx_demo]

results_demo, cat_clf_demo, _, _ = evaluate_hierarchical(
    X_train_demo, X_test_demo, y_train_demo, y_test_demo, df_train_demo, all_diseases_demo
)

print(f"\nResults on Augmented + Demographics Data:")
for metric, value in results_demo.items():
    print(f"  {metric}: {value*100:.2f}%")

EVALUATION ON AUGMENTED DATA + Demographics (Hierarchical)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (186742, 456)
  Training 17 Specialist Models...
Training SymptomDiseaseClassifier for category 'Allergy and Immunology' with shape (2486, 456)
Training SymptomDiseaseClassifier for category 'Cardiovascular and Circulatory' with shape (16741, 456)
Training SymptomDiseaseClassifier for category 'Dermatological' with shape (15151, 456)
Training SymptomDiseaseClassifier for category 'Endocrine and Metabolic' with shape (7148, 456)
Training SymptomDiseaseClassifier for category 'Gastrointestinal and Hepatic' with shape (22590, 456)
Training SymptomDiseaseClassifier for category 'Genetic and Congenital Disorders' with shape (139, 456)
Training SymptomDiseaseClassifier for category 'Genitourinary and Reproductive' with shape (24814, 456)
Training SymptomDiseaseClassifier for category 'Hematology and Oncology' with shape (4180, 456)
Training SymptomDisease

In [22]:
# Comparison summary
print("\n" + "="*70)
print("COMPARISON: Real Data vs Augmented Data (Hierarchical)")
print("="*70)

comparison_df = pd.DataFrame({
    'Metric': list(results_base.keys()),
    'Real Only': [f"{v*100:.2f}%" for v in results_base.values()],
    'Augmented': [f"{v*100:.2f}%" for v in results_aug.values()],
    'Difference': [f"{(results_aug[k] - results_base[k])*100:+.2f}%" for k in results_base.keys()]
})

print(comparison_df.to_string(index=False))


COMPARISON: Real Data vs Augmented Data (Hierarchical)
  Metric Real Only Augmented Difference
   Top-1    81.87%    82.21%     +0.35%
   Top-3    94.26%    94.32%     +0.07%
   Top-5    96.68%    96.73%     +0.05%
Macro-F1    69.99%    71.96%     +1.97%


---
# Part 5: 5-Fold Cross-Validation

In [23]:
# --- Resume-aware CV loop (fixed version) ---
def cross_validate(X, y, df_full, disease_classes, n_folds=5, cv_name='default'):
    """
    Perform stratified k-fold cross-validation with Hierarchical model.
    
    Supports resuming from checkpoint: completed folds and their results
    are saved to disk after each fold.
    
    Args:
        cv_name: Unique identifier for this CV run (e.g., 'base', 'augmented').
                 Used to create separate checkpoint files for different runs.
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    # Load any previously completed folds
    cv_state = load_cv_state(cv_name)
    fold_results_dict = cv_state.get('fold_results', {})
    
    indices = np.arange(len(y))
    
    for fold, (train_idx_cv, test_idx_cv) in enumerate(skf.split(indices, y)):
        # Skip if already completed
        if fold in fold_results_dict:
            print(f"  Fold {fold+1}: SKIPPED (already completed, Top-1={fold_results_dict[fold]['Top-1']*100:.2f}%)")
            continue
        
        # Train and evaluate this fold
        X_train_cv = X.iloc[train_idx_cv]
        X_test_cv = X.iloc[test_idx_cv]
        y_train_cv = y[train_idx_cv]
        y_test_cv = y[test_idx_cv]
        df_train_cv = df_full.iloc[train_idx_cv]
        
        results, _, _, _ = evaluate_hierarchical(
            X_train_cv, X_test_cv, y_train_cv, y_test_cv, df_train_cv, disease_classes
        )
        
        # Save fold results immediately after completion
        fold_results_dict[fold] = results
        cv_state['fold_results'] = fold_results_dict
        save_cv_state(cv_name, cv_state)
        
        print(f"  Fold {fold+1}: Top-1={results['Top-1']*100:.2f}% (saved)")
    
    # Aggregate ALL results (including restored ones)
    if len(fold_results_dict) == 0:
        raise ValueError("No fold results available!")
    
    fold_results = [fold_results_dict[i] for i in sorted(fold_results_dict.keys())]
    
    summary = {}
    for metric in fold_results[0].keys():
        values = [r[metric] for r in fold_results]
        summary[metric] = {
            'mean': np.mean(values),
            'std': np.std(values)
        }
    
    print(f"  CV complete: {len(fold_results)} folds aggregated")
    return summary


In [26]:
# 5-Fold CV on BASE data
print("="*70)
print("5-FOLD CV ON BASE/REAL DATA")
print("="*70)

cv_results_base = cross_validate(X_base, y_base, df_base_filtered, all_diseases_base, cv_name='base')

print(f"\nCross-Validation Results (Real Data):")
for metric, stats in cv_results_base.items():
    print(f"  {metric}: {stats['mean']*100:.2f}% ¬± {stats['std']*100:.2f}%")

5-FOLD CV ON BASE/REAL DATA
  Loaded checkpoint for 'base': 4 folds completed
  Fold 1: SKIPPED (already completed, Top-1=82.09%)
  Fold 2: SKIPPED (already completed, Top-1=81.91%)
  Fold 3: SKIPPED (already completed, Top-1=81.54%)
  Fold 4: SKIPPED (already completed, Top-1=81.99%)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (164939, 375)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 5: Top-1=81.71% (saved)
  CV complete: 5 folds aggregated

Cross-Validation Results (Real Data):
  Top-1: 81.85% ¬± 0.20%
  Top-3: 94.23% ¬± 0.07%
  Top-5: 96.68% ¬± 0.05%
  Macro-F1: 69.94% ¬± 0.25%


In [27]:
# 5-Fold CV on AUGMENTED data
print("="*70)
print("5-FOLD CV ON AUGMENTED DATA")
print("="*70)

cv_results_aug = cross_validate(X_aug, y_aug, df_augmented_filtered, all_diseases_aug, cv_name='augmented')

print(f"\nCross-Validation Results (Augmented):")
for metric, stats in cv_results_aug.items():
    print(f"  {metric}: {stats['mean']*100:.2f}% ¬± {stats['std']*100:.2f}%")

5-FOLD CV ON AUGMENTED DATA
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165993, 456)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 1: Top-1=81.71% (saved)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165993, 456)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 2: Top-1=81.50% (saved)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165994, 456)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 3: Top-1=81.62% (saved)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165994, 456)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 4: Top-1=81.66% (saved)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165994, 456)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 5: Top-1=81.63% (saved)
  CV complete: 5 folds aggregated

Cross-Validation Results (Augmen

---
# Part 6: Ablation (Demographics)

In [28]:
# Demographics: Data Preparation + 5-Fold Cross-Validation
print("="*70)
print("5-FOLD CV ON DEMOGRAPHICS DATA")
print("="*70)

# Prepare demographic features
df_demo_filtered['sex_encoded'] = (df_demo_filtered['sex'] == 'M').astype(int)
df_demo_filtered['age_normalized'] = df_demo_filtered['age'] / 100.0

feature_cols_demo = [c for c in df_augmented_filtered.columns if c not in non_feature_cols] + ['age_normalized', 'sex_encoded']
feature_cols_demo = [c for c in feature_cols_demo if c in df_demo_filtered.columns]

X_demo = df_demo_filtered[feature_cols_demo]
y_demo = df_demo_filtered['diseases'].values
all_diseases_demo = np.unique(y_demo)

# 5-Fold Cross-Validation on Demographics data
cv_results_demo = cross_validate(X_demo, y_demo, df_demo_filtered, all_diseases_demo, cv_name='demographics')

print(f"\nCross-Validation Results (With Demographics):")
for metric, stats in cv_results_demo.items():
    print(f"  {metric}: {stats['mean']*100:.2f}% ¬± {stats['std']*100:.2f}%")

# Demographics contribution (comparing CV results)
print(f"\nüìä Demographics Contribution (5-Fold CV):")
print(f"  Augmented (no demo) Top-1: {cv_results_aug['Top-1']['mean']*100:.2f}% ¬± {cv_results_aug['Top-1']['std']*100:.2f}%")
print(f"  With Demographics Top-1:   {cv_results_demo['Top-1']['mean']*100:.2f}% ¬± {cv_results_demo['Top-1']['std']*100:.2f}%")
demo_contribution = (cv_results_demo['Top-1']['mean'] - cv_results_aug['Top-1']['mean']) * 100
print(f"  Demographics Effect:       {demo_contribution:+.2f}%")

5-FOLD CV ON DEMOGRAPHICS DATA
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165993, 458)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 1: Top-1=84.06% (saved)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165993, 458)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 2: Top-1=84.10% (saved)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165994, 458)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 3: Top-1=84.04% (saved)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165994, 458)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 4: Top-1=84.11% (saved)
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (165994, 458)
  Training 17 Specialist Models (parallel, n_jobs=3)...
  Fold 5: Top-1=84.05% (saved)
  CV complete: 5 folds aggregated

Cross-Validation Results (Wit

In [29]:
print("\n" + "="*70)
print("ABLATION STUDY SUMMARY (5-Fold Cross-Validation)")
print("="*70)

ablation_cv_df = pd.DataFrame({
    'Configuration': [
        'Real Data Only', 
        'Augmented (no demo)', 
        'Augmented + Demographics'
    ],
    'Top-1 Mean': [
        f"{cv_results_base['Top-1']['mean']*100:.2f}%",
        f"{cv_results_aug['Top-1']['mean']*100:.2f}%",
        f"{cv_results_demo['Top-1']['mean']*100:.2f}%"
    ],
    'Top-1 Std': [
        f"¬±{cv_results_base['Top-1']['std']*100:.2f}%",
        f"¬±{cv_results_aug['Top-1']['std']*100:.2f}%",
        f"¬±{cv_results_demo['Top-1']['std']*100:.2f}%"
    ],
    'Top-5 Mean': [
        f"{cv_results_base['Top-5']['mean']*100:.2f}%",
        f"{cv_results_aug['Top-5']['mean']*100:.2f}%",
        f"{cv_results_demo['Top-5']['mean']*100:.2f}%"
    ]
})

print(ablation_cv_df.to_string(index=False))


ABLATION STUDY SUMMARY (5-Fold Cross-Validation)
           Configuration Top-1 Mean Top-1 Std Top-5 Mean
          Real Data Only     81.85%    ¬±0.20%     96.68%
     Augmented (no demo)     81.62%    ¬±0.07%     96.54%
Augmented + Demographics     84.07%    ¬±0.03%     97.34%


---
# Part 7: Error Analysis

In [30]:
# Re-run prediction to get confusion matrix from Hierarchical model (Base Data)
print("Generating confusion matrix for Real Data...")

cat_clf_err, specialist_err = train_hierarchical_model(X_train_base, y_train_base, df_train_base)
y_proba_err = predict_hierarchical(X_test_base, cat_clf_err, specialist_err, all_diseases_base)

# Map to indices
y_pred_err_idx = np.argmax(y_proba_err, axis=1)
disease_to_idx = {d: i for i, d in enumerate(all_diseases_base)}
y_test_err_idx = np.array([disease_to_idx.get(d, -1) for d in y_test_base])

# Filter valid only
valid = y_test_err_idx != -1
cm = confusion_matrix(y_test_err_idx[valid], y_pred_err_idx[valid])

print(f"Confusion matrix shape: {cm.shape}")

Generating confusion matrix for Real Data...
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (185555, 375)
  Training 17 Specialist Models (parallel, n_jobs=3)...
Confusion matrix shape: (584, 584)


In [31]:
# Find most confused pairs
def find_confusion_pairs(cm, disease_names, top_k=10):
    """Find most confused disease pairs."""
    n = cm.shape[0]
    pairs = []
    
    for i in range(n):
        for j in range(n):
            if i != j and cm[i, j] > 0:
                pairs.append({
                    'true': disease_names[i],
                    'pred': disease_names[j],
                    'count': cm[i, j]
                })
    
    pairs.sort(key=lambda x: -x['count'])
    return pairs[:top_k]

confused_pairs = find_confusion_pairs(cm, all_diseases_base, top_k=15)

print("Top 15 Most Confused Disease Pairs:")
print("-" * 80)
for p in confused_pairs:
    print(f"  {p['count']:3d}x: {p['true'][:35]:35s} ‚Üí {p['pred'][:35]}")

Top 15 Most Confused Disease Pairs:
--------------------------------------------------------------------------------
   59x: neurosis                            ‚Üí impulse control disorder
   37x: syphilis                            ‚Üí bladder disorder
   23x: cholecystitis                       ‚Üí galactorrhea of unknown cause
   20x: uterine cancer                      ‚Üí testicular torsion
   19x: plantar fasciitis                   ‚Üí acute bronchospasm
   18x: drug withdrawal                     ‚Üí corneal disorder
   17x: chronic glaucoma                    ‚Üí foreign body in the ear
   17x: cysticercosis                       ‚Üí syphilis
   17x: dementia                            ‚Üí polycystic ovarian syndrome (pcos)
   17x: osteoporosis                        ‚Üí acute otitis media
   16x: sarcoidosis                         ‚Üí pseudotumor cerebri
   14x: acute pancreatitis                  ‚Üí galactorrhea of unknown cause
   14x: heart failure                      

In [32]:
# Per-class accuracy
def per_class_accuracy(y_true_idx, y_pred_idx, disease_names, top_worst=10):
    """Calculate per-class accuracy."""
    accuracies = []
    
    for i, cls in enumerate(disease_names):
        mask = y_true_idx == i
        if mask.sum() > 0:
            acc = (y_pred_idx[mask] == i).mean()
            accuracies.append((cls, acc, mask.sum()))
    
    accuracies.sort(key=lambda x: x[1])
    return accuracies[:top_worst]

worst_classes = per_class_accuracy(y_test_err_idx[valid], y_pred_err_idx[valid], all_diseases_base, top_worst=15)

print("\nWorst Performing Diseases (Lowest Accuracy):")
print("-" * 60)
for cls, acc, count in worst_classes:
    print(f"  {acc*100:5.1f}% ({count:3d} samples): {cls}")




Worst Performing Diseases (Lowest Accuracy):
------------------------------------------------------------
    0.0% (  2 samples): abscess of the lung
    0.0% (  4 samples): acariasis
    0.0% (  1 samples): acute fatty liver of pregnancy (aflp)
    0.0% (  4 samples): anemia of chronic disease
    0.0% (  1 samples): avascular necrosis
    0.0% (  1 samples): blepharospasm
    0.0% (  1 samples): breast cancer
    0.0% (  2 samples): breast cyst
    0.0% (  1 samples): chronic kidney disease
    0.0% (  1 samples): cushing syndrome
    0.0% (  2 samples): cyst of the eyelid
    0.0% (  1 samples): cystic fibrosis
    0.0% (  4 samples): ectropion
    0.0% (  1 samples): edward syndrome
    0.0% (  1 samples): empyema


---
# Part 8: Save Final Results

In [33]:
# Save results to JSON
final_results = {
    'real_only': results_base,
    'augmented': results_aug,
    'augmented_demo': results_demo,
    'cv_real': {k: {'mean': v['mean'], 'std': v['std']} for k, v in cv_results_base.items()},
    'cv_augmented': {k: {'mean': v['mean'], 'std': v['std']} for k, v in cv_results_aug.items()}
}

with open(project_root / 'notebooks' / 'figures' / 'rigorous_eval_results.json', 'w') as f:
    json.dump(final_results, f, indent=2, default=float)

print("Results saved to figures/rigorous_eval_results.json")

# Print final summary table
print("\n" + "="*70)
print("FINAL RESULTS SUMMARY (Hierarchical Model)")
print("="*70)
print(ablation_cv_df.to_string(index=False))
print("\nCross-Validation (Real): ", f"{cv_results_base['Top-1']['mean']*100:.2f}% ¬± {cv_results_base['Top-1']['std']*100:.2f}%")

Results saved to figures/rigorous_eval_results.json

FINAL RESULTS SUMMARY (Hierarchical Model)
           Configuration Top-1 Mean Top-1 Std Top-5 Mean
          Real Data Only     81.85%    ¬±0.20%     96.68%
     Augmented (no demo)     81.62%    ¬±0.07%     96.54%
Augmented + Demographics     84.07%    ¬±0.03%     97.34%

Cross-Validation (Real):  81.85% ¬± 0.20%


---
# Part 9: Semantic Encoder Evaluation

Evaluates how well the `SemanticSymptomEncoder` converts raw symptom text into the 377-dim evidence vector.

**Metrics:**
- **Precision@K**: Of top-K activated symptoms, how many are ground truth?
- **Recall@K**: Of ground truth symptoms, how many are in top-K?
- **MRR**: Mean Reciprocal Rank of ground truth symptoms


In [34]:
import importlib
import models.architectures.semantic_symptom_encoder as sse_module
importlib.reload(sse_module)
from models.architectures.semantic_symptom_encoder import SemanticSymptomEncoder

In [35]:
# =============================================================================
# CELL: Multi-Model + Calibration Tuning (Using YOUR SemanticSymptomEncoder)
# =============================================================================
import numpy as np
import json
import random
import time
from itertools import product

CANDIDATE_MODELS = [
    "multi-qa-mpnet-base-dot-v1",
    "all-mpnet-base-v2",
    "all-MiniLM-L12-v2",
    "paraphrase-mpnet-base-v2",
    "paraphrase-MiniLM-L6-v2",
    "sentence-transformers/msmarco-distilbert-cos-v5",
]

THRESHOLDS = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45]
EXPONENTS = [1.0, 1.25, 1.5, 1.75, 2.0]

# Load paraphrases
paraphrase_path = project_root / "data" / "symptom_paraphrases.json"
with open(paraphrase_path, 'r', encoding='utf-8') as f:
    paraphrases = json.load(f)

def evaluate_encoder(encoder, n_tests=200):
    """Evaluate YOUR encoder using paraphrases."""
    testable = [s for s in encoder.symptoms if s in paraphrases]
    random.seed(42)
    
    precision_5, recall_5, mrr = [], [], []
    
    for _ in range(n_tests):
        n = random.randint(2, 5)
        gt_symptoms = set(random.sample(testable, min(n, len(testable))))
        
        phrases = [random.choice(paraphrases[s]) for s in gt_symptoms]
        text = ". ".join(phrases)
        
        # Use YOUR encoder's methods
        result = encoder.encode_symptoms(text)
        top_5 = encoder.get_top_symptoms(result['symptom_vector'], top_k=5, threshold=0.0)
        top_5_names = [s[0] for s in top_5]
        
        hits = len(set(top_5_names) & gt_symptoms)
        precision_5.append(hits / 5)
        recall_5.append(hits / len(gt_symptoms))
        
        for rank, (sym, _) in enumerate(top_5, 1):
            if sym in gt_symptoms:
                mrr.append(1.0 / rank)
                break
        else:
            mrr.append(0.0)
    
    p, r = np.mean(precision_5), np.mean(recall_5)
    return {'P@5': p, 'R@5': r, 'MRR': np.mean(mrr), 'F1': 2*p*r/(p+r+1e-8)}

print("=" * 70)
print("MULTI-MODEL + CALIBRATION TUNING (Using YOUR SemanticSymptomEncoder)")
print(f"Testing {len(CANDIDATE_MODELS)} models √ó {len(THRESHOLDS)} thresholds √ó {len(EXPONENTS)} exponents")
print("=" * 70)

all_results = []

for model_name in CANDIDATE_MODELS:
    print(f"\n‚ñ∂ Testing: {model_name}")
    
    best_score, best_params, best_metrics = 0, (0.25, 1.5), {}
    
    try:
        start = time.time()
        
        for thresh, exp in product(THRESHOLDS, EXPONENTS):
            # Create YOUR encoder with this model + calibration
            encoder = SemanticSymptomEncoder(
                model_name=model_name,
                device='cpu',
                threshold=thresh,
                exponent=exp
            )
            
            metrics = evaluate_encoder(encoder, n_tests=150)
            
            if metrics['F1'] > best_score:
                best_score = metrics['F1']
                best_params = (thresh, exp)
                best_metrics = metrics
            
            del encoder
        
        elapsed = time.time() - start
        
        all_results.append({
            'model': model_name,
            'threshold': best_params[0],
            'exponent': best_params[1],
            **best_metrics,
            'time': elapsed
        })
        
        print(f"   Best: threshold={best_params[0]:.2f}, exponent={best_params[1]:.2f}")
        print(f"   P@5: {best_metrics['P@5']*100:.1f}% | R@5: {best_metrics['R@5']*100:.1f}% | F1: {best_metrics['F1']*100:.1f}%")
        
    except Exception as e:
        print(f"   ‚ùå Failed: {e}")

# Summary
print("\n" + "=" * 70)
print("RESULTS (sorted by F1)")
print("=" * 70)
results_sorted = sorted(all_results, key=lambda x: x['F1'], reverse=True)
print(f"{'Model':<42} {'Thresh':>6} {'Exp':>5} {'P@5':>6} {'R@5':>6} {'F1':>6}")
print("-" * 75)
for r in results_sorted:
    print(f"{r['model']:<42} {r['threshold']:>6.2f} {r['exponent']:>5.2f} {r['P@5']*100:>5.1f}% {r['R@5']*100:>5.1f}% {r['F1']*100:>5.1f}%")

BEST = results_sorted[0]
print(f"\nüèÜ BEST: {BEST['model']} (threshold={BEST['threshold']}, exponent={BEST['exponent']})")

MULTI-MODEL + CALIBRATION TUNING (Using YOUR SemanticSymptomEncoder)
Testing 6 models √ó 7 thresholds √ó 5 exponents

‚ñ∂ Testing: multi-qa-mpnet-base-dot-v1
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Cached embeddings mismatch, recomputing
[Encoder] Computing symptom embeddings (one-time)


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:42<00:00,  2.84s/it]


[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: multi-qa-mpnet-base-dot-v1
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loadi

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:43<00:00,  2.87s/it]


[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom emb

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:14<00:00,  1.02it/s]


[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-MiniLM-L12-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-MiniLM-L12-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-MiniLM-L12-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-MiniLM-L12-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-MiniLM-L12-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-MiniLM-L12-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-MiniLM-L12-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: all-MiniLM-L12-v2
[Encoder] Loaded cached symptom emb

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:41<00:00,  2.77s/it]


[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: para

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:07<00:00,  1.91it/s]


[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-MiniLM-L6-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-MiniLM-L6-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-MiniLM-L6-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-MiniLM-L6-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-MiniLM-L6-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-MiniLM-L6-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-MiniLM-L6-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: paraphrase-

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:21<00:00,  1.45s/it]


[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: sentence-transformers/msmarco-distilbert-cos-v5
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: sentence-transformers/msmarco-distilbert-cos-v5
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: sentence-transformers/msmarco-distilbert-cos-v5
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: sentence-transformers/msmarco-distilbert-cos-v5
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: sentence-transformers/msmarco-distilbert-cos-v5
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model: sentence-transformers/msmarco-distilbert-cos-v5
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
[Encoder] Loading model:

In [36]:
# =============================================================================
# CELL: Detailed Evaluation with Best Configuration
# =============================================================================
print("=" * 70)
print(f"DETAILED EVALUATION: {BEST['model']}")
print("=" * 70)

# Create encoder with BEST settings
encoder = SemanticSymptomEncoder(
    model_name=BEST['model'],
    device='cpu',
    threshold=BEST['threshold'],
    exponent=BEST['exponent']
)

# Large-scale evaluation on BASE data
print("\nüìä Evaluation on BASE Data (3000 samples):")

sample_df = df_base_filtered.sample(n=3000, random_state=42)
precision_5, recall_5, precision_10, recall_10, mrr = [], [], [], [], []

for idx, row in sample_df.iterrows():
    gt_symptoms = set()
    phrases = []
    
    for col in cols_base:
        sym = col.lower().replace('_', ' ')
        if col in row and row[col] > 0.5 and sym in encoder.symptom_to_idx and sym in paraphrases:
            gt_symptoms.add(sym)
            phrases.append(random.choice(paraphrases[sym]))
    
    if len(gt_symptoms) < 2:
        continue
    
    text = ". ".join(phrases)
    result = encoder.encode_symptoms(text)
    
    top_10 = encoder.get_top_symptoms(result['symptom_vector'], top_k=10, threshold=0.0)
    top_10_names = [s[0] for s in top_10]
    top_5_names = top_10_names[:5]
    
    hits_5 = len(set(top_5_names) & gt_symptoms)
    hits_10 = len(set(top_10_names) & gt_symptoms)
    
    precision_5.append(hits_5 / 5)
    precision_10.append(hits_10 / 10)
    recall_5.append(hits_5 / len(gt_symptoms))
    recall_10.append(hits_10 / len(gt_symptoms))
    
    for rank, sym in enumerate(top_10_names, 1):
        if sym in gt_symptoms:
            mrr.append(1.0 / rank)
            break
    else:
        mrr.append(0.0)

print(f"  Samples: {len(precision_5)}")
print(f"  Precision@5:  {np.mean(precision_5)*100:.2f}%")
print(f"  Precision@10: {np.mean(precision_10)*100:.2f}%")
print(f"  Recall@5:     {np.mean(recall_5)*100:.2f}%")
print(f"  Recall@10:    {np.mean(recall_10)*100:.2f}%")
print(f"  MRR:          {np.mean(mrr):.4f}")

# Qualitative examples
print("\n" + "=" * 70)
print("QUALITATIVE EXAMPLES")
print("=" * 70)

test_inputs = [
    "my head is killing me and I feel like throwing up",
    "I can't breathe properly and my chest feels tight",
    "burning when I pee and having to go frequently",
    "I've been feeling really sad and can't sleep at night",
    "my joints ache and I'm exhausted all the time",
]

for text in test_inputs:
    result = encoder.encode_symptoms(text)
    top = encoder.get_top_symptoms(result['symptom_vector'], top_k=5, threshold=0.0)
    print(f"\nüìù \"{text}\"")
    for sym, score in top:
        print(f"   ‚Ä¢ {sym}: {score:.3f}")

DETAILED EVALUATION: all-mpnet-base-v2
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms

üìä Evaluation on BASE Data (3000 samples):
  Samples: 1846
  Precision@5:  34.08%
  Precision@10: 22.36%
  Recall@5:     60.96%
  Recall@10:    78.12%
  MRR:          0.7128

QUALITATIVE EXAMPLES

üìù "my head is killing me and I feel like throwing up"
   ‚Ä¢ vomiting: 0.197
   ‚Ä¢ nausea and vomiting: 0.169
   ‚Ä¢ nausea: 0.168
   ‚Ä¢ vomiting blood: 0.135
   ‚Ä¢ ache all over: 0.124

üìù "I can't breathe properly and my chest feels tight"
   ‚Ä¢ hurts to breath: 0.272
   ‚Ä¢ difficulty breathing: 0.255
   ‚Ä¢ breathing fast: 0.242
   ‚Ä¢ chest pain: 0.226
   ‚Ä¢ congestion in chest: 0.221

üìù "burning when I pee and having to go frequently"
   ‚Ä¢ painful urination: 0.277
   ‚Ä¢ symptoms of bladder: 0.265
   ‚Ä¢ frequent urination: 0.236
   ‚Ä¢ excessive urination at night: 0.220
   ‚Ä¢ symptoms of the kidneys: 0.1

---
# Part 10: End-to-End Pipeline Evaluation
 
Tests the full pipeline: **Symptom text ‚Üí Encoder ‚Üí Hierarchical Classifier ‚Üí Predictions**
 
Compares how using the encoder (vs pre-computed binary features) affects classification accuracy.

In [37]:
# =============================================================================
# CELL: Full Pipeline (Encoder ‚Üí Hierarchical Classifier)
# =============================================================================
print("=" * 70)
print("END-TO-END PIPELINE EVALUATION")
print("=" * 70)

sample_df = df_base_filtered.sample(n=2000, random_state=42)
y_true, y_proba_all = [], []

for idx, row in sample_df.iterrows():
    active_cols = [col for col in cols_base if col in row and row[col] > 0.5]
    phrases = []
    
    for col in active_cols:
        sym = col.lower().replace('_', ' ')
        if sym in paraphrases and paraphrases[sym]:
            phrases.append(random.choice(paraphrases[sym]))
    
    if len(phrases) < 2:
        continue
    
    text = ". ".join(phrases)
    result = encoder.encode_symptoms(text)
    
    # Build classifier features
    features = np.zeros(len(cols_base))
    for i, col in enumerate(cols_base):
        sym = col.lower().replace('_', ' ')
        features[i] = encoder.get_symptom_score(result['symptom_vector'], sym)
    
    X_row = pd.DataFrame([features], columns=cols_base)
    proba = predict_hierarchical(X_row, cat_clf_base, specialist_err, all_diseases_base)
    
    y_true.append(row['diseases'])
    y_proba_all.append(proba[0])

y_proba_all = np.array(y_proba_all)
disease_to_idx = {d: i for i, d in enumerate(all_diseases_base)}
y_true_idx = np.array([disease_to_idx.get(d, -1) for d in y_true])
valid = y_true_idx != -1
y_pred_idx = np.argmax(y_proba_all[valid], axis=1)

pipeline_top1 = accuracy_score(y_true_idx[valid], y_pred_idx)
pipeline_top5 = top_k_accuracy_score(y_true_idx[valid], y_proba_all[valid], k=5, labels=np.arange(len(all_diseases_base)))

print(f"\n  Samples: {valid.sum()}")
print(f"  Pipeline Top-1: {pipeline_top1*100:.2f}%")
print(f"  Pipeline Top-5: {pipeline_top5*100:.2f}%")

print(f"\nüìä Comparison to Binary Features:")
print(f"  Binary Top-1: {results_base['Top-1']*100:.2f}%")
print(f"  Pipeline Top-1: {pipeline_top1*100:.2f}%")
degradation = (results_base['Top-1'] - pipeline_top1) * 100
print(f"  Degradation: {degradation:.2f}%")

print("\n" + "=" * 70)
print("FINAL RECOMMENDATION")
print("=" * 70)
print(f"  Model:     {BEST['model']}")
print(f"  Threshold: {BEST['threshold']}")
print(f"  Exponent:  {BEST['exponent']}")

END-TO-END PIPELINE EVALUATION

  Samples: 1248
  Pipeline Top-1: 0.08%
  Pipeline Top-5: 0.56%

üìä Comparison to Binary Features:
  Binary Top-1: 81.87%
  Pipeline Top-1: 0.08%
  Degradation: 81.79%

FINAL RECOMMENDATION
  Model:     all-mpnet-base-v2
  Threshold: 0.15
  Exponent:  1.0


In [38]:
def extract_symptom_columns(df, non_feature_cols):
    return [
        c for c in df.columns
        if c not in non_feature_cols
    ]


cols_base = extract_symptom_columns(df_base, non_feature_cols)
cols_augmented = extract_symptom_columns(df_augmented, non_feature_cols)
cols_demo = extract_symptom_columns(df_demo, non_feature_cols)

print("Base symptoms:", len(cols_base))
print("Augmented symptoms:", len(cols_augmented))
print("Demo (demographic) features:", len(cols_demo))

Base symptoms: 375
Augmented symptoms: 456
Demo (demographic) features: 456


In [39]:
def overlap(a, b):
    return len(set(a) & set(b))

print("Base ‚à© Augmented:", overlap(cols_base, cols_augmented))
print("Base ‚à© Demo:", overlap(cols_base, cols_demo))
print("Augmented ‚à© Demo:", overlap(cols_augmented, cols_demo))

Base ‚à© Augmented: 375
Base ‚à© Demo: 375
Augmented ‚à© Demo: 456


In [40]:
# -----------------------------
# Ground truth matrices
# -----------------------------

X_base_true = (
    df_base[cols_base]
    .fillna(0)
    .astype(float)
    .values
)

X_augmented_true = (
    df_augmented[cols_augmented]
    .fillna(0)
    .astype(float)
    .values
)

X_demo = (
    df_demo[cols_demo]
    .fillna(0)
    .astype(float)
    .values
)

print("X_base_true:", X_base_true.shape)
print("X_augmented_true:", X_augmented_true.shape)
print("X_demo:", X_demo.shape)


X_base_true: (206267, 375)
X_augmented_true: (207518, 456)
X_demo: (207518, 456)


In [41]:
encoder_base = SemanticSymptomEncoder(
    model_name=BEST["model"],   # best you found
    threshold=BEST["threshold"],
    exponent=BEST["exponent"],
    symptom_list=cols_base,           # üî• dataset-derived vocab
)

assert len(encoder_base.symptoms) == len(cols_base)
print("Encoder initialized with base vocab")


[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Cached embeddings mismatch, recomputing
[Encoder] Computing symptom embeddings (one-time)


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:36<00:00,  3.02s/it]

[Encoder] Initialized with 375 symptoms
Encoder initialized with base vocab





In [None]:
def build_col_to_symptom_map(symptom_cols, encoder_symptoms):
    """
    Enforces 1:1 alignment between dataset columns and encoder vocab.
    """
    enc_set = set(encoder_symptoms)
    mapping = {}

    for col in symptom_cols:
        norm = col.lower().replace("_", " ")
        if norm in enc_set:
            mapping[col] = norm
        else:
            # fail fast ‚Äì do NOT silently ignore
            raise ValueError(f"Unmapped symptom column: {col}")

    print(f"‚úÖ Mapped {len(mapping)} symptoms")
    return mapping


col_to_symptom = build_col_to_symptom_map(
    cols_base,
    encoder.symptoms
)

def symptoms_to_text(active_symptoms):
    """
    Deterministic, controlled text generation.
    """
    return ". ".join(active_symptoms)

def evaluate_user_in_loop(
    encoder,
    X_true,                # binary symptom matrix
    y_true,                # disease labels
    symptom_cols,
    col_to_symptom,
    predict_fn,            # predict_hierarchical
    cat_clf,
    specialist_models,
    all_diseases,
    top_k=20,
    max_samples=3000
):
    idxs = np.random.choice(
        len(X_true),
        size=min(max_samples, len(X_true)),
        replace=False
    )

    y_true_idx = []
    y_pred_proba = []

    disease_to_idx = {d: i for i, d in enumerate(all_diseases)}

    for i in idxs:
        # 1. Ground truth symptoms
        active_cols = [
            col for j, col in enumerate(symptom_cols)
            if X_true[i, j] == 1.0
        ]

        if len(active_cols) < 2:
            continue

        active_syms = [col_to_symptom[c] for c in active_cols]

        # 2. Generate text
        text = symptoms_to_text(active_syms)

        # 3. Encode & rank
        evidence = encoder.encode_symptoms(text)["symptom_vector"]
        top_idx = np.argsort(evidence)[::-1][:top_k]
        top_syms = {encoder.symptoms[j] for j in top_idx}

        # 4. Simulate user confirmation
        X_sim = np.zeros(len(symptom_cols))
        for j, col in enumerate(symptom_cols):
            sym = col_to_symptom[col]
            if sym in top_syms and sym in active_syms:
                X_sim[j] = 1.0

        # 5. Predict
        proba = predict_fn(
            X_sim.reshape(1, -1),
            cat_clf,
            specialist_models,
            all_diseases
        )[0]

        y_pred_proba.append(proba)
        y_true_idx.append(disease_to_idx[y_true[i]])

    y_pred_proba = np.array(y_pred_proba)
    y_true_idx = np.array(y_true_idx)

    top1 = accuracy_score(y_true_idx, np.argmax(y_pred_proba, axis=1))
    top5 = top_k_accuracy_score(
        y_true_idx,
        y_pred_proba,
        k=5,
        labels=np.arange(len(all_diseases))
    )

    return top1, top5, len(y_true_idx)


In [None]:
print("=" * 70)
print("FINAL PIPELINE EVALUATION (USER-IN-THE-LOOP)")
print("=" * 70)

top1, top5, n = evaluate_user_in_loop(
    encoder=encoder,
    X_true=X_base_true,
    y_true=df_base_filtered["diseases"].values,
    symptom_cols=cols_base,
    col_to_symptom=col_to_symptom,
    predict_fn=predict_hierarchical,
    cat_clf=cat_clf_base,
    specialist_models=specialist_err,
    all_diseases=all_diseases_base,
    top_k=20,
    max_samples=2500
)


In [44]:
print("-" * 50)
print(f"Samples Evaluated: {n}")
print(f"Pipeline Top-1 Accuracy: {top1*100:.2f}%")
print(f"Pipeline Top-5 Accuracy: {top5*100:.2f}%")

baseline = results_base["Top-1"]
degradation = (baseline - top1) * 100

print(f"\nGold Standard Top-1: {baseline*100:.2f}%")
print(f"Degradation: {degradation:.2f}%")

if degradation < 5:
    print("‚úÖ EXCELLENT")
elif degradation < 15:
    print("‚ö†Ô∏è ACCEPTABLE")
else:
    print("‚ùå NEEDS IMPROVEMENT")


--------------------------------------------------
Samples Evaluated: 2499
Pipeline Top-1 Accuracy: 74.15%
Pipeline Top-5 Accuracy: 86.19%

Gold Standard Top-1: 81.87%
Degradation: 7.72%
‚ö†Ô∏è ACCEPTABLE


In [46]:
col_to_symptom_demo = build_col_to_symptom_map(
    cols_demo,
    encoder.symptoms
)


‚úÖ Mapped 456 symptoms


In [48]:
# Train specialist models for demographics evaluation
cat_clf_demo_err, specialist_demo = train_hierarchical_model(
    X_train_demo, y_train_demo, df_train_demo
)

  Training Category Classifier...
Training SymptomCategoryClassifier with shape (186742, 456)
  Training 17 Specialist Models (parallel, n_jobs=3)...


In [52]:
top1_demo, top5_demo, n_demo = evaluate_user_in_loop(
    encoder=encoder,
    X_true=df_demo_filtered[cols_demo].fillna(0).values,
    y_true=df_demo_filtered["diseases"].values,
    symptom_cols=cols_demo,
    col_to_symptom=col_to_symptom_demo,
    predict_fn=predict_hierarchical,
    cat_clf=cat_clf_demo,
    specialist_models=specialist_demo,
    all_diseases=all_diseases_demo,
    top_k=20,
    max_samples=2500
)

In [53]:
print("-" * 50)
print(f"Samples Evaluated: {n}")
print(f"Pipeline Top-1 Accuracy: {top1_demo*100:.2f}%")
print(f"Pipeline Top-5 Accuracy: {top5_demo*100:.2f}%")

baseline = results_base["Top-1"]
degradation = (baseline - top1_demo) * 100

print(f"\nGold Standard Top-1: {baseline*100:.2f}%")
print(f"Degradation: {degradation:.2f}%")

if degradation < 5:
    print("‚úÖ EXCELLENT")
elif degradation < 15:
    print("‚ö†Ô∏è ACCEPTABLE")
else:
    print("‚ùå NEEDS IMPROVEMENT")

--------------------------------------------------
Samples Evaluated: 2499
Pipeline Top-1 Accuracy: 83.81%
Pipeline Top-5 Accuracy: 97.64%

Gold Standard Top-1: 81.87%
Degradation: -1.95%
‚úÖ EXCELLENT


In [54]:
# === Save ALL Encoder & Pipeline Results ===
results_path = project_root / 'notebooks' / 'figures' / 'rigorous_eval_results.json'

# Load existing CV results
with open(results_path, 'r') as f:
    existing = json.load(f)

# Add encoder comparison + full pipeline results
existing['encoder_comparison'] = {
    'all_results': all_results,           # 6 models √ó grid search
    'best_config': BEST,                  # Best model/threshold/exponent
}

existing['pipeline_user_in_loop'] = {
    'base_dataset': {
        'top1': top1,
        'top5': top5,
        'samples': n,
        'gold_standard_top1': results_base['Top-1'],
        'degradation': (results_base['Top-1'] - top1) * 100
    },
    'augmented + demographics': {
        'top1': top1_demo,
        'top5': top5_demo,
        'samples': n_demo,
        'gold_standard_top1': results_base['Top-1'],
        'degradation': (results_base['Top-1'] - top1_demo) * 100
    }
}

# Save
with open(results_path, 'w') as f:
    json.dump(existing, f, indent=2, default=float)

print(f"‚úÖ All results saved to: {results_path}")
print(f"   - CV results (base, augmented, demographics)")
print(f"   - Encoder comparison ({len(all_results)} configs tested)")
print(f"   - Pipeline evaluation (user-in-the-loop)")

‚úÖ All results saved to: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis\notebooks\figures\rigorous_eval_results.json
   - CV results (base, augmented, demographics)
   - Encoder comparison (6 configs tested)
   - Pipeline evaluation (user-in-the-loop)


In [11]:
# Pre-populate with completed results
quick_results = [
    {'threshold': 0.10, 'exponent': 0.5, 'top1': 0.823},
    {'threshold': 0.10, 'exponent': 1.0, 'top1': 0.838},
    {'threshold': 0.10, 'exponent': 1.5, 'top1': 0.816},
    {'threshold': 0.10, 'exponent': 2.0, 'top1': 0.846},
    {'threshold': 0.10, 'exponent': 2.5, 'top1': 0.850},
    {'threshold': 0.15, 'exponent': 0.5, 'top1': 0.864},
    {'threshold': 0.15, 'exponent': 1.0, 'top1': 0.820},
    {'threshold': 0.15, 'exponent': 1.5, 'top1': 0.844},
    {'threshold': 0.15, 'exponent': 2.0, 'top1': 0.815},
    {'threshold': 0.15, 'exponent': 2.5, 'top1': 0.856},
    {'threshold': 0.20, 'exponent': 0.5, 'top1': 0.850},
    {'threshold': 0.20, 'exponent': 1.0, 'top1': 0.844},
    {'threshold': 0.20, 'exponent': 1.5, 'top1': 0.827},
    {'threshold': 0.20, 'exponent': 2.0, 'top1': 0.789},
    {'threshold': 0.20, 'exponent': 2.5, 'top1': 0.832},
    {'threshold': 0.25, 'exponent': 0.5, 'top1': 0.823},
    {'threshold': 0.25, 'exponent': 1.0, 'top1': 0.829},
    {'threshold': 0.25, 'exponent': 1.5, 'top1': 0.836},
    {'threshold': 0.25, 'exponent': 2.0, 'top1': 0.808},
    {'threshold': 0.25, 'exponent': 2.5, 'top1': 0.802},
    {'threshold': 0.30, 'exponent': 0.5, 'top1': 0.824},
    {'threshold': 0.30, 'exponent': 1.0, 'top1': 0.842},
    {'threshold': 0.30, 'exponent': 1.5, 'top1': 0.852},
    {'threshold': 0.30, 'exponent': 2.0, 'top1': 0.833},
    {'threshold': 0.30, 'exponent': 2.5, 'top1': 0.815},
    {'threshold': 0.35, 'exponent': 0.5, 'top1': 0.838},
    {'threshold': 0.35, 'exponent': 1.0, 'top1': 0.837},
    {'threshold': 0.35, 'exponent': 1.5, 'top1': 0.801},
    {'threshold': 0.35, 'exponent': 2.0, 'top1': 0.856},
    {'threshold': 0.35, 'exponent': 2.5, 'top1': 0.860},
    {'threshold': 0.40, 'exponent': 0.5, 'top1': 0.872},
    {'threshold': 0.40, 'exponent': 1.0, 'top1': 0.826},
    {'threshold': 0.40, 'exponent': 1.5, 'top1': 0.861},
    {'threshold': 0.40, 'exponent': 2.0, 'top1': 0.838},
    {'threshold': 0.40, 'exponent': 2.5, 'top1': 0.829},
    {'threshold': 0.45, 'exponent': 0.5, 'top1': 0.830},
    {'threshold': 0.45, 'exponent': 1.0, 'top1': 0.817},
    {'threshold': 0.45, 'exponent': 1.5, 'top1': 0.834},
    {'threshold': 0.45, 'exponent': 2.0, 'top1': 0.802},
    {'threshold': 0.45, 'exponent': 2.5, 'top1': 0.856},
    {'threshold': 0.50, 'exponent': 0.5, 'top1': 0.818},
    {'threshold': 0.50, 'exponent': 1.0, 'top1': 0.822},
]

# Mark what's already done
completed = {(r['threshold'], r['exponent']) for r in quick_results}

In [12]:
from models.architectures.semantic_symptom_encoder import SemanticSymptomEncoder

non_feature_cols = ['diseases', 'disease_category', 'symptoms', 'age', 'sex', 'age_normalized', 'sex_encoded']
# --- 2. Prepare features ---
feature_cols_demo = [c for c in df_demo_filtered.columns if c not in non_feature_cols]
cols_demo = feature_cols_demo
X_demo_values = df_demo_filtered[cols_demo].fillna(0).values
y_demo_values = df_demo_filtered["diseases"].values
all_diseases_demo = np.unique(y_demo_values)
# --- 3. Build symptom mapping (needs an encoder first) ---
temp_encoder = SemanticSymptomEncoder(model_name="all-mpnet-base-v2", symptom_list=cols_demo)
col_to_symptom_demo = {col: col.lower().replace("_", " ") for col in cols_demo}
# --- 4. Train models (this is the slow part, ~2-3 min) ---
print("Training hierarchical models...")
from sklearn.model_selection import train_test_split
indices_demo = np.arange(len(df_demo_filtered))
train_idx_demo, test_idx_demo = train_test_split(
    indices_demo, test_size=0.1, random_state=42, stratify=y_demo_values
)
X_train_demo = df_demo_filtered[cols_demo].iloc[train_idx_demo]
y_train_demo = y_demo_values[train_idx_demo]
df_train_demo = df_demo_filtered.iloc[train_idx_demo]
cat_clf_demo, specialist_demo = train_hierarchical_model(X_train_demo, y_train_demo, df_train_demo)
print("Models trained!")
# --- 5. Run the sweep ---
BEST_MODEL = "all-mpnet-base-v2"
THRESHOLDS = [0.35, 0.40, 0.45]  # Focus around best region
EXPONENTS = [0.5, 1.0]

for thresh in THRESHOLDS:
    for exp in EXPONENTS:
        if (thresh, exp) in completed:
            continue
        
        encoder = SemanticSymptomEncoder(
            model_name=BEST_MODEL,
            threshold=thresh,
            exponent=exp,
            symptom_list=cols_demo
        )
        
        top1, top5, n = evaluate_user_in_loop(
            encoder=encoder,
            X_true=X_demo_values,
            y_true=y_demo_values,
            symptom_cols=cols_demo,
            col_to_symptom=col_to_symptom_demo,
            predict_fn=predict_hierarchical,
            cat_clf=cat_clf_demo,
            specialist_models=specialist_demo,
            all_diseases=all_diseases_demo,
            top_k=20,
            max_samples=2500  # Full sample for reliable numbers
        )
        
        quick_results.append({
            'threshold': thresh,
            'exponent': exp,
            'top1': top1,
            'top5': top5
        })
        print(f"thresh={thresh}, exp={exp}: Top-1={top1*100:.1f}%, Top-5={top5*100:.1f}%")
best = max(quick_results, key=lambda x: x['top1'])
print(f"\nüèÜ Best: threshold={best['threshold']}, exponent={best['exponent']}, Top-1={best['top1']*100:.1f}%")


[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms
Training hierarchical models...
  Training Category Classifier...
Training SymptomCategoryClassifier with shape (186742, 456)
  Training 17 Specialist Models (parallel, n_jobs=3)...
Models trained!

üèÜ Best: threshold=0.4, exponent=0.5, Top-1=87.2%


In [None]:
# ============================================
# RUN BEST CONFIG WITH FULL SAMPLES & SAVE
# ============================================

# Get best from sweep
BEST_THRESH = best['threshold']
BEST_EXP = best['exponent']

print(f"Running full evaluation for best config: threshold={BEST_THRESH}, exponent={BEST_EXP}")

encoder_best = SemanticSymptomEncoder(
    model_name="all-mpnet-base-v2",
    threshold=BEST_THRESH,
    exponent=BEST_EXP,
    symptom_list=cols_demo
)

top1_final, top5_final, n_final = evaluate_user_in_loop(
    encoder=encoder_best,
    X_true=X_demo_values,
    y_true=y_demo_values,
    symptom_cols=cols_demo,
    col_to_symptom=col_to_symptom_demo,
    predict_fn=predict_hierarchical,
    cat_clf=cat_clf_demo,
    specialist_models=specialist_demo,
    all_diseases=all_diseases_demo,
    top_k=20,
    max_samples=2500  # Full sample
)

print(f"\nüèÜ BEST CONFIG RESULTS")
print(f"   Threshold: {BEST_THRESH}")
print(f"   Exponent:  {BEST_EXP}")
print(f"   Samples:   {n_final}")
print(f"   Top-1:     {top1_final*100:.2f}%")
print(f"   Top-5:     {top5_final*100:.2f}%")


Running full evaluation for best config: threshold=0.4, exponent=0.5
[Encoder] Loading model: all-mpnet-base-v2
[Encoder] Loaded cached symptom embeddings
[Encoder] Initialized with 456 symptoms

üèÜ BEST CONFIG RESULTS
   Threshold: 0.4
   Exponent:  0.5
   Samples:   2496
   Top-1:     84.01%
   Top-5:     97.64%

‚úÖ Results saved to: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis\notebooks\figures\rigorous_eval_results.json


In [18]:
results_path = project_root / 'notebooks' / 'figures' / 'rigorous_eval_results.json'

with open(results_path, 'r') as f:
    existing = json.load(f)

existing['pipeline_user_in_loop']['optimized_demo'] = {
    'threshold': BEST_THRESH,
    'exponent': BEST_EXP,
    'top1': top1_final,
    'top5': top5_final,
    'samples': n_final,
    "gold_standard_top1": 0.8186536036472984,
    'degradation': (0.8186536036472984 - top1_final) * 100,
    'model': 'all-mpnet-base-v2'
}

existing['threshold_sweep'] = quick_results

with open(results_path, 'w') as f:
    json.dump(existing, f, indent=2, default=float)

print(f"\n‚úÖ Results saved to: {results_path}")


‚úÖ Results saved to: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis\notebooks\figures\rigorous_eval_results.json
