In [None]:
# ===========================================================================
# VISUALIZATIONS FOR ASTHMA RISK CLASSIFICATION MODEL
# ===========================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
import warnings

# Suppress every warning category for the rest of the session, then apply
# seaborn's "white" theme and a 100-dpi default for on-screen figures.
warnings.filterwarnings('ignore')
sns.set_style("white")
plt.rcParams.update({'figure.dpi': 100})

# ============================================================================
# LOAD DATA
# ============================================================================

# Per-model summary metrics (one row per model; drives the comparison chart)
comparison = pd.read_csv('model_comparison_final.csv')

# Row-level test-set output: true label, sample weight and model probabilities
results_df = pd.read_csv('all_model_predictions.csv')
y_test, test_weights = (
    results_df[column].values for column in ('y_true', 'test_weight')
)

# Load a feature-importance table, preferring XGBoost and falling back to the
# other models in a fixed order. The first file that loads wins and determines
# best_model_name.
importance_sources = {
    'xgb': 'XGBoost',
    'rf': 'Random Forest',
    'et': 'ExtraTrees',
    'lr': 'Logistic Regression',
}

# Bind both names up front so a total failure leaves an explicit None instead
# of an unbound (or stale, from a previous run) variable.
feature_imp = None
best_model_name = None

for model_prefix, model_label in importance_sources.items():
    try:
        feature_imp = pd.read_csv(f'{model_prefix}_feature_importance.csv')
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers pandas'
        # EmptyDataError / ParserError and text-decoding failures.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        continue
    best_model_name = model_label
    break
else:
    # Best-effort load: report the problem but keep the notebook running.
    print('No *_feature_importance.csv file could be loaded; '
          'feature_imp and best_model_name are set to None.')

# ============================================================================
# 1. MODEL COMPARISON BAR CHART
# ============================================================================

fig, ax = plt.subplots(figsize=(15, 5))

# One bar per metric inside each model's group; colours pair up with metrics
metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
colors = ['#f0555e', '#41b8d5', '#ebac45', '#92b382']
x = np.arange(len(comparison))
width = 0.2

for slot, (metric, shade) in enumerate(zip(metrics, colors)):
    # Slots map to -1.5 ... +1.5 bar widths, centring the four bars on the tick
    shift = width * (slot - 1.5)
    bars = ax.bar(x + shift, comparison[metric], width,
                  label=metric, color=shade, alpha=0.8, edgecolor='none')

    # Write each score just above the top edge of its bar
    for patch in bars:
        top = patch.get_height()
        ax.text(patch.get_x() + patch.get_width()/2., top,
                f'{top:.3f}', ha='center', va='bottom', fontsize=12)

axis_font = dict(fontsize=14, fontweight='bold')
ax.set_xlabel('Model', **axis_font)
ax.set_ylabel('Score', **axis_font)
ax.set_xticks(x)
ax.set_xticklabels(comparison['Model'], rotation=0, ha='center', **axis_font)
ax.legend(loc='upper right', ncol=1, fontsize=12, frameon=True)
ax.set_ylim([0, 1.0])

# Keep only the left and bottom axis lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig('model_comparison_metrics.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# ============================================================================
# 2. ROC CURVES FOR ALL MODELS
# ============================================================================

fig, ax = plt.subplots(figsize=(5.5, 5))

# Display name -> prefix of the matching "<prefix>_proba" column in results_df
proba_columns = [
    ('Logistic Regression', 'lr'),
    ('XGBoost', 'xgb'),
    ('Random Forest', 'rf'),
    ('ExtraTrees', 'et'),
    ('Ensemble', 'ensemble'),
    ('KNN', 'knn'),
]
model_probas = {
    name: results_df[f'{prefix}_proba'].values for name, prefix in proba_columns
}

colors_roc = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c']

# One sample-weighted ROC curve per model; the AUC goes into the line's label
for color, (name, proba) in zip(colors_roc, model_probas.items()):
    fpr, tpr, _ = roc_curve(y_test, proba, sample_weight=test_weights)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, color=color, lw=2.5,
            label=f'{name} (AUC = {roc_auc:.3f})')

# Chance-level reference diagonal
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Baseline (AUC = 0.500)')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])

axis_font = dict(fontsize=14, fontweight='bold')
ax.set_xlabel('False Positive Rate', **axis_font)
ax.set_ylabel('True Positive Rate', **axis_font)
ax.set_title('ROC Curves: Model Comparison', **axis_font)
# The legend call is disabled, so the labels built above are not rendered:
#ax.legend(loc='lower right', ncol=1, fontsize=12, frameon=True)

# Keep only the left and bottom axis lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig('roc_curves_all_models.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# ============================================================================
# 3. RISK STRATIFICATION HORIZONTAL BAR CHART
# ============================================================================

risk_tiers = results_df['risk_tier'].values

# Calculate statistics for each tier (sample count, positive count, prevalence)
# NOTE(review): prevalence is an unweighted mean of y_test, whereas the ROC
# cell passes test_weights as sample weights - confirm this is intended.
tier_stats = []
for tier in ['Low', 'Moderate', 'High']:
    mask = risk_tiers == tier
    total_count = mask.sum()
    asthma_count = y_test[mask].sum()
    # The mean of an empty selection is NaN, and a NaN rate would later reach
    # ax.set_xlim; report 0% for a tier with no samples (its label shows n=0).
    asthma_rate = y_test[mask].mean() * 100 if total_count > 0 else 0.0
    tier_stats.append({
        'Risk Tier': tier,
        'Total Samples': total_count,
        'Asthma Cases': int(asthma_count),
        'Asthma Rate (%)': asthma_rate
    })

tier_df = pd.DataFrame(tier_stats)

# Create single plot for asthma prevalence
fig, ax = plt.subplots(figsize=(9, 3))

y_pos = np.arange(len(tier_df))
colors_risk = ['#2ecc71', '#f39c12', '#e74c3c']

bars = ax.barh(y_pos, tier_df['Asthma Rate (%)'], color=colors_risk, alpha=0.8, edgecolor='none')
ax.set_ylabel('Risk Tier', fontsize=14, fontweight='bold')
ax.set_xlabel('Asthma Prevalence (%)', fontsize=14, fontweight='bold')
ax.set_title('Asthma Prevalence by Risk Tier', fontsize=14, fontweight='bold')
ax.set_yticks(y_pos)
ax.set_yticklabels(tier_df['Risk Tier'], fontsize=14)
ax.tick_params(axis='x', labelsize=14)
# Put the first tier ('Low') at the top of the chart
ax.invert_yaxis()

# Leave 20% headroom to the right of the longest bar for its text label;
# fall back to a fixed upper limit when every rate is 0 so the limits differ.
max_rate = tier_df['Asthma Rate (%)'].max()
ax.set_xlim([0, max_rate * 1.2 if max_rate > 0 else 1.0])
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add value labels with sample count
for bar, rate, samples in zip(bars, tier_df['Asthma Rate (%)'], tier_df['Total Samples']):
    width = bar.get_width()
    ax.text(width, bar.get_y() + bar.get_height()/2.,
             f' {rate:.1f}% (n={samples})', ha='left', va='center', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('risk_stratification_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Calculate risk ratio (High-tier prevalence relative to Low-tier prevalence);
# infinite when the Low tier has no cases. The value is computed but not
# displayed by this cell.
low_rate = tier_df[tier_df['Risk Tier'] == 'Low']['Asthma Rate (%)'].values[0]
high_rate = tier_df[tier_df['Risk Tier'] == 'High']['Asthma Rate (%)'].values[0]
risk_ratio = high_rate / low_rate if low_rate > 0 else float('inf')


In [None]:
# ============================================================================
# 4. TREEMAP FOR TOP FEATURES WITH TEXT WRAPPING
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import textwrap
import warnings

warnings.filterwarnings('ignore')

# ============================================================================
# LOAD FEATURE IMPORTANCE DATA
# ============================================================================

# Pick the model with the highest 'Composite' score in the comparison table
comparison = pd.read_csv('model_comparison_final.csv')
best_model_row = comparison.loc[comparison['Composite'].idxmax()]
best_model_name = best_model_row['Model']

# Map model names to the prefix of their "<prefix>_feature_importance.csv" file
model_to_prefix = {
    'Logistic Regression': 'lr',
    'XGBoost': 'xgb',
    'Random Forest': 'rf',
    'ExtraTrees': 'et'
}

# Models without an entry above fall back to XGBoost's importances.
# Rebind best_model_name in that case so that anything labelled with it
# (the treemap title) names the model whose importances are actually shown
# rather than the top-scoring model. best_model_row still holds the original
# top-scoring row.
if best_model_name not in model_to_prefix:
    best_model_name = 'XGBoost'

# Load feature importance for the selected model
best_prefix = model_to_prefix[best_model_name]
feature_imp = pd.read_csv(f'{best_prefix}_feature_importance.csv')

# ============================================================================
# PREPARE DATA
# ============================================================================

# Get top N features
top_n = 5

# Choose the column that carries the importance score: 'importance' when
# present, else 'abs_coefficient', else whatever the second column is.
if 'importance' in feature_imp.columns:
    importance_col = 'importance'
elif 'abs_coefficient' in feature_imp.columns:
    importance_col = 'abs_coefficient'
else:
    importance_col = feature_imp.columns[1]

# Always rank by the chosen column so the fallback case also yields the
# N largest scores instead of simply the first N rows of the file.
top_features = feature_imp.nlargest(top_n, importance_col)

importances = top_features[importance_col].values
# Each feature's share of the total importance expressed in degrees (sums to
# 360). Not referenced by the treemap code in this cell; kept so the name
# stays available to any code that relies on it.
angles = (importances / importances.sum()) * 360
features = top_features['feature'].values

# ============================================================================
# CREATE TREEMAP
# ============================================================================

fig, ax = plt.subplots(figsize=(7, 10))

# Fill colours, handed out to the rectangles in descending-importance order
colors = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6']

# Reorder features from most to least important
sorted_indices = np.flip(np.argsort(importances))
features_sorted, importances_sorted = (
    features[sorted_indices], importances[sorted_indices]
)

# Express each importance as a fraction of the total drawing area
total_area = 1.0
areas = importances_sorted / importances_sorted.sum() * total_area

# Simple treemap layout: peel one strip per item off the remaining region
def squarify(sizes, x, y, width, height):
    """Lay out ``sizes`` as nested strips inside a rectangle.

    Items are placed in the order given. Each item except the last takes a
    strip from the region that is still free: a full-height strip on the left
    when the region is at least as wide as it is tall, otherwise a full-width
    strip at the low-y edge. The strip's extent is the item's share of the
    sizes not yet placed. The last item receives whatever region remains.
    Despite the name, no aspect-ratio optimisation is performed.

    Args:
        sizes: Sequence of numeric sizes (supports ``len`` and slicing).
        x, y: Origin of the region to fill.
        width, height: Extent of the region to fill.

    Returns:
        A list of ``(x, y, width, height)`` tuples, one per entry of
        ``sizes`` and in the same order; empty when ``sizes`` is empty.

    Raises:
        ZeroDivisionError: If the not-yet-placed sizes sum to zero while more
            than one item remains (plain Python numbers).
    """
    placed = []
    count = len(sizes)

    for position in range(count):
        if position == count - 1:
            # Last item: it simply fills the remaining region
            placed.append((x, y, width, height))
            break

        # Share of the current item among everything still to be placed
        share = sizes[position] / sum(sizes[position:])

        if width >= height:
            # Wide region: cut a vertical strip and continue to its right
            strip = width * share
            placed.append((x, y, strip, height))
            x = x + strip
            width = width - strip
        else:
            # Tall region: cut a horizontal strip and continue beyond it in y
            strip = height * share
            placed.append((x, y, width, strip))
            y = y + strip
            height = height - strip

    return placed

# Get rectangle positions inside the unit square, one per sorted feature
rectangles = squarify(areas.tolist(), 0, 0, 1, 1)

# Draw rectangles
for i, (rect, feature, importance) in enumerate(zip(rectangles, features_sorted, importances_sorted)):
    x, y, width, height = rect

    # Cycle through the palette so that features beyond len(colors) are still
    # drawn instead of being silently dropped by zip()
    color = colors[i % len(colors)]

    # Draw rectangle
    rectangle = mpatches.Rectangle((x, y), width, height,
                                   facecolor=color,
                                   edgecolor='white',
                                   linewidth=4,
                                   alpha=0.85)
    ax.add_patch(rectangle)

    # Text is anchored at the rectangle's centre
    text_x = x + width / 2
    text_y = y + height / 2

    # Adjust font size based on rectangle size (capped at 14). The lower
    # bound of 1 matters: for a side shorter than 0.0125 the scaled size
    # truncates to 0, which would divide by zero in the wrap-width estimate.
    font_size = max(1, min(14, int(min(width, height) * 80)))

    # Approximate how many characters fit across the rectangle, assuming each
    # character is about 0.6 * font_size wide; never wrap below 11 characters
    chars_per_line = max(11, int(width * 100 / (font_size * 0.6)))

    # Wrap the feature name
    wrapped_text = textwrap.fill(feature, width=chars_per_line)

    # Add feature name (wrapped), slightly above the centre
    ax.text(text_x, text_y + height * 0.08, wrapped_text,
            ha='center', va='center',
            fontsize=font_size, fontweight='bold',
            color='white')

    # Add importance value, slightly below the centre and two points smaller
    ax.text(text_x, text_y - height * 0.08, f'{importance:.4f}',
            ha='center', va='center',
            fontsize=font_size - 2,
            color='white',
            fontweight='bold')

# Set limits and styling
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_aspect('equal')
ax.axis('off')

plt.title(f'Top {top_n} Feature Importance - {best_model_name}', 
          fontsize=18, fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig('treemap_top_features.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()


In [None]:
# ============================================================================
# ASTHMA ONSET PREDICTION VISUALIZATIONS
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import warnings

warnings.filterwarnings('ignore')

# Set style for clean plots: white backgrounds on screen and in saved files
plt.rcParams.update({
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'savefig.facecolor': 'white',
})
sns.set_style("white")

# ============================================================================
# LOAD DATA
# ============================================================================

# Load CV + Test results
# NOTE: results_df, y_test and test_weights (and colors further down) rebind
# names that the earlier classification cells also assign; re-run the earlier
# load cell before re-running any of those plots.
results_df = pd.read_csv('onset_detailed_metrics_with_cv.csv')
predictions_df = pd.read_csv('onset_predictions.csv')
cv_folds_df = pd.read_csv('onset_cv_folds.csv')

y_test = predictions_df['y_true'].values
test_weights = predictions_df['test_weight'].values

# Model name mapping (abbreviation -> display name)
model_name_mapping = {
    'LR': 'Logistic Regression',
    'XGB': 'XGBoost',
    'RF': 'Random Forest',
    'ET': 'Extra Trees'
}

# Expand abbreviations row by row and leave every other label untouched.
# Deciding from the first row alone and mapping with the dict would turn any
# label missing from the mapping into NaN, and would fail on an empty table.
results_df['Model'] = results_df['Model'].map(
    lambda label: model_name_mapping.get(label, label)
)

# Create models dict with full names -> predicted probabilities
models = {
    'Logistic Regression': predictions_df['lr_proba'].values,
    'XGBoost': predictions_df['xgb_proba'].values,
    'Random Forest': predictions_df['rf_proba'].values,
    'Extra Trees': predictions_df['et_proba'].values
}

# Color palette - keyed by full model name, in the same order as `models`
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
model_colors = dict(zip(['Logistic Regression', 'XGBoost', 'Random Forest', 'Extra Trees'], colors))


In [None]:
# ============================================================================
# PLOT 1: AUC-ROC CURVES
# ============================================================================

fig, ax = plt.subplots(figsize=(5.5, 5))

# One sample-weighted test ROC curve per model
for name, proba in models.items():
    fpr, tpr, _ = roc_curve(y_test, proba, sample_weight=test_weights)
    roc_auc = auc(fpr, tpr)

    # Cross-validated AUC (mean and std) of the same model, shown in the
    # legend next to the test AUC; filter the results table only once
    cv_row = results_df.loc[results_df['Model'] == name]
    cv_auc = cv_row['CV_AUC_Mean'].values[0]
    cv_std = cv_row['CV_AUC_Std'].values[0]

    # \u00b1 is the plus-minus sign; written as an escape so that it cannot be
    # corrupted into a two-character sequence by a file-encoding mix-up
    ax.plot(fpr, tpr, linewidth=2.5,
            label=f'{name} (Test={roc_auc:.3f}, CV={cv_auc:.3f}\u00b1{cv_std:.3f})',
            color=model_colors[name])

# Baseline
ax.plot([0, 1], [0, 1], 'k--', linewidth=1.5, alpha=0.5, label='Random Baseline')

ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
ax.set_title('ROC Curves - Asthma Onset Prediction', fontsize=14, fontweight='bold', pad=15)
ax.legend(loc='lower right', fontsize=9, frameon=True, fancybox=True, shadow=True)
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])

# Remove top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()
# NOTE(review): facecolor='none' saves this figure with a transparent
# background, while the other onset figures are saved with 'white' - confirm
# the difference is intended.
plt.savefig('onset_roc_curves.png', dpi=300, bbox_inches='tight', facecolor='none')
plt.show()



In [None]:
# ============================================================================
# PLOT 2: ALL METRICS IN SINGLE HORIZONTAL BAR CHART
# ============================================================================

fig, ax = plt.subplots(figsize=(9, 10))

models_list = results_df['Model'].tolist()

# Test-set metrics only, one array per metric (one value per model)
accuracy, precision, recall, f1 = (
    results_df[column].values
    for column in ('Test_Accuracy', 'Test_Precision_Adult',
                   'Test_Recall_Adult', 'Test_F1_Adult')
)

# Color scheme for metrics (same order as the series table below)
metric_colors = ['#f0555e', '#41b8d5', '#ebac45', '#92b382']

# Geometry of the grouped bars: one group per model, four bars per group
n_models = len(models_list)
n_metrics = 4
bar_height = 0.18
y_positions = np.arange(n_models)

# (values, legend label, offset from the group centre in bar heights)
metric_series = [
    (accuracy, 'Accuracy', -1.5),
    (precision, 'Precision (Adult)', -0.5),
    (recall, 'Recall (Adult)', 0.5),
    (f1, 'F1 Score (Adult)', 1.5),
]

# Draw one horizontal bar series per metric (default edges, fully opaque)
bars1, bars2, bars3, bars4 = [
    ax.barh(y_positions + slot * bar_height, scores, bar_height,
            label=legend_label, color=shade, alpha=1)
    for (scores, legend_label, slot), shade in zip(metric_series, metric_colors)
]

# Print every score just to the right of its bar, model by model
for row in range(n_models):
    for scores, _label, slot in metric_series:
        score = scores[row]
        ax.text(score + 0.01, row + slot * bar_height, f'{score:.3f}',
                va='center', fontsize=9, fontweight='bold')

ax.set_yticks(y_positions)
ax.set_yticklabels(models_list, fontweight='bold', fontsize=14)
ax.set_xlabel('Score', fontsize=14, fontweight='bold')
ax.set_ylabel('Model', fontsize=14, fontweight='bold')
ax.set_title('Model Performance Comparison - Test Set', fontsize=15, fontweight='bold', pad=15)
ax.set_xlim([0, 1])
# The legend call is disabled, so the bar colours are not keyed on the figure:
#ax.legend(fontsize=12, frameon=True, fancybox=True, loc='upper right', ncol = 4)

# Hide the top and right spines; the x and y axis lines stay visible
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig('onset_all_metrics.png', dpi=600, bbox_inches='tight', facecolor='white')
plt.show()


In [None]:
# ============================================================================
# PLOT 3: PROBABILITY DISTRIBUTION
# ============================================================================

# Select the model with the highest Test AUC and fetch its probabilities
best_model_name = results_df.loc[results_df['Test_AUC'].idxmax(), 'Model']
best_proba = models[best_model_name]

fig, ax = plt.subplots(figsize=(11, 5))

# Split the predicted probabilities by true class (0 = childhood, 1 = adult)
childhood_proba = best_proba[y_test == 0]
adult_proba = best_proba[y_test == 1]

# Overlay one density-normalised histogram per true class
hist_style = dict(bins=30, alpha=0.7, edgecolor='black', linewidth=1.2, density=True)
for class_proba, class_label, class_color in (
    (childhood_proba, 'Childhood Onset (True)', '#3498db'),
    (adult_proba, 'Adult Onset (True)', '#e74c3c'),
):
    ax.hist(class_proba, label=class_label, color=class_color, **hist_style)

# Mark the 0.5 decision threshold
ax.axvline(x=0.5, color='black', linestyle='--', linewidth=2,
           label='Decision Threshold (0.5)', alpha=0.7)

axis_font = dict(fontsize=14, fontweight='bold')
ax.set_xlabel('Predicted Probability (Adult Onset)', **axis_font)
ax.set_ylabel('Density', **axis_font)
ax.set_title(f'Probability Distribution - {best_model_name} Model', pad=15, **axis_font)
ax.legend(fontsize=10, frameon=True, fancybox=True)

# Keep only the left and bottom axis lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig('onset_probability_distribution.png', dpi=600, bbox_inches='tight', facecolor='white')
plt.show()
