In [10]:
import os
# List every file available under the Kaggle input directory.
# os.walk yields a (directory, subdirectories, filenames) triple for each folder
# in the tree; the subdirectory list is not needed here, hence the `_` placeholder.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path so it can be copied directly into a read call.
        print(os.path.join(dirname, filename))

# Install compatible version combinations
!pip install scikit-learn==1.3.0 imbalanced-learn==0.11.0 --quiet

ModuleNotFoundError: No module named 'sklearn.utils._metadata_requests'

In [8]:
# ============================================================================
# Complete Clinical Trial Stroke Risk Prediction Pipeline
# Part 1: Synthetic Data Generation (CDISC Standards)
# Part 2: Enhanced ML Model with SMOTE + Cross-Validation
# ============================================================================

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, roc_auc_score, roc_curve, 
                            confusion_matrix, precision_recall_curve, f1_score,
                            recall_score, precision_score, fbeta_score)
from imblearn.over_sampling import SMOTE  # For handling class imbalance
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility.
# The data-generation functions below draw from NumPy's global RNG (np.random.*),
# so seeding it here fixes the synthetic cohort from run to run.
np.random.seed(42)
# Python's built-in `random` module is seeded as well; no code visible in this
# cell draws from it.
random.seed(42)

# Banner describing the stages this cell runs.
print("="*80)
print("CLINICAL TRIAL STROKE RISK PREDICTION - COMPLETE PIPELINE")
print("="*80)
print("Part 1: Synthetic Data Generation (CDISC Standards)")
print("Part 2: Enhanced ML Model (SMOTE + 5-Fold CV)")
print("="*80 + "\n")

# ============================================================================
# PART 1: SYNTHETIC DATA GENERATION
# ============================================================================

# 1. Generate Demographics (DM) Table
def generate_demographics(n_subjects=1000):
    """Create a synthetic demographics (CDISC DM-style) table.

    One row is produced per subject with columns SUBJID, AGE, SEX, RACE, ARM
    and RFSTDTC. All randomness comes from NumPy's global RNG, so seed it
    beforehand for repeatable output.

    Args:
        n_subjects: Number of subjects to simulate.

    Returns:
        pandas.DataFrame with ``n_subjects`` rows; AGE is an integer in [18, 95].
    """
    sex_levels = ['M', 'F']
    race_levels = ['WHITE', 'BLACK', 'ASIAN', 'HISPANIC', 'OTHER']
    arm_levels = ['TREATMENT', 'CONTROL']
    enrolment_open = datetime(2023, 1, 1)

    rows = []
    for seq in range(1, n_subjects + 1):
        # The draws below happen in a fixed order (age, sex, race, arm, start
        # offset) so that a given seed always yields the same cohort.
        raw_age = np.random.normal(65, 12)
        sex = np.random.choice(sex_levels, p=[0.48, 0.52])
        race = np.random.choice(race_levels,
                                p=[0.6, 0.15, 0.15, 0.08, 0.02])
        arm = np.random.choice(arm_levels, p=[0.5, 0.5])
        start_offset = np.random.randint(0, 365)

        rows.append({
            'SUBJID': f'SUBJ{seq:04d}',
            # Clamp the sampled age to the adult range used by the study.
            'AGE': max(18, min(95, raw_age)),
            'SEX': sex,
            'RACE': race,
            'ARM': arm,
            'RFSTDTC': enrolment_open + timedelta(days=start_offset),
        })

    df_dm = pd.DataFrame(rows)
    df_dm['AGE'] = df_dm['AGE'].round(0).astype(int)
    print(f"✓ Generated demographics data for {len(df_dm)} subjects")
    return df_dm

# 2. Generate Vital Signs (VS) Table
def generate_vital_signs(df_dm, visits_per_subject=6):
    """Create a long-format vital-signs (CDISC VS-style) table.

    For every subject in ``df_dm`` and every visit, five measurements are
    emitted in this order: SYSBP, DIABP, HR, TEMP, WEIGHT. Blood-pressure
    baselines depend on AGE and SEX and drift upward by 2% per visit.

    Args:
        df_dm: Demographics frame providing SUBJID, AGE, SEX and RFSTDTC.
        visits_per_subject: Number of visits per subject, spaced 30 days apart.

    Returns:
        pandas.DataFrame with columns SUBJID, VSTESTCD, VSTEST, VSORRES,
        VSORRESU, VSDTC, VISIT; VSORRES is rounded to one decimal.
    """
    # (test code, test name, unit) in the order records are written per visit.
    test_specs = (
        ('SYSBP', 'Systolic Blood Pressure', 'mmHg'),
        ('DIABP', 'Diastolic Blood Pressure', 'mmHg'),
        ('HR', 'Heart Rate', 'beats/min'),
        ('TEMP', 'Temperature', 'C'),
        ('WEIGHT', 'Weight', 'kg'),
    )

    collected = []
    for _, dm_row in df_dm.iterrows():
        subject_id = dm_row['SUBJID']
        years = dm_row['AGE']
        is_male = dm_row['SEX'] == 'M'

        # Subject-level baselines; the three random draws keep their original order.
        sbp0 = 120 + (years - 50) * 0.5 + (10 if is_male else 0)
        dbp0 = 80 + (years - 50) * 0.3 + (5 if is_male else 0)
        hr0 = 70 + np.random.normal(0, 5)
        temp0 = 36.5 + np.random.normal(0, 0.3)
        weight0 = 70 + (10 if is_male else -5) + np.random.normal(0, 8)

        for visit_no in range(1, visits_per_subject + 1):
            measured_on = dm_row['RFSTDTC'] + timedelta(days=visit_no * 30)
            drift = 1 + (visit_no - 1) * 0.02
            visit_label = f'VISIT_{visit_no}'

            # Tuple elements are evaluated left to right, matching test_specs.
            readings = (
                max(90, sbp0 * drift + np.random.normal(0, 8)),
                max(60, dbp0 * drift + np.random.normal(0, 5)),
                max(50, hr0 + np.random.normal(0, 8)),
                temp0 + np.random.normal(0, 0.2),
                max(40, weight0 + np.random.normal(0, 2)),
            )

            for (code, label, unit), reading in zip(test_specs, readings):
                collected.append({
                    'SUBJID': subject_id,
                    'VSTESTCD': code,
                    'VSTEST': label,
                    'VSORRES': reading,
                    'VSORRESU': unit,
                    'VSDTC': measured_on,
                    'VISIT': visit_label,
                })

    df_vs = pd.DataFrame(collected)
    df_vs['VSORRES'] = df_vs['VSORRES'].round(1)
    print(f"✓ Generated {len(df_vs)} vital signs records")
    return df_vs

# 3. Generate Laboratory (LB) Table
def generate_lab_data(df_dm, visits_per_subject=6):
    """Create a long-format laboratory (CDISC LB-style) table.

    For every subject in ``df_dm`` and every visit, six analytes are emitted in
    this order: GLUCOSE, CHOL, HDL, LDL, TRIG, CREAT. Each analyte has a
    subject-level baseline plus per-visit noise and a lower floor; only the
    cholesterol baseline depends on AGE.

    Args:
        df_dm: Demographics frame providing SUBJID, AGE and RFSTDTC.
        visits_per_subject: Number of visits per subject, spaced 30 days apart.

    Returns:
        pandas.DataFrame with columns SUBJID, LBTESTCD, LBTEST, LBORRES,
        LBORRESU, LBDTC, VISIT; LBORRES is rounded to two decimals.
    """
    # (test code, test name) in the order records are written per visit.
    panel = (
        ('GLUCOSE', 'Glucose'),
        ('CHOL', 'Total Cholesterol'),
        ('HDL', 'HDL Cholesterol'),
        ('LDL', 'LDL Cholesterol'),
        ('TRIG', 'Triglycerides'),
        ('CREAT', 'Creatinine'),
    )

    collected = []
    for _, dm_row in df_dm.iterrows():
        subject_id = dm_row['SUBJID']
        years = dm_row['AGE']

        # Subject-level baselines; draw order is part of the seeded behaviour.
        glucose0 = 100 + np.random.normal(0, 15)
        chol0 = 200 + (years - 50) * 1.5 + np.random.normal(0, 30)
        hdl0 = 50 + np.random.normal(0, 10)
        ldl0 = 130 + np.random.normal(0, 25)
        trig0 = 150 + np.random.normal(0, 40)
        creat0 = 1.0 + np.random.normal(0, 0.2)

        for visit_no in range(1, visits_per_subject + 1):
            sampled_on = dm_row['RFSTDTC'] + timedelta(days=visit_no * 30)
            visit_label = f'VISIT_{visit_no}'

            # Tuple elements are evaluated left to right, matching `panel`.
            results_this_visit = (
                max(70, glucose0 + np.random.normal(0, 10)),
                max(120, chol0 + np.random.normal(0, 15)),
                max(25, hdl0 + np.random.normal(0, 8)),
                max(50, ldl0 + np.random.normal(0, 20)),
                max(50, trig0 + np.random.normal(0, 30)),
                max(0.5, creat0 + np.random.normal(0, 0.1)),
            )

            for (code, label), value in zip(panel, results_this_visit):
                collected.append({
                    'SUBJID': subject_id,
                    'LBTESTCD': code,
                    'LBTEST': label,
                    'LBORRES': value,
                    'LBORRESU': 'mg/dL',
                    'LBDTC': sampled_on,
                    'VISIT': visit_label,
                })

    df_lb = pd.DataFrame(collected)
    df_lb['LBORRES'] = df_lb['LBORRES'].round(2)
    print(f"✓ Generated {len(df_lb)} laboratory records")
    return df_lb

# 4. Generate Stroke Outcomes
def generate_stroke_outcomes(df_dm, df_vs, df_lb):
    """Simulate a binary stroke outcome per subject from a hand-built risk score.

    Vital signs and labs are averaged per subject, joined onto demographics,
    and turned into an additive risk score (age over 65, systolic BP over 140,
    cholesterol over 240, male sex, BLACK race, mean glucose over 126), scaled
    by 0.8 in the TREATMENT arm. The score is mapped through a logistic curve
    centred at 5 and a Bernoulli draw decides the outcome.

    Args:
        df_dm: Demographics frame (SUBJID, AGE, SEX, RACE, ARM).
        df_vs: Long vital-signs frame (SUBJID, VSTESTCD, VSORRES).
        df_lb: Long laboratory frame (SUBJID, LBTESTCD, LBORRES).

    Returns:
        pandas.DataFrame with SUBJID, STROKE (0/1), RISK_SCORE and
        STROKE_PROBABILITY, one row per subject in ``df_dm`` order.
    """

    def _present(row, key):
        # True when the averaged measurement exists for this subject.
        return key in row and pd.notna(row[key])

    def _score(row):
        # Additive risk contributions, applied in a fixed order.
        score = 0
        if row['AGE'] > 65:
            score += (row['AGE'] - 65) * 0.1
        if _present(row, 'SYSBP') and row['SYSBP'] > 140:
            score += (row['SYSBP'] - 140) * 0.05
        if _present(row, 'CHOL') and row['CHOL'] > 240:
            score += (row['CHOL'] - 240) * 0.02
        if row['SEX'] == 'M':
            score += 1.5
        if row['RACE'] == 'BLACK':
            score += 1.0
        if _present(row, 'GLUCOSE') and row['GLUCOSE'] > 126:
            score += 2.0
        # Treatment effect: a 20% reduction of the accumulated score.
        if row['ARM'] == 'TREATMENT':
            score *= 0.8
        return score

    vs_means = df_vs.pivot_table(
        index='SUBJID', columns='VSTESTCD', values='VSORRES', aggfunc='mean'
    ).reset_index()
    lb_means = df_lb.pivot_table(
        index='SUBJID', columns='LBTESTCD', values='LBORRES', aggfunc='mean'
    ).reset_index()

    per_subject = (
        df_dm
        .merge(vs_means, on='SUBJID', how='left')
        .merge(lb_means, on='SUBJID', how='left')
    )

    rows = []
    for _, subject_row in per_subject.iterrows():
        risk_score = _score(subject_row)
        probability = 1 / (1 + np.exp(-(risk_score - 5)))
        # Exactly one uniform draw per subject.
        had_stroke = np.random.random() < probability
        rows.append({
            'SUBJID': subject_row['SUBJID'],
            'STROKE': int(had_stroke),
            'RISK_SCORE': round(risk_score, 2),
            'STROKE_PROBABILITY': round(probability, 3)
        })

    df_outcomes = pd.DataFrame(rows)
    stroke_rate = df_outcomes['STROKE'].mean()
    print(f"✓ Generated stroke outcomes, stroke rate: {stroke_rate:.1%}")
    return df_outcomes

# 5. Generate Adverse Events (AE) Table
def generate_adverse_events(df_dm, df_outcomes):
    """Create a synthetic adverse-events (CDISC AE-style) table.

    Each subject receives a Poisson(2)-distributed number of events. Every
    event gets a start date 1-364 days after RFSTDTC, a term, a severity, an
    outcome and a causality assessment, all sampled independently.

    Args:
        df_dm: Demographics frame providing SUBJID and RFSTDTC.
        df_outcomes: Accepted for interface compatibility; not read by this
            function.

    Returns:
        pandas.DataFrame with SUBJID, AESEQ, AETERM, AESTDTC, AESEV, AEOUT, AEREL.
    """
    term_pool = [
        'HYPERTENSION', 'DIZZINESS', 'HEADACHE', 'FATIGUE',
        'NAUSEA', 'CHEST PAIN', 'PALPITATIONS', 'EDEMA',
        'SHORTNESS OF BREATH', 'MUSCLE CRAMPS'
    ]
    severity_levels = ['MILD', 'MODERATE', 'SEVERE']
    resolution_states = ['RECOVERED/RESOLVED', 'RECOVERING/RESOLVING', 'NOT RECOVERED/NOT RESOLVED']
    causality_levels = ['NOT RELATED', 'UNLIKELY', 'POSSIBLE', 'PROBABLE']

    events = []
    for _, dm_row in df_dm.iterrows():
        event_count = np.random.poisson(2)

        for seq in range(1, event_count + 1):
            # Draw order (onset, term, severity, outcome, causality) is fixed
            # so seeded runs stay reproducible.
            onset_offset = np.random.randint(1, 365)
            term = np.random.choice(term_pool)
            severity = np.random.choice(severity_levels, p=[0.6, 0.3, 0.1])
            resolution = np.random.choice(resolution_states, p=[0.7, 0.2, 0.1])
            causality = np.random.choice(causality_levels,
                                         p=[0.4, 0.3, 0.2, 0.1])
            events.append({
                'SUBJID': dm_row['SUBJID'],
                'AESEQ': seq,
                'AETERM': term,
                'AESTDTC': dm_row['RFSTDTC'] + timedelta(days=onset_offset),
                'AESEV': severity,
                'AEOUT': resolution,
                'AEREL': causality
            })

    df_ae = pd.DataFrame(events)
    print(f"✓ Generated {len(df_ae)} adverse event records")
    return df_ae

# 6. Generate Exposure (EX) Table
def generate_exposure_data(df_dm):
    """Create a synthetic exposure (CDISC EX-style) table, one row per subject.

    TREATMENT-arm subjects receive 'STUDY DRUG' at '10 mg'; every other arm is
    recorded as 'PLACEBO' with dose 'N/A'. Treatment duration is sampled from
    an exponential distribution (mean 300 days) and capped at 365 days.

    Args:
        df_dm: Demographics frame providing SUBJID, ARM and RFSTDTC.

    Returns:
        pandas.DataFrame with SUBJID, EXTRT, EXDOSE, EXDOSU, EXROUTE, EXSTDTC,
        EXENDTC and EXDUR (whole days).
    """
    duration_cap = 365
    placebo_regimen = ('PLACEBO', 'N/A')
    regimen_by_arm = {'TREATMENT': ('STUDY DRUG', '10 mg')}

    rows = []
    for _, dm_row in df_dm.iterrows():
        drug_name, dose = regimen_by_arm.get(dm_row['ARM'], placebo_regimen)

        # One exponential draw per subject, truncated to whole days.
        days_on_treatment = int(min(duration_cap, np.random.exponential(300)))
        first_dose_date = dm_row['RFSTDTC']

        rows.append({
            'SUBJID': dm_row['SUBJID'],
            'EXTRT': drug_name,
            'EXDOSE': dose,
            'EXDOSU': '' if dose == 'N/A' else 'mg',
            'EXROUTE': 'ORAL',
            'EXSTDTC': first_dose_date,
            'EXENDTC': first_dose_date + timedelta(days=days_on_treatment),
            'EXDUR': days_on_treatment
        })

    df_ex = pd.DataFrame(rows)
    print(f"✓ Generated {len(df_ex)} exposure records")
    return df_ex

# ============================================================================
# PART 2: DATA PREPARATION & FEATURE ENGINEERING
# ============================================================================

def prepare_modeling_dataset(df_dm, df_vs, df_lb, df_outcomes):
    """Assemble one wide row per subject for modeling.

    Vital signs and labs are averaged across visits per subject and test code,
    prefixed with ``VS_`` / ``LB_`` respectively, and left-joined (together
    with the outcomes table) onto a copy of the demographics frame.

    Args:
        df_dm: Demographics frame keyed by SUBJID.
        df_vs: Long vital-signs frame (SUBJID, VSTESTCD, VSORRES).
        df_lb: Long laboratory frame (SUBJID, LBTESTCD, LBORRES).
        df_outcomes: Outcomes frame keyed by SUBJID.

    Returns:
        pandas.DataFrame with the demographics columns followed by the VS_*,
        LB_* and outcome columns.
    """
    rule = "="*80
    print("\n" + rule)
    print("PREPARING MODELING DATASET")
    print(rule)

    def _subject_means(long_df, code_col, value_col, prefix):
        # Long -> wide: one column per test code holding the per-subject mean.
        wide = long_df.pivot_table(
            index='SUBJID',
            columns=code_col,
            values=value_col,
            aggfunc='mean'
        )
        wide.columns = [f'{prefix}_{code}' for code in wide.columns]
        return wide.reset_index()

    joinable = (
        _subject_means(df_vs, 'VSTESTCD', 'VSORRES', 'VS'),
        _subject_means(df_lb, 'LBTESTCD', 'LBORRES', 'LB'),
        df_outcomes,
    )

    # Left joins keep every subject from demographics even when a table has no row for them.
    modeling_data = df_dm.copy()
    for table in joinable:
        modeling_data = modeling_data.merge(table, on='SUBJID', how='left')

    print(f"✓ Modeling dataset: {len(modeling_data)} subjects, {len(modeling_data.columns)} features")
    return modeling_data

# ============================================================================
# PART 3: ENHANCED MODEL WITH SMOTE + CROSS-VALIDATION
# ============================================================================

def build_enhanced_stroke_model(df_modeling, use_smote=True, n_folds=5):
    """
    Train and evaluate Logistic Regression and Random Forest stroke classifiers.

    Steps:
    - Build the feature matrix from AGE, every ``VS_*`` / ``LB_*`` column and
      one-hot encoded SEX, RACE and ARM; missing values are mean-imputed.
    - Hold out a stratified 20% test set.
    - Optionally oversample the minority class of the training set with SMOTE.
    - Fit each model on the (balanced) training set and score it on the test
      set with AUC, recall, precision, F1 and F2.
    - Estimate generalisation with stratified k-fold cross-validation on the
      ORIGINAL training set. SMOTE (and scaling for Logistic Regression) run
      inside each training fold, so validation folds contain only real
      subjects and no synthetic neighbours of training points.

    Args:
        df_modeling: Wide modeling frame containing a binary ``STROKE`` column.
        use_smote: Whether to oversample the training data with SMOTE.
        n_folds: Number of stratified cross-validation folds.

    Returns:
        (results, X_test, y_test). ``results`` maps model name to a dict with
        keys model, scaler, auc, recall, precision, f1, f2, y_pred_proba,
        y_pred, features, cv_mean, cv_std and cv_ci. ``model`` is the estimator
        fitted on the full (balanced) training set and ``scaler`` is the scaler
        fitted on that same data (``None`` for Random Forest).
    """
    # imbalanced-learn is already a dependency of this notebook (SMOTE); its
    # Pipeline allows a sampler step that is applied to training folds only.
    from imblearn.pipeline import Pipeline as ImbPipeline

    print("\n" + "="*80)
    print("ENHANCED MODEL BUILDING")
    print("="*80)
    print(f"SMOTE: {'Enabled' if use_smote else 'Disabled'}")
    print(f"Cross-Validation: {n_folds}-fold")
    print(f"Random Forest Trees: 500")
    print("="*80 + "\n")

    # Prepare features
    feature_columns = ['AGE']
    vs_features = [col for col in df_modeling.columns if col.startswith('VS_')]
    lab_features = [col for col in df_modeling.columns if col.startswith('LB_')]
    feature_columns.extend(vs_features + lab_features)
    feature_columns = [col for col in feature_columns if col in df_modeling.columns]

    X = df_modeling[feature_columns].copy()
    y = df_modeling['STROKE'].copy()

    # Add categorical features
    for cat_feature in ['SEX', 'RACE', 'ARM']:
        if cat_feature in df_modeling.columns:
            dummies = pd.get_dummies(df_modeling[cat_feature], prefix=cat_feature)
            X = pd.concat([X, dummies], axis=1)

    X = X.fillna(X.mean())

    print(f"Features: {X.shape[1]} total")
    print(f"Class distribution: No Stroke={sum(y==0)}, Stroke={sum(y==1)}")
    print(f"Imbalance ratio: {(1-y.mean())/y.mean():.1f}:1\n")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Apply SMOTE to training set only
    if use_smote:
        print("Applying SMOTE to training set...")
        print(f"Before SMOTE: No Stroke={sum(y_train==0)}, Stroke={sum(y_train==1)}")

        smote = SMOTE(random_state=42, k_neighbors=5)
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

        print(f"After SMOTE:  No Stroke={sum(y_train_balanced==0)}, Stroke={sum(y_train_balanced==1)}")
        print(f"Created {len(y_train_balanced) - len(y_train)} synthetic stroke cases\n")
    else:
        X_train_balanced = X_train
        y_train_balanced = y_train

    # Initialize models
    models = {
        'Logistic Regression': LogisticRegression(
            random_state=42,
            max_iter=1000,
            class_weight='balanced'
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=500,  # Reduced from 10000
            random_state=42,
            max_depth=15,
            min_samples_split=10,
            min_samples_leaf=4,
            class_weight='balanced'
        )
    }

    results = {}

    # Train and evaluate
    print("Training models...\n")
    for name, model in models.items():
        print(f"--- {name} ---")

        if name == 'Logistic Regression':
            # Only the linear model needs standardised inputs.
            scaler = StandardScaler()
            X_train_proc = scaler.fit_transform(X_train_balanced)
            X_test_proc = scaler.transform(X_test)
            model.fit(X_train_proc, y_train_balanced)
            y_pred_proba = model.predict_proba(X_test_proc)[:, 1]
            y_pred = model.predict(X_test_proc)
        else:
            scaler = None
            model.fit(X_train_balanced, y_train_balanced)
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)

        # Metrics
        auc_score = roc_auc_score(y_test, y_pred_proba)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        f2 = fbeta_score(y_test, y_pred, beta=2)

        print(f"AUC: {auc_score:.3f}")
        print(f"Recall (Sensitivity): {recall:.3f} ⭐")
        print(f"Precision: {precision:.3f}")
        print(f"F2 Score: {f2:.3f}")

        # Cross-validation
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

        # Build a per-fold pipeline that mirrors the training recipe above
        # (SMOTE -> optional scaling -> classifier). cross_val_score clones the
        # pipeline for every fold, so the `model` fitted above on the full
        # training set is NOT refitted here and stays consistent with `scaler`
        # in the returned results.
        cv_steps = []
        if use_smote:
            cv_steps.append(('smote', SMOTE(random_state=42, k_neighbors=5)))
        if name == 'Logistic Regression':
            cv_steps.append(('scaler', StandardScaler()))
        cv_steps.append(('model', model))

        # Validate on real (non-resampled) subjects only: resampling before
        # splitting would leak synthetic neighbours of training points into
        # the validation folds and inflate the score.
        cv_scores = cross_val_score(
            ImbPipeline(cv_steps), X_train, y_train,
            cv=skf, scoring='roc_auc', n_jobs=-1
        )

        mean_cv = np.mean(cv_scores)
        std_cv = np.std(cv_scores)
        # Normal-approximation band from the spread of fold scores; with few
        # folds this is a rough indication rather than an exact interval.
        ci_lower = mean_cv - 1.96 * std_cv
        ci_upper = mean_cv + 1.96 * std_cv

        print(f"{n_folds}-Fold CV AUC: {mean_cv:.3f} ± {std_cv:.3f}")
        print(f"95% CI: ({ci_lower:.3f}, {ci_upper:.3f})\n")

        results[name] = {
            'model': model,
            'scaler': scaler,
            'auc': auc_score,
            'recall': recall,
            'precision': precision,
            'f1': f1,
            'f2': f2,
            'y_pred_proba': y_pred_proba,
            'y_pred': y_pred,
            'features': X.columns.tolist(),
            'cv_mean': mean_cv,
            'cv_std': std_cv,
            'cv_ci': (ci_lower, ci_upper)
        }

    # Clinical summary
    print("="*80)
    print("CLINICAL METRICS SUMMARY")
    print("="*80)
    print("⚠️  False Negative (missed stroke) >> False Positive (false alarm)\n")

    for name, result in results.items():
        # Fix the label order so the matrix is always 2x2 and unpacks as tn, fp, fn, tp.
        cm = confusion_matrix(y_test, result['y_pred'], labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        print(f"{name}:")
        print(f"  Caught strokes: {tp}/{tp+fn} ({tp/(tp+fn):.1%})")
        print(f"  Missed strokes: {fn}/{tp+fn} ({fn/(tp+fn):.1%}) ⚠️")
        print(f"  False alarms: {fp}/{tn+fp} ({fp/(tn+fp):.1%})")
        print()

    return results, X_test, y_test

# ============================================================================
# PART 4: RISK CALCULATOR
# ============================================================================

def create_stroke_risk_calculator(results, df_modeling):
    """Create a single-patient risk calculator from the best trained model.

    The model with the highest test recall (ties broken by AUC) is selected
    from ``results``. The returned function assembles a one-row feature frame
    in the model's training column order and returns the predicted stroke
    probability.

    Model features that the calculator's signature does not expose (for
    example temperature, weight or triglycerides) are imputed with their mean
    in ``df_modeling`` rather than 0, so the model never sees a physiologically
    impossible value. Features absent from ``df_modeling`` (the one-hot dummy
    columns) default to 0.

    Args:
        results: Mapping of model name to a result dict containing at least
            model, scaler, features, auc, recall, cv_mean, cv_std and cv_ci.
        df_modeling: Modeling frame used to derive imputation means.

    Returns:
        ``predict_stroke_risk(age, sex, systolic_bp, glucose, cholesterol, ...)``
        returning the stroke probability as a float, or NaN when prediction fails.
    """

    # Choose best model by recall
    best_model_name = max(results.keys(),
                         key=lambda x: (results[x]['recall'], results[x]['auc']))

    best_result = results[best_model_name]
    best_model = best_result['model']
    scaler = best_result['scaler']
    feature_names = best_result['features']

    # Cohort means for every numeric model feature present in the modeling frame.
    feature_defaults = {}
    for feature in feature_names:
        if feature in df_modeling.columns and pd.api.types.is_numeric_dtype(df_modeling[feature]):
            cohort_mean = df_modeling[feature].mean()
            if pd.notna(cohort_mean):
                feature_defaults[feature] = float(cohort_mean)

    ci_lower, ci_upper = best_result['cv_ci']

    print("\n" + "="*80)
    print(f"RISK CALCULATOR: {best_model_name}")
    print("="*80)
    print(f"Test AUC: {best_result['auc']:.3f}")
    print(f"CV AUC: {best_result['cv_mean']:.3f} ± {best_result['cv_std']:.3f}")
    print(f"95% CI: ({ci_lower:.3f}, {ci_upper:.3f})")
    print(f"Recall: {best_result['recall']:.3f} ⭐")
    print("="*80 + "\n")

    def predict_stroke_risk(age, sex, systolic_bp, glucose, cholesterol,
                           hdl=50, ldl=130, creatinine=1.0, heart_rate=70,
                           diastolic_bp=80, race='WHITE', treatment='CONTROL',
                           debug=False):
        """Calculate stroke risk for a patient.

        Returns the model's predicted probability of stroke, or NaN if the
        prediction could not be computed (the error is printed).
        """

        input_data = pd.DataFrame({
            'AGE': [age],
            'VS_SYSBP': [systolic_bp],
            'VS_DIABP': [diastolic_bp],
            'VS_HR': [heart_rate],
            'LB_CHOL': [cholesterol],
            'LB_GLUCOSE': [glucose],
            'LB_HDL': [hdl],
            'LB_LDL': [ldl],
            'LB_CREAT': [creatinine]
        })

        input_data['SEX_F'] = 1 if sex == 'F' else 0
        input_data['SEX_M'] = 1 if sex == 'M' else 0

        for r in ['ASIAN', 'BLACK', 'HISPANIC', 'OTHER', 'WHITE']:
            input_data[f'RACE_{r}'] = 1 if race == r else 0

        input_data['ARM_CONTROL'] = 1 if treatment == 'CONTROL' else 0
        input_data['ARM_TREATMENT'] = 1 if treatment == 'TREATMENT' else 0

        # Fill features the caller could not provide: cohort mean when known,
        # otherwise 0 (appropriate for an unset one-hot column).
        for feature in feature_names:
            if feature not in input_data.columns:
                input_data[feature] = feature_defaults.get(feature, 0)
        # Match the exact column order the model was trained on.
        input_data = input_data.reindex(columns=feature_names, fill_value=0)

        try:
            if scaler is not None:
                input_proc = scaler.transform(input_data)
                risk_prob = best_model.predict_proba(input_proc)[0, 1]
            else:
                risk_prob = best_model.predict_proba(input_data)[0, 1]

            if debug:
                print(f"\n{'='*70}")
                print("PREDICTION DETAILS")
                print(f"{'='*70}")
                print(f"Patient: {age}yo {sex}, SBP={systolic_bp}, Glucose={glucose}, Chol={cholesterol}")
                print(f"Model: {best_model_name}")
                print(f"Raw probability: {risk_prob:.6f}")
                print(f"Stroke risk: {risk_prob*100:.3f}%")
                print(f"Risk category: ", end='')
                if risk_prob < 0.05:
                    print("🟢 Low risk")
                elif risk_prob < 0.15:
                    print("🟡 Moderate risk")
                elif risk_prob < 0.25:
                    print("🟠 High risk")
                else:
                    print("🔴 Very high risk")
                print(f"{'='*70}\n")

            return risk_prob

        except Exception as e:
            print(f"❌ Error: {e}")
            # A failed prediction must not look like "zero risk"; NaN makes
            # the failure visible to anything that consumes the value.
            return float('nan')

    return predict_stroke_risk

# ============================================================================
# PART 5: VISUALIZATION
# ============================================================================

def plot_data_quality(df_dm, df_outcomes):
    """Plot a four-panel overview of the generated cohort.

    Panels, left to right: stroke outcome counts, risk-score histogram,
    treatment-arm split, and observed stroke rate per arm.

    Args:
        df_dm: Demographics frame providing SUBJID and ARM.
        df_outcomes: Outcomes frame providing SUBJID, STROKE and RISK_SCORE.
    """
    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    fig.suptitle('Data Quality Assessment', fontsize=14, fontweight='bold')

    # Stroke distribution
    # value_counts() orders by frequency, so pin the order to [0, 1]; this
    # guarantees the bars line up with the fixed tick labels below and that
    # both bars exist even if one class is absent.
    stroke_counts = df_outcomes['STROKE'].value_counts().reindex([0, 1], fill_value=0)
    stroke_counts.plot(kind='bar', ax=axes[0])
    axes[0].set_title('Stroke Distribution')
    axes[0].set_ylabel('Count')
    axes[0].set_xticklabels(['No Stroke', 'Stroke'], rotation=0)

    # Risk score distribution
    axes[1].hist(df_outcomes['RISK_SCORE'], bins=30, alpha=0.7, color='steelblue')
    axes[1].set_title('Risk Score Distribution')
    axes[1].set_xlabel('Risk Score')

    # Treatment groups
    df_dm['ARM'].value_counts().plot(kind='pie', ax=axes[2], autopct='%1.1f%%')
    axes[2].set_title('Treatment Groups')
    axes[2].set_ylabel('')

    # Stroke by treatment
    stroke_by_arm = df_dm.merge(df_outcomes, on='SUBJID').groupby('ARM')['STROKE'].mean()
    stroke_by_arm.plot(kind='bar', ax=axes[3], color=['coral', 'lightblue'])
    axes[3].set_title('Stroke Rate by Treatment')
    axes[3].set_ylabel('Stroke Rate')
    axes[3].set_xticklabels(axes[3].get_xticklabels(), rotation=0)

    plt.tight_layout()
    plt.show()

def plot_model_evaluation(results, X_test, y_test):
    """Draw a 2x3 evaluation dashboard for the trained models.

    Top row: ROC curves, precision-recall curves, grouped bars of
    AUC/recall/precision/F2. Bottom row: one confusion-matrix heatmap per
    model (in ``results`` order) and, when a 'Random Forest' entry exists, its
    ten most important features in the last panel.

    Args:
        results: Mapping of model name to the result dict produced during training.
        X_test: Held-out features; accepted for interface symmetry, not read here.
        y_test: Held-out true labels.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Enhanced Model Evaluation', fontsize=16, fontweight='bold')

    ax_roc, ax_pr, ax_bars = axes[0]
    bottom_row = axes[1]
    class_names = ['No Stroke', 'Stroke']

    # Panel 1: ROC curves, with the chance diagonal for reference.
    ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.3)
    for model_name, res in results.items():
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, res['y_pred_proba'])
        ax_roc.plot(false_pos_rate, true_pos_rate, linewidth=2,
                    label=f"{model_name}\nAUC={res['auc']:.3f}\nCV={res['cv_mean']:.3f}±{res['cv_std']:.3f}")
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC Curves')
    ax_roc.legend(fontsize=9)
    ax_roc.grid(True, alpha=0.3)

    # Panel 2: precision-recall curves.
    for model_name, res in results.items():
        prec_curve, rec_curve, _ = precision_recall_curve(y_test, res['y_pred_proba'])
        ax_pr.plot(rec_curve, prec_curve, linewidth=2, label=model_name)
    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.set_title('Precision-Recall Curves')
    ax_pr.legend()
    ax_pr.grid(True, alpha=0.3)

    # Panel 3: grouped bars, one group per metric and one bar per model.
    metric_labels = ['AUC', 'Recall', 'Precision', 'F2']
    metric_keys = ('auc', 'recall', 'precision', 'f2')
    positions = np.arange(len(metric_labels))
    bar_width = 0.35

    for slot, (model_name, res) in enumerate(results.items()):
        heights = [res[key] for key in metric_keys]
        ax_bars.bar(positions + slot*bar_width, heights, bar_width, label=model_name, alpha=0.8)

    ax_bars.set_ylabel('Score')
    ax_bars.set_title('Clinical Metrics')
    ax_bars.set_xticks(positions + bar_width/2)
    ax_bars.set_xticklabels(metric_labels)
    ax_bars.legend()
    ax_bars.set_ylim(0, 1.1)
    ax_bars.grid(True, alpha=0.3, axis='y')

    # Panels 4-5: confusion matrices, titled with recall and missed strokes.
    for slot, (model_name, res) in enumerate(results.items()):
        matrix = confusion_matrix(y_test, res['y_pred'])
        _, _, missed, _ = matrix.ravel()
        panel = bottom_row[slot]

        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel,
                    xticklabels=class_names,
                    yticklabels=class_names)
        panel.set_ylabel('Actual')
        panel.set_xlabel('Predicted')
        panel.set_title(f'{model_name}\nRecall={res["recall"]:.1%}, Missed={missed} strokes')

    # Panel 6: Random Forest feature importances (top ten, largest on top).
    if 'Random Forest' in results:
        forest = results['Random Forest']
        top_features = (
            pd.DataFrame({
                'feature': forest['features'],
                'importance': forest['model'].feature_importances_
            })
            .sort_values('importance', ascending=False)
            .head(10)
        )
        ax_imp = bottom_row[2]
        rows = range(len(top_features))

        ax_imp.barh(rows, top_features['importance'])
        ax_imp.set_yticks(rows)
        ax_imp.set_yticklabels(top_features['feature'], fontsize=9)
        ax_imp.invert_yaxis()
        ax_imp.set_xlabel('Importance')
        ax_imp.set_title('Top 10 Features (RF)')
        ax_imp.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.show()

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Pipeline driver: generates the synthetic study, saves it, trains the models,
# plots the evaluation and exposes `predict_stroke_risk` as a module-level name.
if __name__ == "__main__":
    print("\nStarting complete pipeline...\n")
    
    # PART 1: Generate synthetic data
    print("="*80)
    print("PART 1: GENERATING SYNTHETIC CLINICAL DATA")
    print("="*80 + "\n")
    
    # Vital signs and labs use 4 visits here, overriding the generators' default of 6.
    # Outcomes must be generated after VS/LB because the risk score reads their per-subject means.
    df_dm = generate_demographics(n_subjects=1000)
    df_vs = generate_vital_signs(df_dm, visits_per_subject=4)
    df_lb = generate_lab_data(df_dm, visits_per_subject=4)
    df_outcomes = generate_stroke_outcomes(df_dm, df_vs, df_lb)
    df_ae = generate_adverse_events(df_dm, df_outcomes)
    df_ex = generate_exposure_data(df_dm)
    
    print(f"\n{'='*80}")
    print("DATA GENERATION COMPLETE")
    print(f"{'='*80}")
    print(f"Demographics: {len(df_dm)} records")
    print(f"Vital Signs: {len(df_vs)} records")
    print(f"Laboratory: {len(df_lb)} records")
    print(f"Adverse Events: {len(df_ae)} records")
    print(f"Exposure: {len(df_ex)} records")
    print(f"Outcomes: {len(df_outcomes)} records")
    print(f"Stroke rate: {df_outcomes['STROKE'].mean():.1%}")
    
    # Save data
    # Paths are relative, so the CSVs land in the current working directory.
    df_dm.to_csv('demographics.csv', index=False)
    df_vs.to_csv('vital_signs.csv', index=False)
    df_lb.to_csv('laboratory.csv', index=False)
    df_ae.to_csv('adverse_events.csv', index=False)
    df_ex.to_csv('exposure.csv', index=False)
    df_outcomes.to_csv('stroke_outcomes.csv', index=False)
    print("\n✓ All data saved as CSV files")
    
    # Data quality visualization
    plot_data_quality(df_dm, df_outcomes)
    
    # PART 2: Prepare modeling dataset
    df_modeling = prepare_modeling_dataset(df_dm, df_vs, df_lb, df_outcomes)
    
    # PART 3: Build enhanced model
    results, X_test, y_test = build_enhanced_stroke_model(
        df_modeling,
        use_smote=True,  # ⭐ Enable SMOTE
        n_folds=5        # ⭐ 5-fold CV
    )
    
    # PART 4: Visualize results
    plot_model_evaluation(results, X_test, y_test)
    
    # PART 5: Create risk calculator
    # Bound at module level so the sample-test cells that follow can call it.
    predict_stroke_risk = create_stroke_risk_calculator(results, df_modeling)
    
    # PART 6: Example predictions
    # Each example passes only the five required inputs; the other clinical
    # parameters keep the calculator's defaults. debug=True prints the details.
    print("\n" + "="*80)
    print("EXAMPLE PREDICTIONS")
    print("="*80 + "\n")
    
    print("Example 1: Moderate-High Risk Patient")
    risk1 = predict_stroke_risk(
        age=72, sex='M', systolic_bp=150,
        glucose=110, cholesterol=220, debug=True
    )
    
    print("Example 2: Very High Risk Patient")
    risk2 = predict_stroke_risk(
        age=80, sex='M', systolic_bp=180,
        glucose=180, cholesterol=280, debug=True
    )
    
    print("Example 3: Low Risk Patient")
    risk3 = predict_stroke_risk(
        age=45, sex='F', systolic_bp=110,
        glucose=90, cholesterol=180, debug=True
    )
    
    # Closing summary of what the run produced.
    print("\n" + "="*80)
    print("PIPELINE COMPLETE!")
    print("="*80)
    print("\nKey Features:")
    print("✓ Synthetic data generated following CDISC standards")
    print("✓ SMOTE applied to balance training data")
    print("✓ 5-fold cross-validation with confidence intervals")
    print("✓ Clinical metrics prioritizing recall (sensitivity)")
    print("✓ 500 trees for computational efficiency")
    print("✓ Risk calculator ready for use")
    print("\nYou can now use predict_stroke_risk() for new patients!")
    print("="*80)

ModuleNotFoundError: No module named 'sklearn.utils._metadata_requests'

In [None]:
# Sample Test
# Requires `predict_stroke_risk`, which the pipeline cell creates via
# create_stroke_risk_calculator(); run that cell first.
# Only the five required inputs are given; all other parameters keep their defaults.
risk = predict_stroke_risk(
    age=72, 
    sex='M',
    systolic_bp=150,
    glucose=110, 
    cholesterol=150,
    debug=True
)
# The calculator returns a probability; show it as a percentage.
print(f"\nStroke risk: {risk*100:.3f}%")

In [None]:
# Sample Test 2
# Same patient age and sex as the first sample test, with higher systolic BP,
# glucose and cholesterol. Requires `predict_stroke_risk` from the pipeline cell.
risk = predict_stroke_risk(
    age=72, 
    sex='M',
    systolic_bp=180,
    glucose=180, 
    cholesterol=240,
    debug=True
)
# The calculator returns a probability; show it as a percentage.
print(f"\nStroke risk: {risk*100:.3f}%")