In [2]:
# =============================================================================
# PART 1: ASTHMA RISK CLASSIFICATION
# =============================================================================
# MISSING DATA HANDLING & TRAIN/TEST SPLIT
# =============================================================================

import pandas as pd
import numpy as np

# Load cleaned data
df = pd.read_csv("merged_nhanes_data_cleaned.csv")

# Drop rows with missing target variable
df = df[df['Asthmatic'].notna()]

# Identify weight columns
weight_cols = [col for col in df.columns if 'weight' in col.lower() or col.startswith('WT')]

# Drop rows with >40% missing data
row_missing_pct = df.isnull().sum(axis=1) / len(df.columns)
df = df[row_missing_pct <= 0.4]

# Fill remaining missing values
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in ['Asthmatic'] + weight_cols]

for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        mode_val = df[col].mode()[0] if len(df[col].mode()) > 0 else 'Unknown'
        df[col].fillna(mode_val, inplace=True)

print(f"Final dataset: {df.shape}")

# Train/test split
train, test = train_test_split(df, test_size=0.2, stratify=df['Asthmatic'], random_state=42)

print(f"Train: {train.shape}, Test: {test.shape}")

# Save
train.to_csv("train_cleaned.csv", index=False)
test.to_csv("test_cleaned.csv", index=False)

Final dataset: (12736, 31)
Train: (10188, 31), Test: (2548, 31)


In [3]:
# ==============================================================================
# SAMPLING WEIGHTS EXTRACTION
# ==============================================================================
import pickle

# Load data
train = pd.read_csv("train_cleaned.csv")
test = pd.read_csv("test_cleaned.csv")

# Define feature-weight mappings
INTERVIEW_WEIGHT_FEATURES = [
    "Mother smoked when pregnant", "Child was overweight",
    "Number of people who smoke inside the house", "Used any tobacco product in last 5 days",
    "At least 100 cigarettes in life", "Age started smoking", "Currently smokes",
    "Cigarettes smoked in entire life (12-17)", "Asthmatic", "Age when first had asthma",
    "Still have asthma", "Asthma attack in past year", "Emergency care visit for asthma/past yr",
    "Taking treatment for anemia", "Vigorous work activity", "High blood pressure",
    "Shortness of breath on stairs", "Vigorous recreational activities", "Avg # alcoholic drinks/day"
]

MEC_WEIGHT_FEATURES = [
    "BMI", "Eosinophils", "White blood cell count", "Basophils",
    "Red blood cell count", "Total Cholesterol", "Urinary Total NNAL"
]

# Find weight columns
interview_weight = None
mec_weight = None

for col in train.columns:
    if 'interview weight' in col.lower():
        interview_weight = col
    if 'mec' in col.lower() and 'weight' in col.lower():
        mec_weight = col

# Create uniform weights if not found
if not interview_weight:
    train['interview_weight'] = 1.0
    test['interview_weight'] = 1.0
    interview_weight = 'interview_weight'
else:
    print(f"Interview weight: {interview_weight}")

if not mec_weight:
    train['mec_weight'] = 1.0
    test['mec_weight'] = 1.0
    mec_weight = 'mec_weight'
else:
    print(f"MEC weight: {mec_weight}")

# Check which features are present
interview_present = [f for f in INTERVIEW_WEIGHT_FEATURES if f in train.columns]
mec_present = [f for f in MEC_WEIGHT_FEATURES if f in train.columns]

# Save weight info
weight_info = {
    'interview_weight': interview_weight,
    'mec_weight': mec_weight,
    'interview_features': interview_present,
    'mec_features': mec_present
}

with open("weight_info.pkl", "wb") as f:
    pickle.dump(weight_info, f)

# Extract and save weight arrays
np.save("train_interview_weights.npy", train[interview_weight].values)
np.save("test_interview_weights.npy", test[interview_weight].values)
np.save("train_mec_weights.npy", train[mec_weight].values)
np.save("test_mec_weights.npy", test[mec_weight].values)

print(f"Interview features: {len(interview_present)}/{len(INTERVIEW_WEIGHT_FEATURES)}")
print(f"MEC features: {len(mec_present)}/{len(MEC_WEIGHT_FEATURES)}")

Interview weight: Full sample 2 year interview weight
MEC weight: Full sample 2 year MEC exam weight
Interview features: 19/19
MEC features: 7/7


In [4]:
# ==============================================================================
# FEATURE SELECTION WITH WEIGHTED STATISTICAL TESTS
# ==============================================================================

from scipy.stats import chi2_contingency, t
import warnings
warnings.filterwarnings('ignore')

# Load data and weights
train = pd.read_csv("train_cleaned.csv")

with open("weight_info.pkl", "rb") as f:
    weight_info = pickle.load(f)

interview_weight = weight_info['interview_weight']
mec_weight = weight_info['mec_weight']
interview_features = weight_info['interview_features']
mec_features = weight_info['mec_features']

protected_cols = [interview_weight, mec_weight, 'Asthmatic']
all_features = [col for col in train.columns if col not in protected_cols]

print(f"Features to evaluate: {len(all_features)}")

# ==============================================================================
# WEIGHT ASSIGNMENT FUNCTION
# ==============================================================================

def get_weight_for_feature(feature_name):
    """Match each feature to its appropriate weight"""
    if feature_name in mec_features:
        return mec_weight
    elif feature_name in interview_features:
        return interview_weight
    else:
        return interview_weight

# ==============================================================================
# WEIGHTED STATISTICAL TESTS
# ==============================================================================

def weighted_pearson(x, y, weights):
    """Weighted Pearson correlation with effective sample size"""
    valid_mask = ~(np.isnan(x) | np.isnan(y) | np.isnan(weights))
    x, y, w = x[valid_mask], y[valid_mask], weights[valid_mask]
    
    if len(x) < 10:
        return np.nan, 1.0, 0
    
    # Normalize weights
    w = w / w.sum()
    
    # Weighted correlation
    x_mean = np.sum(w * x)
    y_mean = np.sum(w * y)
    cov = np.sum(w * (x - x_mean) * (y - y_mean))
    x_std = np.sqrt(np.sum(w * (x - x_mean)**2))
    y_std = np.sqrt(np.sum(w * (y - y_mean)**2))
    
    if x_std > 0 and y_std > 0:
        corr = cov / (x_std * y_std)
    else:
        return np.nan, 1.0, 0
    
    # Effective sample size (Kish's approximation)
    n_eff = (w.sum()**2) / np.sum(w**2)
    
    # P-value
    if abs(corr) < 0.9999:
        t_stat = corr * np.sqrt(n_eff - 2) / np.sqrt(1 - corr**2)
        p_value = 2 * (1 - t.cdf(abs(t_stat), n_eff - 2))
    else:
        p_value = 0.0
    
    return corr, p_value, n_eff

def weighted_chi_square(x, y, weights):
    """Weighted chi-square test with Cramér's V"""
    df = pd.DataFrame({'x': x, 'y': y, 'w': weights}).dropna()
    
    if len(df) < 10:
        return np.nan, 1.0, 0
    
    # Weighted contingency table
    contingency = df.groupby(['x', 'y'])['w'].sum().unstack(fill_value=0)
    
    if contingency.shape[0] < 2 or contingency.shape[1] < 2:
        return np.nan, 1.0, 0
    
    try:
        chi2, p_value, dof, expected = chi2_contingency(contingency)
        n = contingency.sum().sum()
        min_dim = min(contingency.shape[0], contingency.shape[1]) - 1
        cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else 0
        
        # Effective sample size
        w = df['w'].values
        n_eff = (w.sum()**2) / (w**2).sum()
        
        return cramers_v, p_value, n_eff
    except:
        return np.nan, 1.0, 0

# ==============================================================================
# CATEGORIZE FEATURES
# ==============================================================================

continuous_features = []
binary_features = []
categorical_features = []

for col in all_features:
    n_unique = train[col].dropna().nunique()
    
    if n_unique == 2:
        binary_features.append(col)
    elif train[col].dtype in ['int64', 'float64'] and n_unique > 10:
        continuous_features.append(col)
    else:
        categorical_features.append(col)

print(f"Continuous: {len(continuous_features)}, Binary: {len(binary_features)}, Categorical: {len(categorical_features)}")

# ==============================================================================
# RUN WEIGHTED TESTS
# ==============================================================================

all_results = []

# Test continuous features
for col in continuous_features:
    weight_col = get_weight_for_feature(col)
    valid_mask = train[col].notna() & train['Asthmatic'].notna() & train[weight_col].notna()
    
    if valid_mask.sum() < 10:
        continue
    
    x = train.loc[valid_mask, col].values
    y = train.loc[valid_mask, 'Asthmatic'].values
    w = train.loc[valid_mask, weight_col].values
    
    corr, p_value, n_eff = weighted_pearson(x, y, w)
    
    if not np.isnan(corr):
        all_results.append({
            'feature': col,
            'correlation': abs(corr),
            'p_value': p_value,
            'n_effective': int(n_eff),
            'test': 'Weighted Pearson',
            'weight_used': 'MEC' if weight_col == mec_weight else 'Interview'
        })

# Test binary features
for col in binary_features:
    weight_col = get_weight_for_feature(col)
    valid_mask = train[col].notna() & train['Asthmatic'].notna() & train[weight_col].notna()
    
    if valid_mask.sum() < 10:
        continue
    
    x = train.loc[valid_mask, col].values
    y = train.loc[valid_mask, 'Asthmatic'].values
    w = train.loc[valid_mask, weight_col].values
    
    cramers_v, p_value, n_eff = weighted_chi_square(x, y, w)
    
    if not np.isnan(cramers_v):
        all_results.append({
            'feature': col,
            'correlation': cramers_v,
            'p_value': p_value,
            'n_effective': int(n_eff),
            'test': 'Weighted Chi-Square',
            'weight_used': 'MEC' if weight_col == mec_weight else 'Interview'
        })

# Test categorical features
for col in categorical_features:
    if train[col].nunique() > 20:
        continue
    
    weight_col = get_weight_for_feature(col)
    valid_mask = train[col].notna() & train['Asthmatic'].notna() & train[weight_col].notna()
    
    if valid_mask.sum() < 10:
        continue
    
    x = train.loc[valid_mask, col].values
    y = train.loc[valid_mask, 'Asthmatic'].values
    w = train.loc[valid_mask, weight_col].values
    
    cramers_v, p_value, n_eff = weighted_chi_square(x, y, w)
    
    if not np.isnan(cramers_v):
        all_results.append({
            'feature': col,
            'correlation': cramers_v,
            'p_value': p_value,
            'n_effective': int(n_eff),
            'test': 'Weighted Chi-Square',
            'weight_used': 'MEC' if weight_col == mec_weight else 'Interview'
        })

# ==============================================================================
# SELECT FEATURES
# ==============================================================================

all_results_df = pd.DataFrame(all_results).sort_values('correlation', ascending=False)

# Selection criteria
selected_df = all_results_df[
    (all_results_df['correlation'] > 0.05) & 
    (all_results_df['p_value'] < 0.05)
]

# Display top features
print(f"\nTop 15 features:")
for i, row in selected_df.head(15).iterrows():
    sig = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*"
    print(f"  {row['feature'][:40]:40s}: {row['correlation']:.4f} {sig} [{row['weight_used']}]")

# ==============================================================================
# SAVE RESULTS
# ==============================================================================

selected_features = selected_df['feature'].tolist()

with open("selected_features.txt", "w") as f:
    for feature in selected_features:
        f.write(feature + "\n")

all_results_df.to_csv("feature_selection_results.csv", index=False)
selected_df.to_csv("selected_features_stats.csv", index=False)

Features to evaluate: 28
Continuous: 11, Binary: 14, Categorical: 3

Selected features: 10/28
  Interview weighted: 9
  MEC weighted: 1

Top 15 features:
  Still have asthma                       : 0.5787 *** [Interview]
  Asthma attack in past year              : 0.5054 *** [Interview]
  Age when first had asthma               : 0.3656 *** [Interview]
  Emergency care visit for asthma/past yr : 0.2708 *** [Interview]
  Eosinophils                             : 0.1146 *** [MEC]
  Age                                     : 0.1086 *** [Interview]
  Child was overweight                    : 0.0868 *** [Interview]
  Shortness of breath on stairs           : 0.0710 *** [Interview]
  Mother smoked when pregnant             : 0.0573 *** [Interview]
  Gender                                  : 0.0535 *** [Interview]


In [5]:
# ============================================================================
# PREPROCESSING FOR MODELS
# ============================================================================

train = pd.read_csv("train_cleaned.csv")
test = pd.read_csv("test_cleaned.csv")

with open("weight_info.pkl", "rb") as f:
    weight_info = pickle.load(f)

with open("selected_features.txt", "r") as f:
    selected_features = [line.strip() for line in f.readlines()]

print(f"Loaded: {len(selected_features)} selected features")

# ============================================================================
# REMOVE DATA LEAKAGE FEATURES
# ============================================================================

LEAKAGE_FEATURES = [
    "Still have asthma",
    "Asthma attack in past year",
    "Emergency care visit for asthma/past yr",
    "Age when first had asthma", "Age"
]

clean_features = [f for f in selected_features if f not in LEAKAGE_FEATURES]
removed = len(selected_features) - len(clean_features)

print(f"Removed {removed} leakage features -> {len(clean_features)} features remaining")

# ============================================================================
# CLASSIFY AND ENCODE FEATURES
# ============================================================================

continuous_features = []
binary_features = []
categorical_features = []

for col in clean_features:
    n_unique = train[col].dropna().nunique()
    
    if n_unique == 2:
        binary_features.append(col)
    elif train[col].dtype in ['int64', 'float64'] and n_unique > 10:
        continuous_features.append(col)
    else:
        categorical_features.append(col)

print(f"Feature types: {len(continuous_features)} continuous, {len(binary_features)} binary, {len(categorical_features)} categorical")

# Encode categorical features
train_encoded = train.copy()
test_encoded = test.copy()
encoding_info = {}

for col in categorical_features:
    n_categories = train[col].nunique()
    
    if n_categories <= 5:
        # One-hot encoding
        train_dummies = pd.get_dummies(train[col], prefix=col, drop_first=True)
        test_dummies = pd.get_dummies(test[col], prefix=col, drop_first=True)
        
        for dummy_col in train_dummies.columns:
            if dummy_col not in test_dummies.columns:
                test_dummies[dummy_col] = 0
        
        train_encoded = pd.concat([train_encoded, train_dummies], axis=1)
        test_encoded = pd.concat([test_encoded, test_dummies], axis=1)
        
        clean_features.remove(col)
        clean_features.extend(train_dummies.columns.tolist())
        
        encoding_info[col] = {'method': 'one-hot', 'new_columns': train_dummies.columns.tolist()}
    else:
        # Ordinal encoding
        categories = sorted(train[col].dropna().unique())
        mapping = {cat: i for i, cat in enumerate(categories)}
        
        train_encoded[col] = train[col].map(mapping)
        test_encoded[col] = test[col].map(mapping)
        
        if test_encoded[col].isna().any():
            most_common = train_encoded[col].mode()[0]
            test_encoded[col] = test_encoded[col].fillna(most_common)
        
        encoding_info[col] = {'method': 'ordinal', 'mapping': mapping}

if categorical_features:
    print(f"Encoded {len(categorical_features)} categorical features")

# ============================================================================
# PREPARE FINAL DATASETS
# ============================================================================

X_train = train_encoded[clean_features].copy()
X_test = test_encoded[clean_features].copy()
y_train = train_encoded['Asthmatic'].copy()
y_test = test_encoded['Asthmatic'].copy()

# Handle missing values
train_missing = X_train.isnull().sum().sum()
test_missing = X_test.isnull().sum().sum()

if train_missing > 0 or test_missing > 0:
    for col in X_train.columns:
        if X_train[col].isnull().any():
            median_val = X_train[col].median()
            X_train[col] = X_train[col].fillna(median_val)
            X_test[col] = X_test[col].fillna(median_val)
    print(f"Imputed {train_missing + test_missing} missing values")

# ============================================================================
# DETERMINE SAMPLE WEIGHTS
# ============================================================================

mec_features = weight_info['mec_features']
has_mec = any(f in mec_features for f in clean_features)

if has_mec:
    train_weights = train_encoded[weight_info['mec_weight']].values
    test_weights = test_encoded[weight_info['mec_weight']].values
    weight_type = 'mec'
else:
    train_weights = train_encoded[weight_info['interview_weight']].values
    test_weights = test_encoded[weight_info['interview_weight']].values
    weight_type = 'interview'

print(f"Using {weight_type} weights")

# ============================================================================
# SAVE PROCESSED DATA
# ============================================================================

X_train.to_csv("X_train_processed.csv", index=False)
X_test.to_csv("X_test_processed.csv", index=False)
y_train.to_csv("y_train.csv", index=False, header=True)
y_test.to_csv("y_test.csv", index=False, header=True)

np.save("train_sample_weights.npy", train_weights)
np.save("test_sample_weights.npy", test_weights)

final_info = {
    'features': clean_features,
    'continuous_features': continuous_features,
    'binary_features': binary_features,
    'categorical_features': categorical_features,
    'encoding_info': encoding_info,
    'removed_leakage': removed,
    'weight_type_used': weight_type
}

with open("processed_data_info.pkl", "wb") as f:
    pickle.dump(final_info, f)

Loaded: 10 selected features
Removed 5 leakage features -> 5 features remaining
Feature types: 1 continuous, 4 binary, 0 categorical
Imputed 11284 missing values
Using mec weights
Final features: 5
Train asthma rate: 18.69%
Test asthma rate: 18.68%


In [6]:
# =========================================================
# RISK STRATIFICATION WITH FEATURE ENGINEERING
# =========================================================

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import joblib

warnings.filterwarnings('ignore')

# ============================================================================
# LOAD DATA
# ============================================================================

X_train = pd.read_csv("X_train_processed.csv")
X_test = pd.read_csv("X_test_processed.csv")
y_train = pd.read_csv("y_train.csv").squeeze()
y_test = pd.read_csv("y_test.csv").squeeze()
train_weights = np.load("train_sample_weights.npy")
test_weights = np.load("test_sample_weights.npy")

with open("processed_data_info.pkl", "rb") as f:
    data_info = pickle.load(f)

imbalance_ratio = (1-y_train).sum() / y_train.sum()
print(f"Data: {X_train.shape[0]} train, {X_test.shape[0]} test")
print(f"Imbalance: {imbalance_ratio:.2f}:1 | Train asthma: {y_train.mean():.2%} | Test asthma: {y_test.mean():.2%}\n")

# ============================================================================
# FEATURE ENGINEERING
# ============================================================================

X_train_fe = X_train.copy()
X_test_fe = X_test.copy()
original_count = X_train_fe.shape[1]

def safe_add(train_df, test_df, name, train_vals, test_vals):
    if name not in train_df.columns:
        train_df[name] = train_vals
        test_df[name] = test_vals
        return True
    return False

added = 0

# Domain interactions
if 'Age' in X_train_fe.columns and 'Currently smokes' in X_train_fe.columns:
    added += safe_add(X_train_fe, X_test_fe, 'age_x_smoking',
                     X_train_fe['Age'] * X_train_fe['Currently smokes'],
                     X_test_fe['Age'] * X_test_fe['Currently smokes'])

if 'BMI' in X_train_fe.columns and 'Age' in X_train_fe.columns:
    added += safe_add(X_train_fe, X_test_fe, 'bmi_x_age',
                     X_train_fe['BMI'] * X_train_fe['Age'],
                     X_test_fe['BMI'] * X_test_fe['Age'])

if 'Eosinophils' in X_train_fe.columns and 'White blood cell count' in X_train_fe.columns:
    added += safe_add(X_train_fe, X_test_fe, 'eosinophils_x_wbc',
                     X_train_fe['Eosinophils'] * X_train_fe['White blood cell count'],
                     X_test_fe['Eosinophils'] * X_test_fe['White blood cell count'])

if 'Gender' in X_train_fe.columns and 'Currently smokes' in X_train_fe.columns:
    added += safe_add(X_train_fe, X_test_fe, 'gender_x_smoking',
                     X_train_fe['Gender'] * X_train_fe['Currently smokes'],
                     X_test_fe['Gender'] * X_test_fe['Currently smokes'])

# BMI categories
if 'BMI' in X_train_fe.columns:
    added += safe_add(X_train_fe, X_test_fe, 'bmi_underweight',
                     (X_train_fe['BMI'] < 18.5).astype(int),
                     (X_test_fe['BMI'] < 18.5).astype(int))
    added += safe_add(X_train_fe, X_test_fe, 'bmi_overweight',
                     ((X_train_fe['BMI'] >= 25) & (X_train_fe['BMI'] < 30)).astype(int),
                     ((X_test_fe['BMI'] >= 25) & (X_test_fe['BMI'] < 30)).astype(int))
    added += safe_add(X_train_fe, X_test_fe, 'bmi_obese',
                     (X_train_fe['BMI'] >= 30).astype(int),
                     (X_test_fe['BMI'] >= 30).astype(int))

# Age groups
if 'Age' in X_train_fe.columns:
    added += safe_add(X_train_fe, X_test_fe, 'age_child',
                     (X_train_fe['Age'] < 18).astype(int),
                     (X_test_fe['Age'] < 18).astype(int))
    added += safe_add(X_train_fe, X_test_fe, 'age_senior',
                     (X_train_fe['Age'] >= 65).astype(int),
                     (X_test_fe['Age'] >= 65).astype(int))

# Polynomial features
continuous_cols = [col for col in ['Age', 'BMI', 'Eosinophils', 'White blood cell count'] 
                   if col in X_train_fe.columns]

if continuous_cols:
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_train_poly = poly.fit_transform(X_train_fe[continuous_cols])
    X_test_poly = poly.transform(X_test_fe[continuous_cols])
    
    poly_names = poly.get_feature_names_out(continuous_cols)
    new_poly_names = [name for name in poly_names if name not in continuous_cols]
    
    X_train_poly_df = pd.DataFrame(X_train_poly[:, len(continuous_cols):],
                                    columns=new_poly_names, index=X_train_fe.index)
    X_test_poly_df = pd.DataFrame(X_test_poly[:, len(continuous_cols):],
                                   columns=new_poly_names, index=X_test_fe.index)
    
    X_train_fe = pd.concat([X_train_fe, X_train_poly_df], axis=1)
    X_test_fe = pd.concat([X_test_fe, X_test_poly_df], axis=1)

print(f"Feature Engineering: {original_count} → {X_train_fe.shape[1]} ({added} domain + {len(new_poly_names)} polynomial)\n")

X_train_fe = X_train_fe.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test_fe = X_test_fe.replace([np.inf, -np.inf], np.nan).fillna(0)

# ============================================================================
# CLASS BALANCING & STANDARDIZATION
# ============================================================================

class_weights = compute_sample_weight('balanced', y_train)
train_weights_balanced = train_weights * class_weights
train_weights_balanced = train_weights_balanced * (train_weights.sum() / train_weights_balanced.sum())

print(f"Class Weighting: minority {class_weights[y_train==1].mean():.2f}x, majority {class_weights[y_train==0].mean():.2f}x\n")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_fe)
X_test_scaled = scaler.transform(X_test_fe)

# ============================================================================
# MODEL TRAINING
# ============================================================================

print("Training models...\n")

# Logistic Regression
best_lr_auc = 0
for C in [0.01, 0.1, 1.0, 10.0]:
    lr = LogisticRegression(C=C, penalty='l2', solver='lbfgs', max_iter=1000, random_state=42)
    lr.fit(X_train_scaled, y_train, sample_weight=train_weights_balanced)
    auc = roc_auc_score(y_test, lr.predict_proba(X_test_scaled)[:, 1], sample_weight=test_weights)
    if auc > best_lr_auc:
        best_lr_auc, lr_model = auc, lr

lr_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

# XGBoost
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
best_xgb_auc = 0
for n_est, depth, lr_rate in [(100, 3, 0.1), (100, 5, 0.1), (200, 5, 0.05), (200, 7, 0.05)]:
    xgb_temp = xgb.XGBClassifier(n_estimators=n_est, max_depth=depth, learning_rate=lr_rate,
                                  subsample=0.8, colsample_bytree=0.8, gamma=1,
                                  scale_pos_weight=scale_pos_weight, random_state=42,
                                  eval_metric='auc', use_label_encoder=False)
    xgb_temp.fit(X_train_fe, y_train, sample_weight=train_weights_balanced, verbose=False)
    auc = roc_auc_score(y_test, xgb_temp.predict_proba(X_test_fe)[:, 1], sample_weight=test_weights)
    if auc > best_xgb_auc:
        best_xgb_auc, xgb_model = auc, xgb_temp

xgb_pred_proba = xgb_model.predict_proba(X_test_fe)[:, 1]

# Random Forest
best_rf_auc = 0
for n_est, depth, min_split in [(200, 10, 20), (200, 15, 15), (300, 12, 20), (200, None, 15)]:
    rf_temp = RandomForestClassifier(n_estimators=n_est, max_depth=depth, min_samples_split=min_split,
                                      min_samples_leaf=10, max_features='sqrt', random_state=42, n_jobs=-1)
    rf_temp.fit(X_train_fe, y_train, sample_weight=train_weights_balanced)
    auc = roc_auc_score(y_test, rf_temp.predict_proba(X_test_fe)[:, 1], sample_weight=test_weights)
    if auc > best_rf_auc:
        best_rf_auc, rf_model = auc, rf_temp

rf_pred_proba = rf_model.predict_proba(X_test_fe)[:, 1]

# Extra Trees
best_et_auc = 0
for n_est, depth, min_split in [(150, 10, 15), (200, 12, 15), (200, 15, 15), (300, 12, 20), (200, None, 15)]:
    et_temp = ExtraTreesClassifier(n_estimators=n_est, max_depth=depth, min_samples_split=min_split,
                                     min_samples_leaf=8, max_features='sqrt', bootstrap=True, 
                                     random_state=42, n_jobs=-1)
    et_temp.fit(X_train_fe, y_train, sample_weight=train_weights_balanced)
    auc = roc_auc_score(y_test, et_temp.predict_proba(X_test_fe)[:, 1], sample_weight=test_weights)
    if auc > best_et_auc:
        best_et_auc, et_model = auc, et_temp

et_pred_proba = et_model.predict_proba(X_test_fe)[:, 1]

# KNN
knn_model = KNeighborsClassifier(n_neighbors=15, weights='distance', algorithm='auto', n_jobs=-1)
knn_model.fit(X_train_scaled, y_train)
knn_pred_proba = knn_model.predict_proba(X_test_scaled)[:, 1]
knn_auc = roc_auc_score(y_test, knn_pred_proba, sample_weight=test_weights)

# Ensemble
best_ens_auc = 0
for w in [(0.4, 0.3, 0.3, 0.0), (0.3, 0.3, 0.3, 0.1), (0.35, 0.25, 0.25, 0.15), 
          (0.5, 0.2, 0.2, 0.1), (0.25, 0.25, 0.25, 0.25)]:
    ens_proba = w[0]*lr_pred_proba + w[1]*xgb_pred_proba + w[2]*rf_pred_proba + w[3]*et_pred_proba
    auc = roc_auc_score(y_test, ens_proba, sample_weight=test_weights)
    if auc > best_ens_auc:
        best_ens_auc, best_ens_weights, ensemble_proba = auc, w, ens_proba

# ============================================================================
# MODEL COMPARISON
# ============================================================================

def calc_metrics(y_true, y_proba, weights):
    y_pred = (y_proba >= 0.5).astype(int)
    return {
        'Accuracy': accuracy_score(y_true, y_pred, sample_weight=weights),
        'Precision': precision_score(y_true, y_pred, sample_weight=weights, zero_division=0),
        'Recall': recall_score(y_true, y_pred, sample_weight=weights, zero_division=0),
        'F1': f1_score(y_true, y_pred, sample_weight=weights, zero_division=0)
    }

models_data = {
    'Logistic Regression': (best_lr_auc, lr_pred_proba),
    'XGBoost': (best_xgb_auc, xgb_pred_proba),
    'Random Forest': (best_rf_auc, rf_pred_proba),
    'ExtraTrees': (best_et_auc, et_pred_proba),
    'Ensemble': (best_ens_auc, ensemble_proba),
    'KNN': (knn_auc, knn_pred_proba)
}

comparison_data = []
for name, (auc, proba) in models_data.items():
    metrics = calc_metrics(y_test, proba, test_weights)
    comparison_data.append({
        'Model': name, 'AUC': auc, 'Accuracy': metrics['Accuracy'],
        'Precision': metrics['Precision'], 'Recall': metrics['Recall'], 'F1': metrics['F1']
    })

comparison = pd.DataFrame(comparison_data)
comparison['Composite'] = 0.6 * comparison['AUC'] + 0.4 * comparison['F1']

best_row = comparison.loc[comparison['Composite'].idxmax()]
best_model_name = best_row['Model']

model_to_proba = {
    'Logistic Regression': lr_pred_proba, 'XGBoost': xgb_pred_proba,
    'Random Forest': rf_pred_proba, 'ExtraTrees': et_pred_proba,
    'Ensemble': ensemble_proba, 'KNN': knn_pred_proba
}
best_pred_proba = model_to_proba[best_model_name]

# ============================================================================
# RISK STRATIFICATION
# ============================================================================

low_th = np.percentile(best_pred_proba, 20)
high_th = np.percentile(best_pred_proba, 60)
risk_tiers = np.where(best_pred_proba < low_th, 'Low',
                     np.where(best_pred_proba < high_th, 'Moderate', 'High'))

print(f"Thresholds: Low < {low_th:.3f} < Moderate < {high_th:.3f} < High\n")

for tier in ['Low', 'Moderate', 'High']:
    mask = risk_tiers == tier
    count = mask.sum()
    pct = count / len(risk_tiers) * 100
    asthma_count = y_test[mask].sum()
    asthma_rate = y_test[mask].mean()
    print(f"{tier:10s}: {count:4d} samples ({pct:5.1f}%) | {asthma_count:3.0f} asthma cases ({asthma_rate:6.2%})")

low_rate = y_test[risk_tiers == 'Low'].mean()
high_rate = y_test[risk_tiers == 'High'].mean()
risk_ratio = high_rate / low_rate if low_rate > 0 else float('inf')
print(f"\nRisk Ratio (High/Low): {risk_ratio:.2f}x\n")

# ============================================================================
# SAVE RESULTS
# ============================================================================

joblib.dump(lr_model, 'logistic_regression_tuned.pkl')
joblib.dump(xgb_model, 'xgboost_tuned.pkl')
joblib.dump(rf_model, 'random_forest_tuned.pkl')
joblib.dump(et_model, 'extratrees_tuned.pkl')
joblib.dump(knn_model, 'knn_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')

with open('ensemble_info.pkl', 'wb') as f:
    pickle.dump({'weights': best_ens_weights, 'model_order': ['LR', 'XGB', 'RF', 'ET'], 
                 'best_auc': best_ens_auc}, f)

with open('best_model_info.pkl', 'wb') as f:
    pickle.dump({'best_model_name': best_model_name, 'best_auc': best_row['AUC'],
                 'best_f1': best_row['F1'], 'best_composite': best_row['Composite'],
                 'selection_method': 'composite_0.6auc_0.4f1',
                 'class_imbalance_strategy': 'class_weighting',
                 'imbalance_ratio': imbalance_ratio}, f)

X_train_fe.to_csv('X_train_engineered.csv', index=False)
X_test_fe.to_csv('X_test_engineered.csv', index=False)

results_df = pd.DataFrame({
    'y_true': y_test, 'lr_proba': lr_pred_proba, 'xgb_proba': xgb_pred_proba,
    'rf_proba': rf_pred_proba, 'et_proba': et_pred_proba, 'ensemble_proba': ensemble_proba,
    'knn_proba': knn_pred_proba, 'best_model_proba': best_pred_proba,
    'risk_tier': risk_tiers, 'test_weight': test_weights
})
results_df.to_csv('all_model_predictions.csv', index=False)
comparison.to_csv('model_comparison_final.csv', index=False)

# Feature importances
for model, name in [(lr_model, 'lr'), (xgb_model, 'xgb'), (rf_model, 'rf'), (et_model, 'et')]:
    if name == 'lr':
        imp = pd.DataFrame({'feature': X_train_fe.columns, 'coefficient': model.coef_[0],
                           'abs_coefficient': np.abs(model.coef_[0])}).sort_values('abs_coefficient', ascending=False)
    else:
        imp = pd.DataFrame({'feature': X_train_fe.columns, 
                           'importance': model.feature_importances_}).sort_values('importance', ascending=False)
    imp.to_csv(f'{name}_feature_importance.csv', index=False)

Data: 10188 train, 2548 test
Imbalance: 4.35:1 | Train asthma: 18.69% | Test asthma: 18.68%

Feature Engineering: 5 → 5 (0 domain + 0 polynomial)

Class Weighting: minority 2.68x, majority 0.61x

Training models...

MODEL COMPARISON (Selection: 0.6×AUC + 0.4×F1)
              Model      AUC  Accuracy  Precision   Recall       F1  Composite
Logistic Regression 0.630641  0.692863   0.265844 0.418342 0.325098   0.508424
           Ensemble 0.633196  0.340508   0.195668 0.877498 0.319984   0.507911
      Random Forest 0.630504  0.744298   0.299459 0.333050 0.315362   0.504448
         ExtraTrees 0.623395  0.735599   0.291379 0.345872 0.316296   0.500555
            XGBoost 0.628957  0.177623   0.176966 1.000000 0.300716   0.497660
                KNN 0.531881  0.818641   0.376014 0.038884 0.070480   0.347321

✓ Best: Logistic Regression | AUC: 0.6306 | F1: 0.3251 | Composite: 0.5084

RISK STRATIFICATION (Logistic Regression)
Thresholds: Low < 0.400 < Moderate < 0.487 < High

Low       :  4