# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, MultiTaskLassoCV, ElasticNet
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, StackingRegressor, VotingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             roc_auc_score, mean_squared_error, mean_absolute_error)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')


def _split_features_targets(data, transfusion_columns, label_column="是否输血"):
    """Log-transform the transfusion columns of *data* and split it three ways.

    Args:
        data: DataFrame holding the label column, the transfusion columns and
            the feature columns. The transfusion columns are overwritten in
            place with their ``log1p`` values.
        transfusion_columns: Names of the regression target columns.
        label_column: Name of the binary classification target column.

    Returns:
        ``(features, class_target, regression_targets)`` where ``features`` is
        *data* without the label and transfusion columns.
    """
    # log1p keeps zero amounts at zero.
    data[transfusion_columns] = np.log1p(data[transfusion_columns])
    features = data.drop(columns=[label_column] + transfusion_columns)
    return features, data[label_column], data[transfusion_columns]


def load_and_preprocess_data(train_path, external_path=None):
    """Load the training set and the optional external validation set.

    Both files are read with ``pd.read_excel`` and go through the same
    preprocessing, so the two code paths cannot drift apart.

    Args:
        train_path: Path of the training Excel file.
        external_path: Path of the external validation Excel file. When it is
            falsy (``None`` or empty) no external data is loaded.

    Returns:
        A 7-tuple ``(X_train, y_class_train, y_reg_train, X_external,
        y_class_external, y_reg_external, transfusion_columns)``. The three
        external entries are ``None`` when no external file was requested.
    """
    transfusion_columns = ["浓缩红细胞", "新鲜血浆", "单采血小板", "冷沉淀"]

    # Training set
    X_train, y_class_train, y_reg_train = _split_features_targets(
        pd.read_excel(train_path), transfusion_columns
    )

    # External validation set (optional)
    X_external, y_class_external, y_reg_external = None, None, None
    if external_path:
        X_external, y_class_external, y_reg_external = _split_features_targets(
            pd.read_excel(external_path), transfusion_columns
        )

    return (X_train, y_class_train, y_reg_train,
            X_external, y_class_external, y_reg_external,
            transfusion_columns)


def two_stage_feature_selection(X_train, y_class_train, y_reg_train):
    """Select features separately for the classification and regression stages.

    Stage 1 fits an L1-penalised logistic regression on all standardised
    training rows. Stage 2 fits a ``MultiTaskLassoCV`` on the standardised rows
    whose class label equals 1. In both stages features are kept through
    ``SelectFromModel`` with a threshold of ``1e-5``.

    Args:
        X_train: Feature DataFrame (its ``columns`` are used for reporting).
        y_class_train: Binary label Series sharing ``X_train``'s index.
        y_reg_train: Regression target DataFrame sharing ``X_train``'s index.

    Returns:
        ``(clf_selector, reg_selector, scaler, scaler_reg)``: the two selectors
        and the scaler fitted for each stage. Apply the matching scaler before
        calling a selector's ``transform``.
    """
    # Stage 1: classification features, scaled on every training row.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    # random_state pins the stochastic saga solver so the selected feature set
    # is reproducible, in line with the other seeded estimators in this file.
    lasso_clf = LogisticRegression(
        penalty='l1', solver='saga', max_iter=1000,
        class_weight='balanced', random_state=42,
    )
    lasso_clf.fit(X_train_scaled, y_class_train)
    # prefit=True declares that the estimator is already fitted, which is the
    # documented way to call get_support()/transform() without fitting the
    # selector itself.
    clf_selector = SelectFromModel(lasso_clf, threshold=1e-5, prefit=True)
    selected_clf_feats = X_train.columns[clf_selector.get_support()]

    # Stage 2: regression features, using only rows labelled 1, with their own
    # scaler.
    transfused_idx = y_class_train[y_class_train == 1].index
    X_reg_train_subset = X_train.loc[transfused_idx]
    scaler_reg = StandardScaler().fit(X_reg_train_subset)
    X_reg_scaled = scaler_reg.transform(X_reg_train_subset)

    mt_lasso = MultiTaskLassoCV(cv=5, max_iter=10000, random_state=42)
    mt_lasso.fit(X_reg_scaled, y_reg_train.loc[transfused_idx])
    reg_selector = SelectFromModel(mt_lasso, threshold=1e-5, prefit=True)
    selected_reg_feats = X_train.columns[reg_selector.get_support()]

    print(f"分类特征({len(selected_clf_feats)}): {selected_clf_feats.tolist()}")
    print(f"回归特征({len(selected_reg_feats)}): {selected_reg_feats.tolist()}")
    return clf_selector, reg_selector, scaler, scaler_reg

def build_hgbve_model():
    """Build the unfitted soft-voting classifier used for the transfusion label.

    Returns:
        A ``VotingClassifier`` that averages the predicted probabilities of a
        CatBoost member (weight 0.6) and an XGBoost member (weight 0.4).
    """
    catboost_member = CatBoostClassifier(
        iterations=1000,
        depth=8,
        learning_rate=0.015,
        l2_leaf_reg=5.0,
        random_seed=42,
        verbose=0,
    )
    xgb_member = XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.02,
        subsample=0.8,
        random_state=42,
        use_label_encoder=False,
    )
    members = [('catboost', catboost_member), ('xgb', xgb_member)]
    return VotingClassifier(estimators=members, voting='soft', weights=[0.6, 0.4])

def build_tsr_model():
    """Build the unfitted stacked regressor used for the transfusion amounts.

    Returns:
        A ``StackingRegressor`` whose base learners are a random forest and an
        XGBoost regressor, combined by an ElasticNet final estimator.

    NOTE(review): scikit-learn documents ``StackingRegressor.fit`` with a 1-D
    target; confirm that callers with several target columns fit one stack per
    column.
    """
    forest = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
    boosted = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.01, subsample=0.8)
    blender = ElasticNet(alpha=0.05, l1_ratio=0.7)
    base_learners = [('rf', forest), ('xgb', boosted)]
    return StackingRegressor(estimators=base_learners, final_estimator=blender)
def _take_rows(X, positions):
    """Return the rows of *X* at integer *positions* for arrays and DataFrames alike."""
    return X.iloc[positions] if hasattr(X, "iloc") else X[positions]


def dynamic_task_activation(class_proba, reg_model, X, threshold=0.5, n_outputs=None):
    """Run the regressor only on rows the classifier flags as positive.

    Args:
        class_proba: 1-D array-like of positive-class probabilities, one per
            row of *X*.
        reg_model: Fitted regressor exposing ``predict``.
        X: Feature matrix (NumPy array or DataFrame) given to *reg_model*.
        threshold: Rows with ``class_proba >= threshold`` are activated.
        n_outputs: Number of regression targets. When omitted it is taken from
            ``reg_model.n_outputs_`` if that attribute exists, otherwise it is
            inferred from the shape of the model's predictions.

    Returns:
        A float array of shape ``(X.shape[0], n_outputs)`` that holds the
        model's predictions on activated rows and zeros everywhere else.

    Raises:
        ValueError: If *X* has no rows and the number of outputs is unknown.
    """
    proba = np.asarray(class_proba).ravel()
    n_rows = X.shape[0]
    active = np.flatnonzero(proba >= threshold)

    if n_outputs is None:
        # Not every regressor defines n_outputs_, so treat it as optional.
        n_outputs = getattr(reg_model, "n_outputs_", None)

    active_pred = None
    if active.size > 0:
        active_pred = np.asarray(reg_model.predict(_take_rows(X, active)), dtype=float)
        # Single-target models return 1-D predictions; make them a column.
        active_pred = active_pred.reshape(active.size, -1)
        if n_outputs is None:
            n_outputs = active_pred.shape[1]
    elif n_outputs is None:
        if n_rows == 0:
            raise ValueError(
                "cannot infer the number of regression outputs from an empty X; "
                "pass n_outputs explicitly"
            )
        # Nothing was activated: probe one row only to learn the output width.
        probe = np.asarray(reg_model.predict(_take_rows(X, slice(0, 1))))
        n_outputs = probe.reshape(1, -1).shape[1]

    y_reg_pred = np.zeros((n_rows, n_outputs))
    if active_pred is not None:
        y_reg_pred[active] = active_pred
    return y_reg_pred

def evaluate_model(model_clf, model_reg, X, y_class, y_reg, clf_selector, reg_selector, scaler, scaler_reg, trans_cols):
    """Score the classifier and the gated regressor on one data set and print the result.

    Args:
        model_clf: Fitted classifier exposing ``predict_proba``.
        model_reg: Fitted regressor passed to ``dynamic_task_activation``.
        X: Raw (unscaled, unselected) feature matrix.
        y_class: True binary labels.
        y_reg: True regression targets as a DataFrame; column ``i`` is scored
            against prediction column ``i``.
        clf_selector, reg_selector: Feature selectors for the two stages.
        scaler, scaler_reg: Scalers applied before the matching selector.
        trans_cols: Names of the regression targets, in column order.

    Returns:
        Dict with ``accuracy``, ``f1``, ``roc_auc`` and ``reg_mse`` (a list
        with one mean squared error per entry of *trans_cols*).
    """
    X_clf = clf_selector.transform(scaler.transform(X))
    X_reg = reg_selector.transform(scaler_reg.transform(X))

    y_proba = model_clf.predict_proba(X_clf)[:, 1]
    y_class_pred = (y_proba >= 0.5).astype(int)
    y_reg_pred = dynamic_task_activation(y_proba, model_reg, X_reg)

    metrics = {
        "accuracy": accuracy_score(y_class, y_class_pred),
        "f1": f1_score(y_class, y_class_pred),
        "roc_auc": roc_auc_score(y_class, y_proba),
        # One entry per target name, so this list always matches the report
        # loop below instead of assuming exactly four targets.
        "reg_mse": [
            mean_squared_error(y_reg.iloc[:, i], y_reg_pred[:, i])
            for i, _ in enumerate(trans_cols)
        ],
    }

    # Print the results
    print("\n===== 验证结果 =====")
    print(f"分类准确率: {metrics['accuracy']:.4f}")
    print(f"F1分数: {metrics['f1']:.4f}")
    print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
    for col, mse in zip(trans_cols, metrics["reg_mse"]):
        print(f"{col} MSE: {mse:.4f}")

    return metrics


def main():
    """Train the two-stage transfusion models and report validation metrics.

    The internal validation fold is split off before any feature selection,
    scaling, oversampling or model fitting, so the internal metrics are
    computed on rows the pipeline has never seen. The external set, when
    available, is evaluated with the same fitted pipeline.
    """
    # Function-scope import: only this entry point needs the per-target wrapper.
    from sklearn.multioutput import MultiOutputRegressor

    # Load data (the external validation file is assumed to be
    # external_validation_data.xlsx)
    (X_all, yc_all, yr_all,
     X_external, yc_external, yr_external,
     trans_cols) = load_and_preprocess_data(
        train_path="complete_data.xlsx",
        external_path="external_validation_data.xlsx",  # external validation set path
    )

    # Hold out the internal validation fold first; fitting anything on the
    # full data would leak the test rows into training.
    X_train, X_test, yc_train, yc_test, yr_train, yr_test = train_test_split(
        X_all, yc_all, yr_all, test_size=0.2, stratify=yc_all, random_state=42
    )

    clf_selector, reg_selector, scaler, scaler_reg = two_stage_feature_selection(X_train, yc_train, yr_train)

    # Oversample the training fold only.
    X_train_clf = clf_selector.transform(scaler.transform(X_train))
    smote = SMOTE(random_state=42)
    X_train_clf_smote, yc_train_smote = smote.fit_resample(X_train_clf, yc_train)

    # Model training
    model_clf = build_hgbve_model()
    model_clf.fit(X_train_clf_smote, yc_train_smote)

    # StackingRegressor.fit accepts a single 1-D target, so fit one stack per
    # transfusion column.
    model_reg = MultiOutputRegressor(build_tsr_model())
    X_train_reg = reg_selector.transform(scaler_reg.transform(X_train))
    model_reg.fit(X_train_reg, yr_train)
    # MultiOutputRegressor does not define n_outputs_; downstream code reads
    # that attribute to size the prediction matrix, so record it here.
    model_reg.n_outputs_ = yr_train.shape[1]

    # Internal validation (held-out fold)
    print("\n【内部验证集评估】")
    evaluate_model(model_clf, model_reg, X_test, yc_test, yr_test, clf_selector, reg_selector, scaler, scaler_reg, trans_cols)

    # External validation (if provided)
    if X_external is not None:
        print("\n【外部验证集评估】")
        evaluate_model(model_clf, model_reg, X_external, yc_external, yr_external, clf_selector, reg_selector, scaler, scaler_reg, trans_cols)

if __name__ == "__main__":
    main()