# In [None]:
# 多任务学习(MTL)框架：输血需求分类与血液成分剂量预测
# 实现两阶段层次特征选择+混合梯度提升分类器+两阶段堆叠回归器
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, MultiTaskLassoCV, ElasticNet
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, StackingRegressor, VotingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             roc_auc_score, mean_squared_error, mean_absolute_error)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier
import optuna  # 贝叶斯优化库
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_data(file_path):
    """Read the Excel sheet and split it into features and the two targets.

    The two transfusion-dose columns are replaced by ``log1p`` of their values
    (``log1p`` is defined at zero, so zero doses stay representable).

    Args:
        file_path: Path handed to ``pandas.read_excel``.

    Returns:
        Tuple ``(X, y_class, y_reg, transfusion_columns)``: the frame without
        the label and dose columns, the ``"是否输血"`` label column, the
        log-transformed dose columns, and the list of dose column names.
    """
    label_column = "是否输血"
    transfusion_columns = ["浓缩红细胞", "新鲜血浆"]

    raw = pd.read_excel(file_path)
    # Replace each dose column with its log1p transform.
    data = raw.assign(**{name: np.log1p(raw[name]) for name in transfusion_columns})

    y_class = data[label_column]
    y_reg = data[transfusion_columns]
    X = data.drop(columns=[label_column] + transfusion_columns)

    return X, y_class, y_reg, transfusion_columns

def two_stage_feature_selection(X_train, y_class_train, y_reg_train):
    """Fit the classification and regression feature selectors.

    Stage 1 standardises all training rows and fits an L1-penalised logistic
    regression on the class labels. Stage 2 keeps only the rows whose class
    label equals 1, standardises them with a separate scaler and fits
    ``MultiTaskLassoCV`` on the regression targets. Each fitted model is
    wrapped in a ``SelectFromModel`` with an importance threshold of ``1e-5``.

    Args:
        X_train: Training features as a DataFrame (``.columns`` is read to
            report the selected feature names).
        y_class_train: Class labels, row-aligned with ``X_train``.
        y_reg_train: Regression targets as a DataFrame, row-aligned with
            ``X_train``.

    Returns:
        Tuple ``(clf_selector, reg_selector, scaler, scaler_reg)``. Both
        selectors expect input that was already transformed by the matching
        scaler.

    Raises:
        ValueError: If no training row has class label 1.
    """
    # Standardise all rows for the classification stage.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    # Stage 1: L1-regularised logistic regression for the classification task.
    lasso_clf = LogisticRegression(penalty='l1', solver='saga', max_iter=1000, class_weight='balanced')
    lasso_clf.fit(X_train_scaled, y_class_train)
    # prefit=True: the estimator is already fitted and the selector itself is
    # never fitted, so transform()/get_support() must read the given model.
    clf_selector = SelectFromModel(lasso_clf, threshold=1e-5, prefit=True)
    selected_clf_feats = X_train.columns[clf_selector.get_support()]

    # Stage 2: multi-task lasso on the rows labelled 1 only. A positional
    # boolean mask is used so duplicated index labels cannot duplicate rows.
    transfused_mask = np.asarray(y_class_train == 1)
    if not transfused_mask.any():
        raise ValueError("No training sample has class label 1; cannot fit the regression selector.")
    X_reg_train_subset = X_train.loc[transfused_mask]
    scaler_reg = StandardScaler().fit(X_reg_train_subset)
    X_reg_scaled = scaler_reg.transform(X_reg_train_subset)

    mt_lasso = MultiTaskLassoCV(cv=5, max_iter=10000, random_state=42)
    mt_lasso.fit(X_reg_scaled, y_reg_train.loc[transfused_mask])
    reg_selector = SelectFromModel(mt_lasso, threshold=1e-5, prefit=True)
    selected_reg_feats = X_train.columns[reg_selector.get_support()]

    print(f"分类特征({len(selected_clf_feats)}): {selected_clf_feats.tolist()}")
    print(f"回归特征({len(selected_reg_feats)}): {selected_reg_feats.tolist()}")
    return clf_selector, reg_selector, scaler, scaler_reg

def build_hgbve_model():
    """Build the soft-voting ensemble of a CatBoost and an XGBoost classifier.

    Returns:
        An unfitted ``VotingClassifier`` that averages predicted probabilities
        with fixed weights 0.6 (CatBoost) and 0.4 (XGBoost).
    """
    members = [
        (
            'catboost',
            CatBoostClassifier(
                iterations=1000,
                depth=8,
                learning_rate=0.015,
                l2_leaf_reg=5.0,
                random_seed=42,
                verbose=0,
            ),
        ),
        (
            'xgb',
            XGBClassifier(
                n_estimators=500,
                max_depth=6,
                learning_rate=0.02,
                subsample=0.8,
                random_state=42,
                use_label_encoder=False,
            ),
        ),
    ]
    # Soft voting: weighted average of the members' class probabilities.
    member_weights = [0.6, 0.4]
    return VotingClassifier(estimators=members, voting='soft', weights=member_weights)

def build_tsr_model():
    """Build the stacked regressor: random forest + XGBoost under an ElasticNet.

    Returns:
        An unfitted ``StackingRegressor`` whose base learners are a random
        forest and an XGBoost regressor and whose final estimator is an
        ``ElasticNet``.
    """
    forest = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
    booster = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.01, subsample=0.8)
    # ElasticNet acts as the meta-learner over the base predictions.
    meta_learner = ElasticNet(alpha=0.05, l1_ratio=0.7)
    return StackingRegressor(
        estimators=[('rf', forest), ('xgb', booster)],
        final_estimator=meta_learner,
    )

def dynamic_task_activation(class_proba, reg_model, X, threshold=0.5):
    """Run the regressor only for rows whose class probability passes the gate.

    Rows with ``class_proba >= threshold`` receive the regressor's prediction;
    all other rows are left at zero.

    Args:
        class_proba: 1-D sequence of positive-class probabilities, one per
            row of ``X``.
        reg_model: Fitted regressor exposing ``predict``. ``n_outputs_`` is
            used when present; otherwise the output width is inferred from a
            prediction.
        X: Feature matrix supporting ``X.shape`` and integer-array row
            indexing (e.g. a NumPy array).
        threshold: Minimum probability that activates the regression task.

    Returns:
        Float array of shape ``(n_samples, n_outputs)``.
    """
    n_samples = X.shape[0]
    active_rows = np.flatnonzero(np.asarray(class_proba) >= threshold)

    active_pred = None
    if active_rows.size:
        # Force 2-D so single-output regressors returning 1-D arrays can be
        # scattered into the (n_samples, n_outputs) result.
        active_pred = np.asarray(reg_model.predict(X[active_rows]), dtype=float)
        active_pred = active_pred.reshape(active_rows.size, -1)

    # Not every regressor exposes n_outputs_; fall back to the width of an
    # actual prediction when it is missing.
    n_outputs = getattr(reg_model, "n_outputs_", None)
    if n_outputs is None:
        if active_pred is not None:
            n_outputs = active_pred.shape[1]
        elif n_samples:
            # No row is active: probe one row just to learn the width.
            n_outputs = np.asarray(reg_model.predict(X[:1])).reshape(1, -1).shape[1]
        else:
            n_outputs = 1

    y_reg_pred = np.zeros((n_samples, n_outputs))
    if active_pred is not None:
        y_reg_pred[active_rows] = active_pred
    return y_reg_pred

def joint_loss(y_class_true, y_class_pred, y_reg_true, y_reg_pred, gamma=2.0, w=(1, 1, 0.5, 0.5)):
    """Combine the classification and regression errors into one scalar.

    Computes ``gamma * (1 - F1) + sum_i w[i] * MSE_i`` over the regression
    target columns. No regularisation term is added here.

    Args:
        y_class_true: True class labels.
        y_class_pred: Predicted class labels.
        y_reg_true: True regression targets, shape ``(n_samples, n_targets)``
            (a 1-D input is treated as a single target).
        y_reg_pred: Predicted regression targets, same shape as ``y_reg_true``.
        gamma: Weight of the classification term.
        w: Per-target weights for the regression term. Targets are paired with
            weights in order; a target column without a matching weight does
            not contribute.

    Returns:
        The weighted joint loss as a float.

    Raises:
        ValueError: If the two regression arrays differ in shape.
    """
    reg_true = np.asarray(y_reg_true)
    reg_pred = np.asarray(y_reg_pred)
    if reg_true.ndim == 1:
        reg_true = reg_true.reshape(-1, 1)
    if reg_pred.ndim == 1:
        reg_pred = reg_pred.reshape(-1, 1)
    if reg_true.shape != reg_pred.shape:
        raise ValueError(
            f"y_reg_true and y_reg_pred must have the same shape, got {reg_true.shape} and {reg_pred.shape}"
        )

    # Classification term: one minus the F1 score.
    class_loss = 1 - f1_score(y_class_true, y_class_pred)
    # Iterate over the columns that actually exist instead of assuming four
    # targets, which raised IndexError for the two-target pipeline.
    reg_loss = sum(
        weight * mean_squared_error(reg_true[:, col], reg_pred[:, col])
        for col, weight in zip(range(reg_true.shape[1]), w)
    )
    return gamma * class_loss + reg_loss


def main():
    """Train and evaluate the classifier + regressor pipeline end to end.

    Loads ``complete_data.xlsx``, makes a stratified 80/20 split, fits the
    feature selectors, oversamples the classification training set with SMOTE,
    trains the voting classifier and the stacked regressor, then prints the
    classification metrics, the per-target MSE and the joint loss on the
    held-out split.
    """
    # Local import: only this entry point needs the multi-output wrapper.
    from sklearn.multioutput import MultiOutputRegressor

    # Load the data.
    X, y_class, y_reg, trans_cols = load_and_preprocess_data("complete_data.xlsx")

    # Stratified 80/20 train/test split.
    X_train, X_test, yc_train, yc_test, yr_train, yr_test = train_test_split(
        X, y_class, y_reg, test_size=0.2, stratify=y_class, random_state=42
    )

    # Feature selection.
    clf_selector, reg_selector, scaler, scaler_reg = two_stage_feature_selection(X_train, yc_train, yr_train)

    # Oversample the classification training set with SMOTE.
    X_train_clf = clf_selector.transform(scaler.transform(X_train))
    smote = SMOTE(random_state=42)
    X_train_clf_smote, yc_train_smote = smote.fit_resample(X_train_clf, yc_train)

    # Train the classifier.
    hgbve = build_hgbve_model()
    hgbve.fit(X_train_clf_smote, yc_train_smote)

    # StackingRegressor.fit accepts only a 1-D target, while yr_train holds
    # one column per blood component; fit one stacked regressor per column.
    tsr = MultiOutputRegressor(build_tsr_model())
    X_reg_train = reg_selector.transform(scaler_reg.transform(X_train))
    tsr.fit(X_reg_train, yr_train)

    # Evaluate on the held-out split.
    X_test_clf = clf_selector.transform(scaler.transform(X_test))
    X_test_reg = reg_selector.transform(scaler_reg.transform(X_test))
    yc_proba = hgbve.predict_proba(X_test_clf)[:, 1]
    yc_pred = (yc_proba >= 0.5).astype(int)
    yr_pred = dynamic_task_activation(yc_proba, tsr, X_test_reg)

    # Report the results.
    print("\n分类任务指标:")
    print(f"准确率: {accuracy_score(yc_test, yc_pred):.4f}")
    print(f"F1分数: {f1_score(yc_test, yc_pred):.4f}")
    print(f"ROC-AUC: {roc_auc_score(yc_test, yc_proba):.4f}")

    print("\n回归任务指标:")
    for i, col in enumerate(trans_cols):
        print(f"{col} MSE: {mean_squared_error(yr_test.iloc[:,i], yr_pred[:,i]):.4f}")

    print(f"\n联合损失: {joint_loss(yc_test, yc_pred, yr_test.values, yr_pred):.4f}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

# In [None]:
# 多任务学习框架对比实验：RF/XGBoost/MLP/BP算法性能评估
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, MultiTaskLassoCV, ElasticNet
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, StackingRegressor, VotingClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor  # 新增MLP/BP神经网络
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             roc_auc_score, mean_squared_error, mean_absolute_error)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_data(file_path):
    """Read the Excel sheet and split it into features and the two targets.

    The two transfusion-dose columns are replaced by ``log1p`` of their values.

    Args:
        file_path: Path handed to ``pandas.read_excel``.

    Returns:
        Tuple ``(X, y_class, y_reg, transfusion_columns)``: the frame without
        the label and dose columns, the ``"是否输血"`` label column, the
        log-transformed dose columns, and the list of dose column names.
    """
    label_column = "是否输血"
    transfusion_columns = ["浓缩红细胞", "新鲜血浆"]

    raw = pd.read_excel(file_path)
    # Replace each dose column with its log1p transform.
    data = raw.assign(**{name: np.log1p(raw[name]) for name in transfusion_columns})

    y_class = data[label_column]
    y_reg = data[transfusion_columns]
    X = data.drop(columns=[label_column] + transfusion_columns)

    return X, y_class, y_reg, transfusion_columns

def two_stage_feature_selection(X_train, y_class_train, y_reg_train):
    """Fit the classification and regression feature selectors.

    Stage 1 standardises all training rows and fits an L1-penalised logistic
    regression on the class labels. Stage 2 keeps only the rows whose class
    label equals 1, standardises them with a separate scaler and fits
    ``MultiTaskLassoCV`` on the regression targets. Each fitted model is
    wrapped in a ``SelectFromModel`` with an importance threshold of ``1e-5``.

    Args:
        X_train: Training features as a DataFrame (``.columns`` is read to
            report the selected feature names).
        y_class_train: Class labels, row-aligned with ``X_train``.
        y_reg_train: Regression targets as a DataFrame, row-aligned with
            ``X_train``.

    Returns:
        Tuple ``(clf_selector, reg_selector, scaler, scaler_reg)``. Both
        selectors expect input that was already transformed by the matching
        scaler.

    Raises:
        ValueError: If no training row has class label 1.
    """
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    lasso_clf = LogisticRegression(penalty='l1', solver='saga', max_iter=1000, class_weight='balanced')
    lasso_clf.fit(X_train_scaled, y_class_train)
    # prefit=True: the estimator is already fitted and the selector itself is
    # never fitted, so transform()/get_support() must read the given model.
    clf_selector = SelectFromModel(lasso_clf, threshold=1e-5, prefit=True)
    selected_clf_feats = X_train.columns[clf_selector.get_support()]

    # Positional boolean mask of the rows labelled 1, so duplicated index
    # labels cannot duplicate rows.
    transfused_mask = np.asarray(y_class_train == 1)
    if not transfused_mask.any():
        raise ValueError("No training sample has class label 1; cannot fit the regression selector.")
    X_reg_train_subset = X_train.loc[transfused_mask]
    scaler_reg = StandardScaler().fit(X_reg_train_subset)
    X_reg_scaled = scaler_reg.transform(X_reg_train_subset)

    mt_lasso = MultiTaskLassoCV(cv=5, max_iter=10000, random_state=42)
    mt_lasso.fit(X_reg_scaled, y_reg_train.loc[transfused_mask])
    reg_selector = SelectFromModel(mt_lasso, threshold=1e-5, prefit=True)
    selected_reg_feats = X_train.columns[reg_selector.get_support()]

    print(f"分类特征({len(selected_clf_feats)}): {selected_clf_feats.tolist()}")
    print(f"回归特征({len(selected_reg_feats)}): {selected_reg_feats.tolist()}")
    return clf_selector, reg_selector, scaler, scaler_reg

def dynamic_task_activation(class_proba, reg_model, X, threshold=0.5):
    """Run the regressor only for rows whose class probability passes the gate.

    Rows with ``class_proba >= threshold`` receive the regressor's prediction;
    all other rows are left at zero.

    Args:
        class_proba: 1-D sequence of positive-class probabilities, one per
            row of ``X``.
        reg_model: Fitted regressor exposing ``predict``. ``n_outputs_`` is
            used when present; otherwise the output width is inferred from a
            prediction.
        X: Feature matrix supporting ``X.shape`` and integer-array row
            indexing (e.g. a NumPy array).
        threshold: Minimum probability that activates the regression task.

    Returns:
        Float array of shape ``(n_samples, n_outputs)``.
    """
    n_samples = X.shape[0]
    active_rows = np.flatnonzero(np.asarray(class_proba) >= threshold)

    active_pred = None
    if active_rows.size:
        # Force 2-D so single-output regressors returning 1-D arrays can be
        # scattered into the (n_samples, n_outputs) result.
        active_pred = np.asarray(reg_model.predict(X[active_rows]), dtype=float)
        active_pred = active_pred.reshape(active_rows.size, -1)

    # Not every regressor exposes n_outputs_; fall back to the width of an
    # actual prediction when it is missing.
    n_outputs = getattr(reg_model, "n_outputs_", None)
    if n_outputs is None:
        if active_pred is not None:
            n_outputs = active_pred.shape[1]
        elif n_samples:
            # No row is active: probe one row just to learn the width.
            n_outputs = np.asarray(reg_model.predict(X[:1])).reshape(1, -1).shape[1]
        else:
            n_outputs = 1

    y_reg_pred = np.zeros((n_samples, n_outputs))
    if active_pred is not None:
        y_reg_pred[active_rows] = active_pred
    return y_reg_pred

def get_comparison_models():
    """Build the baseline classifier/regressor pairs used for comparison.

    Returns:
        Dict keyed by model name (``"RF"``, ``"XGBoost"``, ``"MLP"``,
        ``"BP"``); each value is a dict with an unfitted ``"classifier"`` and
        an unfitted ``"regressor"`` that share the same hyper-parameters.
    """
    forest_params = dict(n_estimators=100, max_depth=7, random_state=42)
    boosting_params = dict(n_estimators=300, max_depth=6, learning_rate=0.01, random_state=42)
    # "MLP" is the two-hidden-layer network, "BP" the single-hidden-layer one.
    deep_net_params = dict(hidden_layer_sizes=(128, 64), max_iter=1000, learning_rate_init=0.001, random_state=42)
    shallow_net_params = dict(hidden_layer_sizes=(32,), max_iter=1000, learning_rate_init=0.01, random_state=42)

    families = (
        ("RF", RandomForestClassifier, RandomForestRegressor, forest_params),
        ("XGBoost", XGBClassifier, XGBRegressor, boosting_params),
        ("MLP", MLPClassifier, MLPRegressor, deep_net_params),
        ("BP", MLPClassifier, MLPRegressor, shallow_net_params),
    )
    return {
        label: {"classifier": clf_cls(**params), "regressor": reg_cls(**params)}
        for label, clf_cls, reg_cls, params in families
    }


def _fit_and_score_pipeline(name, clf, reg, X_train_clf, yc_train_clf, X_test_clf, yc_test,
                            X_train_reg, yr_train, X_test_reg, yr_test):
    """Fit one classifier/regressor pair and return its row of test metrics."""
    clf.fit(X_train_clf, yc_train_clf)
    reg.fit(X_train_reg, yr_train)

    yc_proba = clf.predict_proba(X_test_clf)[:, 1]
    yc_pred = (yc_proba >= 0.5).astype(int)
    # Regression output is kept only for rows the classifier activates.
    yr_pred = dynamic_task_activation(yc_proba, reg, X_test_reg)

    return {
        "model": name,
        "acc": accuracy_score(yc_test, yc_pred),
        "f1": f1_score(yc_test, yc_pred),
        "auc": roc_auc_score(yc_test, yc_proba),
        "mse_rbc": mean_squared_error(yr_test.iloc[:, 0], yr_pred[:, 0]),
        "mse_plasma": mean_squared_error(yr_test.iloc[:, 1], yr_pred[:, 1]),
    }


def evaluate_comparison_models(models, X_train, X_test, yc_train, yc_test, yr_train, yr_test,
                              clf_selector, reg_selector, scaler, scaler_reg, trans_cols):
    """Evaluate the HGBVE+TSR pipeline and every baseline on the same split.

    The classification training set is scaled, reduced with ``clf_selector``
    and oversampled with SMOTE; the regression sets are scaled with
    ``scaler_reg`` and reduced with ``reg_selector``. Each pipeline is fitted
    and scored with accuracy, F1, ROC-AUC and per-target MSE (target columns
    0 and 1 of ``yr_test``).

    Args:
        models: Mapping ``name -> {"classifier": ..., "regressor": ...}`` of
            unfitted baseline estimators.
        X_train, X_test: Raw feature frames for the two splits.
        yc_train, yc_test: Class labels for the two splits.
        yr_train, yr_test: Regression target frames for the two splits.
        clf_selector, reg_selector: Feature selectors for each task.
        scaler, scaler_reg: Scalers matching the two selectors.
        trans_cols: Names of the regression target columns (currently unused;
            kept for interface compatibility).

    Returns:
        DataFrame with one row per pipeline, rounded to four decimals.
    """
    # Local import: only the stacked regressor needs the multi-output wrapper.
    from sklearn.multioutput import MultiOutputRegressor

    X_train_clf = clf_selector.transform(scaler.transform(X_train))
    X_test_clf = clf_selector.transform(scaler.transform(X_test))
    smote = SMOTE(random_state=42)
    X_train_clf_smote, yc_train_smote = smote.fit_resample(X_train_clf, yc_train)

    X_train_reg = reg_selector.transform(scaler_reg.transform(X_train))
    X_test_reg = reg_selector.transform(scaler_reg.transform(X_test))

    print("\n===== 原模型（HGBVE+TSR）评估 =====")
    hgbve = VotingClassifier(
        estimators=[
            ('catboost', CatBoostClassifier(iterations=1000, depth=8, learning_rate=0.015, random_seed=42, verbose=0)),
            ('xgb', XGBClassifier(n_estimators=500, max_depth=6, random_state=42))
        ],
        voting='soft', weights=[0.6, 0.4]
    )
    # StackingRegressor.fit accepts only a 1-D target, while yr_train holds
    # one column per blood component; fit one stacked regressor per column.
    tsr = MultiOutputRegressor(StackingRegressor(
        estimators=[
            ('rf', RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)),
            ('xgb', XGBRegressor(n_estimators=300, max_depth=6, random_state=42))
        ],
        final_estimator=ElasticNet(alpha=0.05, l1_ratio=0.7)
    ))
    results = [
        _fit_and_score_pipeline(
            "HGBVE+TSR", hgbve, tsr,
            X_train_clf_smote, yc_train_smote, X_test_clf, yc_test,
            X_train_reg, yr_train, X_test_reg, yr_test,
        )
    ]

    for name, model in models.items():
        print(f"\n===== {name}模型评估 =====")
        results.append(
            _fit_and_score_pipeline(
                name, model["classifier"], model["regressor"],
                X_train_clf_smote, yc_train_smote, X_test_clf, yc_test,
                X_train_reg, yr_train, X_test_reg, yr_test,
            )
        )

    # Return the comparison table.
    print("\n===== 算法性能对比 =====")
    return pd.DataFrame(results).round(4)

def main():
    """Run the comparison experiment and print the metrics table.

    Loads ``complete_data.xlsx``, makes a stratified 80/20 split, fits the
    feature selectors, evaluates every pipeline and prints the resulting
    DataFrame without its index.
    """
    features, labels, doses, trans_cols = load_and_preprocess_data("complete_data.xlsx")

    split = train_test_split(
        features, labels, doses, test_size=0.2, stratify=labels, random_state=42
    )
    X_train, X_test, yc_train, yc_test, yr_train, yr_test = split

    selectors = two_stage_feature_selection(X_train, yc_train, yr_train)
    clf_selector, reg_selector, scaler, scaler_reg = selectors

    performance_df = evaluate_comparison_models(
        get_comparison_models(),
        X_train, X_test,
        yc_train, yc_test,
        yr_train, yr_test,
        clf_selector, reg_selector,
        scaler, scaler_reg,
        trans_cols,
    )
    print(performance_df.to_string(index=False))

# Run the comparison experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()