In [None]:
import os
import pickle
import numpy as np
from sklearn.metrics import roc_curve, auc
import xgboost as xgb
import matplotlib.pyplot as plt
import ehr_utils  # 确保这个模块可用

# Load the raw data: the helper returns four objects, unpacked here as the
# train/test feature sets and their matching label sets.
# NOTE(review): how the split is made (ratio, seed, stratification) is decided
# inside ehr_utils and is not visible from this script -- confirm there.
baseline_csv_path = 'data_processed/benbu_baseline_cleaned_onehot.csv'
X_train, X_test, y_train, y_test = ehr_utils.preprocess_ehr_train_test_data(
    baseline_csv_path
)

# XGBoost hyper-parameters, later unpacked as keyword arguments into
# xgb.XGBClassifier.
# NOTE(review): the long decimal values look like the output of an automated
# hyper-parameter search -- provenance is not recorded here, TODO confirm.
xgb_params = {
    'colsample_bytree': 1.0,
    'device': 'gpu',  # asks XGBoost to run on a GPU device
    'eval_metric': "auc",
    'gamma': 0.0,
    'learning_rate': 0.022403069086742198,
    'max_depth': 5,
    'min_child_weight': 100,
    'n_estimators': 589,
    'n_jobs': -1,
    'random_state': 42,  # fixed seed so repeated runs are comparable
    'reg_alpha': 0.14314863930500873,
    'reg_lambda': 100.0,
    'subsample': 0.7300248552604385,
}

# Train the classifier on the original (un-augmented) features.
# fit() hands back the fitted estimator, so construction and training are
# chained into a single expression.
model = xgb.XGBClassifier(**xgb_params).fit(X_train, y_train)

# Evaluate each split in turn: predicted probability -> ROC points -> AUC.
# Column 1 of predict_proba is used as the score (presumably the positive
# class -- verify against model.classes_). The thresholds returned by
# roc_curve are not needed and are discarded into `_`.

# Training split
y_proba_train = model.predict_proba(X_train)[:, 1]
fpr_train, tpr_train, _ = roc_curve(y_train, y_proba_train)
auc_train = auc(fpr_train, tpr_train)

# Test split
y_proba_test = model.predict_proba(X_test)[:, 1]
fpr_test, tpr_test, _ = roc_curve(y_test, y_proba_test)
auc_test = auc(fpr_test, tpr_test)

# Plot both ROC curves on a single 8x6 figure using the Axes interface.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))

# Dashed blue = train, solid red = test, dashed gray diagonal = random baseline.
roc_ax.plot(
    fpr_train,
    tpr_train,
    linestyle='--',
    color='blue',
    label=f'Train AUC = {auc_train:.4f}',
)
roc_ax.plot(
    fpr_test,
    tpr_test,
    color='red',
    label=f'Test AUC = {auc_test:.4f}',
)
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random')

# Axis labelling, legend and a light dotted grid.
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (Original Features Only)')
roc_ax.legend()
roc_ax.grid(True, linestyle=':', alpha=0.7)

roc_fig.tight_layout()
plt.show()

# Final status message (runtime text intentionally left as-is).
print("✅ 仅使用原始特征的模型训练与评估完成！")