# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve, average_precision_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectFromModel
import matplotlib as mpl
import warnings
import os
from sklearn.calibration import calibration_curve, CalibrationDisplay

# Output directory.
# Built with os.path.join so that the nested folder is created on every
# platform; a literal backslash is only a path separator on Windows.
output_dir = os.path.join('results', '3_LASSO')
os.makedirs(output_dir, exist_ok=True)

# Fonts and journal-style figure defaults.
# Several figures carry Chinese labels, so list CJK-capable fallbacks that are
# tried in order when the preceding font is not installed.
plt.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'DejaVu Sans',
]
# Render the minus sign with an ASCII hyphen so it is not dropped by CJK fonts.
plt.rcParams['axes.unicode_minus'] = False
# NOTE(review): this silences every warning for the rest of the script,
# including solver convergence warnings.
warnings.filterwarnings('ignore')

mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['font.size'] = 10
mpl.rcParams['legend.fontsize'] = 8
mpl.rcParams['axes.titlesize'] = 10
mpl.rcParams['axes.labelsize'] = 9
mpl.rcParams['xtick.labelsize'] = 8
mpl.rcParams['ytick.labelsize'] = 8
mpl.rcParams['figure.figsize'] = (6, 4)
mpl.rcParams['savefig.bbox'] = 'tight'
mpl.rcParams['savefig.pad_inches'] = 0.1
mpl.rcParams['savefig.directory'] = output_dir

# 1. Load the data
data = pd.read_excel('data.xlsx')

# 2. Data quality check
# Missing values are only reported here; they are not removed or imputed.
print("\n数据质量检查:")
print(f"总样本数: {len(data)}")
print(f"缺失值统计:\n{data.isnull().sum()}")

# 3. Outcome and predictor columns
y = data['1yearegfr']
categorical_vars = ['Crescent-shaped_changes', 'Interstitial_fibrosis', "K-W_nodules"]
continuous_vars = [ 'ePWV', 'SII', '24h-UP', 'eGFR']

# 4. Preprocessing
# Dummy columns are created as float so that X is a homogeneous numeric frame;
# mixing bool dummies with float columns makes X.values an object array.
# NOTE(review): get_dummies leaves numeric columns untouched by default, so
# integer-coded categories with more than two levels would not be one-hot
# encoded here - confirm how these three columns are coded in data.xlsx.
X_categorical = pd.get_dummies(data[categorical_vars], drop_first=True, dtype=float)
scaler = StandardScaler()
# Keep the original row index: pd.concat aligns on the index, so a fresh
# RangeIndex would misalign rows whenever data.index is not 0..n-1.
X_continuous = pd.DataFrame(
    scaler.fit_transform(data[continuous_vars]),
    columns=continuous_vars,
    index=data.index,
)
X = pd.concat([X_continuous, X_categorical], axis=1)

# 5. 计算VIF值
def calculate_vif(X):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Args:
        X: DataFrame of predictors. Every column must be convertible to float.

    Returns:
        DataFrame with one row per column of ``X`` and the columns
        ``feature`` (column name) and ``VIF``.
    """
    # Cast explicitly: a frame that mixes float and bool columns would
    # otherwise be handed over as an object-dtype array.
    predictors = X.to_numpy(dtype=float)
    # Prepend an intercept column. The auxiliary regressions are run on the
    # matrix exactly as passed, so without a constant the VIF of any
    # non-centred column (e.g. a dummy variable) is distorted.
    design = np.column_stack([np.ones(len(predictors)), predictors])
    # Position 0 is the intercept; report VIFs for the real predictors only.
    vif_values = [
        variance_inflation_factor(design, position)
        for position in range(1, design.shape[1])
    ]
    return pd.DataFrame({"feature": list(X.columns), "VIF": vif_values})

vif_results = calculate_vif(X)
print("\nVIF值计算结果:")
print(vif_results)

# 6. Split into training and test sets (80/20, stratified on the outcome)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# The continuous columns of X were standardised with the mean/SD of ALL rows,
# which lets test-set information leak into model fitting. Re-standardise
# them with statistics estimated on the training rows only. Standardising an
# already-standardised column is a composition of two affine maps, so the
# result equals scaling the raw values by the training mean/SD.
# The module-level `scaler` (fitted on all rows) is left untouched.
train_scaler = StandardScaler().fit(X_train[continuous_vars])
X_train = X_train.copy()
X_test = X_test.copy()
X_train[continuous_vars] = train_scaler.transform(X_train[continuous_vars])
X_test[continuous_vars] = train_scaler.transform(X_test[continuous_vars])

# 7. Fit the L1-penalised (lasso) logistic regression
# Candidate penalty strengths (alpha); the estimator takes the inverse, C = 1 / alpha.
alphas = np.logspace(-6, 6, 200)
Cs = 1.0 / alphas
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lasso_cv_options = dict(
    Cs=Cs,
    cv=cv,
    penalty='l1',
    solver='liblinear',
    scoring='neg_log_loss',  # C is chosen by cross-validated log loss
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
    n_jobs=-1,  # use all available cores
)
lasso_cv = LogisticRegressionCV(**lasso_cv_options).fit(X_train, y_train)

# 8. Report the selected hyper-parameter
best_C = lasso_cv.C_[0]
best_alpha = 1.0 / best_C
print(f"\n最佳C值: {best_C}")
print(f"最佳alpha值: {best_alpha}")

# The cross-validated estimator is used directly as the final model.
best_lasso = lasso_cv

# 9. Print the variables retained by the lasso (non-zero coefficients)
coefficients = best_lasso.coef_[0]
nonzero_mask = coefficients != 0
selected_features = X.columns[nonzero_mask]
print(f"\n最终入选的变量 (共{len(selected_features)}个):")
for rank, (feature, coef_value) in enumerate(
        zip(selected_features, coefficients[nonzero_mask]), 1):
    print(f"{rank}. {feature}: 系数 = {coef_value:.4f}, OR = {np.exp(coef_value):.4f}")

# 10. Evaluate on the held-out test set
y_pred_prob = best_lasso.predict_proba(X_test)[:, 1]  # probability of the second class
y_pred = best_lasso.predict(X_test)

auc = roc_auc_score(y_test, y_pred_prob)
ap = average_precision_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)

print(f"\n测试集AUC值: {auc:.3f}")
print(f"测试集平均精确率(AP): {ap:.3f}")
print("\n混淆矩阵:")
print(conf_matrix)
print("\n分类报告:")
print(pd.DataFrame(class_report).transpose())

# 11. Coefficient table, largest absolute effect first
coef_df = (
    pd.DataFrame({'Variable': X.columns, 'Coefficient': coefficients})
    .assign(
        Absolute_Coefficient=lambda frame: frame['Coefficient'].abs(),
        Odds_Ratio=lambda frame: np.exp(frame['Coefficient']),
    )
    .sort_values('Absolute_Coefficient', ascending=False)
)

# 14. Save all result tables to one Excel workbook
output_excel = os.path.join(output_dir, 'lasso_logistic_results.xlsx')

# Variables kept by the lasso (non-zero coefficient)
selected_coef_df = coef_df[coef_df['Coefficient'] != 0].sort_values('Absolute_Coefficient', ascending=False)

eval_df = pd.DataFrame({
    'Metric': ['Best Alpha', 'Best C', 'Test AUC', 'Average Precision'],
    'Value': [best_alpha, best_C, auc, ap]
})

conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
class_report_df = pd.DataFrame(class_report).transpose()

# (sheet name, table, write the index?) in the order the sheets appear
sheets = [
    ('VIF Results', vif_results, False),
    ('Coefficients', coef_df, False),
    ('Selected Variables', selected_coef_df, False),
    ('Model Evaluation', eval_df, False),
    ('Confusion Matrix', conf_matrix_df, True),
    ('Classification Report', class_report_df, True),
]
with pd.ExcelWriter(output_excel) as writer:
    for sheet_name, table, keep_index in sheets:
        table.to_excel(writer, sheet_name=sheet_name, index=keep_index)

# 15. Visualisation
# --------------------------------------------------
# Display names: underscores replaced by spaces, used for plotting only
plot_labels = dict(zip(X.columns, (name.replace('_', ' ') for name in X.columns)))
# --------------------------------------------------


def _l1_coefficients(alpha):
    """Fit one L1 logistic model on the training set at penalty ``alpha`` and return its coefficients."""
    estimator = LogisticRegression(
        C=1.0/alpha,
        penalty='l1',
        solver='liblinear',
        class_weight='balanced',
        max_iter=10000,
        random_state=42,
    )
    return estimator.fit(X_train, y_train).coef_[0]


# 15.1 Lasso coefficient path plot
plt.figure()
colors = plt.cm.rainbow(np.linspace(0, 1, len(X.columns)))

# One fit per penalty value, strongest penalty first; rows = alphas, columns = features
reversed_alphas = alphas[::-1]
coef_paths = np.array([_l1_coefficients(alpha) for alpha in reversed_alphas])

log_alphas = np.log10(reversed_alphas)
for position, (column, path, color) in enumerate(zip(X.columns, coef_paths.T, colors)):
    # Only the first ten features get a legend entry
    plt.plot(log_alphas, path, color=color,
             label=plot_labels[column] if position < 10 else None)

plt.axvline(np.log10(best_alpha), color='k', linestyle='--', label='Best alpha')
plt.xlabel('log10(alpha)')
plt.ylabel('Coefficients')
plt.title('Coefficient Path Plot for Lasso Logistic Regression')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'lasso_path_plot.png'))
plt.close()

# 15.2 Cross-validation plot
cv_scores = cross_val_score(
    best_lasso, X_train, y_train,
    cv=cv,
    scoring='neg_log_loss',   # same criterion as used for model selection
    n_jobs=-1
)

# The scorer returns the NEGATIVE log loss (higher is better). Flip the sign
# so the figure shows the ordinary log loss (lower is better) and label it
# accordingly.
fold_log_loss = -cv_scores
mean_log_loss = fold_log_loss.mean()
# Derive the fold numbers from the result instead of hard-coding five folds.
fold_numbers = range(1, len(fold_log_loss) + 1)

plt.figure()
plt.plot(fold_numbers, fold_log_loss, marker='o')
plt.axhline(mean_log_loss, color='r', linestyle='--',
            label=f'平均对数损失: {mean_log_loss:.3f}')
plt.xticks(list(fold_numbers))  # integer ticks, one per fold
plt.xlabel('交叉验证折数')
plt.ylabel('对数损失（越小越好）')
plt.title(f'{len(fold_log_loss)} 折交叉验证对数损失')
plt.legend()
plt.tight_layout()
# File name kept unchanged so existing references to the figure still resolve.
plt.savefig(os.path.join(output_dir, 'cross_validation_neglogloss.png'))
plt.close()

# 15.3 Coefficient bar plot
plt.figure()
# First 15 rows of the coefficient table, with display names (underscores
# replaced by spaces) substituted for the raw variable names
top_coefs = coef_df.iloc[:15].assign(
    Variable=lambda frame: frame['Variable'].map(plot_labels)
)

sns.barplot(data=top_coefs, x='Coefficient', y='Variable', palette='viridis')
plt.title('Bar Plot of Coefficients from Lasso Logistic Regression')
plt.xlabel('Coefficients')
plt.ylabel('Features')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'coefficients_table.png'))
plt.close()

def _finish_figure(xlabel, ylabel, title, filename, **legend_options):
    """Label the current figure, add its legend, save it under ``output_dir`` and close it."""
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(**legend_options)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, filename))
    plt.close()


# 15.5 ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
plt.figure()
plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
_finish_figure('假阳性率', '真阳性率', 'ROC曲线', 'roc_curve.png', loc='lower right')

# 15.6 Precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
plt.figure()
plt.plot(recall, precision, label=f'AP = {ap:.3f}')
_finish_figure('召回率', '精确率', '精确率-召回率曲线', 'precision_recall_curve.png', loc='upper right')

# 15.7 Calibration curve
prob_true, prob_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)
plt.figure()
plt.plot(prob_pred, prob_true, 's-', label='模型')
plt.plot([0, 1], [0, 1], 'k--', label='完美校准')  # diagonal reference line
_finish_figure('预测概率', '实际概率', '校准曲线', 'calibration_curve.png')

# 15.9 Feature correlation heat map (training set)
corr_matrix = X_train.corr()
# Hide the diagonal and the upper triangle, leaving only the lower triangle visible
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure()
sns.heatmap(
    corr_matrix,
    mask=mask,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
plt.title('特征相关性热图')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'feature_correlation.png'))
plt.close()

print(f"\n分析完成! 所有结果已保存到指定目录: {output_dir}")