In [None]:
#定量数据的正态性检验

import pandas as pd
from scipy import stats
import numpy as np

# Load the dataset from the Excel workbook.
file_path = 'data.xlsx'
data = pd.read_excel(file_path)

# Continuous variables to screen for normality.
continuous_vars = ['Age', 'BMI', 'AIP', 'TyG', 'ePWV', 'SII', '24h-UP', 'eGFR']

# Significance level used for the Shapiro-Wilk decision.
alpha = 0.05
# Shapiro-Wilk is not defined for fewer than three observations.
min_shapiro_n = 3

result_columns = ['Variable', 'Shapiro-Wilk Statistic', 'p-value',
                  'Normal Distribution', 'Skewness', 'Kurtosis']

# Collect one record per variable and build the DataFrame once at the end,
# rather than enlarging it row by row with .loc.
records = []
for var in continuous_vars:
    # Drop missing values before testing.
    clean_data = data[var].dropna()

    # Guard: with too few observations the test cannot be run; record the
    # variable with NaN placeholders instead of aborting the whole loop.
    if len(clean_data) < min_shapiro_n:
        print(f"变量 {var} 的有效样本量为 {len(clean_data)}，无法进行Shapiro-Wilk检验")
        records.append({
            'Variable': var,
            'Shapiro-Wilk Statistic': np.nan,
            'p-value': np.nan,
            'Normal Distribution': np.nan,
            'Skewness': np.nan,
            'Kurtosis': np.nan,
        })
        continue

    # Shapiro-Wilk normality test.
    shapiro_stat, shapiro_p = stats.shapiro(clean_data)

    # Shape descriptors, computed with SciPy's default settings
    # (per the SciPy docs, kurtosis defaults to Fisher's definition,
    # i.e. excess kurtosis, which is 0 for a normal distribution).
    skewness = stats.skew(clean_data)
    kurtosis = stats.kurtosis(clean_data)

    # The variable is treated as normally distributed when p > alpha.
    is_normal = shapiro_p > alpha

    records.append({
        'Variable': var,
        'Shapiro-Wilk Statistic': shapiro_stat,
        # Kept as a 4-decimal string on purpose: this column is for display.
        'p-value': f"{shapiro_p:.4f}",
        'Normal Distribution': is_normal,
        'Skewness': skewness,
        'Kurtosis': kurtosis,
    })

# Summary table: one row per variable, columns in a fixed order.
results = pd.DataFrame(records, columns=result_columns)

# Show the summary table.
print("正态性检验及描述统计结果:")
print(results)

In [None]:
#方差齐性检验

import pandas as pd
from scipy.stats import levene, bartlett
import numpy as np

# 1. Load the dataset
file_path = 'data.xlsx'
data = pd.read_excel(file_path)

# 2. Continuous variables to test and the column that defines the groups
continuous_vars = ['Age', 'BMI', 'ePWV', 'AIP', 'TyG']
group_var = '1yearegfr'  # grouping variable

# Significance level shared by both tests (defined once, not per iteration).
alpha = 0.05

# 3. Quick look at the data before testing
print("数据前几行：")
print(data[continuous_vars + [group_var]].head())

print("\n分组情况：")
print(data[group_var].value_counts())

print("\n数据描述统计：")
print(data[continuous_vars].describe())

# 4. Homogeneity-of-variance tests, one variable at a time
print("\n方差齐性检验结果（按{}分组）：".format(group_var))

for var in continuous_vars:
    # Keep only rows where both the variable and the group label are present.
    df_clean = data[[var, group_var]].dropna()

    # Distinct group labels that remain after dropping missing rows.
    groups = df_clean[group_var].unique()

    # Both tests need at least two groups; check before slicing the data.
    if len(groups) < 2:
        print(f"\n变量 {var} 只有 {len(groups)} 个组，无法进行方差齐性检验")
        continue

    # One Series of observations per group, in the order of `groups`.
    group_data = [df_clean.loc[df_clean[group_var] == g, var] for g in groups]

    # Levene's test (more robust for non-normal data).
    stat_levene, p_levene = levene(*group_data)

    # Bartlett's test (assumes normally distributed data). Best effort:
    # a failure here is reported but must not stop the Levene output.
    try:
        stat_bartlett, p_bartlett = bartlett(*group_data)
    except Exception as e:
        stat_bartlett, p_bartlett = np.nan, np.nan
        print(f"对变量 {var} 进行Bartlett检验时出错: {str(e)}")

    # Report the test statistics.
    print(f"\n变量: {var}")
    print(f"组别: {groups}")
    print(f"Levene检验 - 统计量: {stat_levene:.4f}, p值: {p_levene:.4f}")
    # The Bartlett result was previously computed and then discarded;
    # report it whenever it is available.
    if not np.isnan(p_bartlett):
        print(f"Bartlett检验 - 统计量: {stat_bartlett:.4f}, p值: {p_bartlett:.4f}")

    # Interpretation at the chosen significance level.
    if p_levene > alpha:
        print(f"Levene检验: p值 > {alpha}，方差齐")
    else:
        print(f"Levene检验: p值 ≤ {alpha}，方差不齐")

    if not np.isnan(p_bartlett):
        if p_bartlett > alpha:
            print(f"Bartlett检验: p值 > {alpha}，方差齐")
        else:
            print(f"Bartlett检验: p值 ≤ {alpha}，方差不齐")
 

In [None]:
# 各种检验
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import mannwhitneyu
import researchpy as rp
import openpyxl

# Matplotlib font configuration: a CJK-capable sans-serif font for Chinese
# labels, and ASCII minus signs so negative tick labels render correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# 1. Load the dataset
data = pd.read_excel('data.xlsx')

# Fail early when the grouping column is absent; everything below needs it.
if '1yearegfr' not in data.columns:
    raise ValueError("分组变量 '1yearegfr' 不在数据中")

# 2. Variable lists by type
categorical_vars = [
    'Gender',
    'K-W_nodules',
    'Crescent-shaped_changes',
    'Capillary_microaneurysms',
    'Hyaline_droplets',
    'Cap_lesion',
    'Tubular_atrophy',
    'Interstitial_fibrosis',
    'Arteriolar_hyalinosis',
    'Mesangial_expansion',
]
# Continuous variables, split by the test family applied to them later.
continuous_vars = dict(
    parametric=['Age', 'BMI', 'AIP', 'TyG', 'ePWV'],
    nonparametric=['SII', 'eGFR', '24h-UP'],
)

# 3. Result container: one independent, initially empty list per output
#    column, in the column order of the final table.
results = {
    column: []
    for column in (
        'Variable',
        'Overall_Mean/Count',
        'Overall_%',
        'Group0_Mean/Count',
        'Group0_%',
        'Group1_Mean/Count',
        'Group1_%',
        'p_value',
        'Test',
    )
}

# 4. Split the rows by the value of the grouping column (0 and 1)
group0, group1 = (data[data['1yearegfr'] == level] for level in (0, 1))

# Categorical variables: chi-square test of each variable against the
# grouping column, plus per-level counts and percentages.
#
# Each category level is appended to `results` exactly once. (Previously the
# rows were appended twice per variable, duplicating every level in the
# exported table, and the chi-square test was run twice on the same table.)
for var in categorical_vars:
    # Contingency table: levels of `var` (rows) by group (columns).
    crosstab = pd.crosstab(data[var], data['1yearegfr'])

    # Chi-square test on the contingency table.
    chi2, p, dof, expected = stats.chi2_contingency(crosstab)

    # Percentage of each level among the non-missing values of `var`,
    # overall and within each group. Rounding is left to the format spec
    # below so all three columns are rounded the same way.
    perc_overall = data[var].value_counts(normalize=True) * 100
    perc_group0 = group0[var].value_counts(normalize=True) * 100
    perc_group1 = group1[var].value_counts(normalize=True) * 100

    for position, category in enumerate(crosstab.index):
        # The p-value and test name belong to the variable as a whole, so
        # they are shown only on the first level's row.
        is_first_row = position == 0

        results['Variable'].append(f"{var}_{category}")
        # NOTE(review): the overall count uses every row of `data`, while the
        # group counts come from the crosstab; they can differ if the
        # grouping column has missing values -- confirm this is intended.
        results['Overall_Mean/Count'].append(data[data[var] == category].shape[0])
        results['Overall_%'].append(f"{perc_overall.get(category, 0):.1f}%")
        results['Group0_Mean/Count'].append(crosstab.loc[category, 0])
        results['Group0_%'].append(f"{perc_group0.get(category, 0):.1f}%")
        results['Group1_Mean/Count'].append(crosstab.loc[category, 1])
        results['Group1_%'].append(f"{perc_group1.get(category, 0):.1f}%")
        results['p_value'].append(p if is_first_row else np.nan)
        results['Test'].append("Chi-square" if is_first_row else "")

# Parametric continuous variables: mean ± SD per column, compared between
# the two groups with an independent-samples t-test (SciPy defaults).
for var in continuous_vars['parametric']:
    # Test on the non-missing observations of each group.
    t, p = stats.ttest_ind(group0[var].dropna(), group1[var].dropna())

    # "mean ± SD" text for the whole sample and for each group.
    summary_overall, summary0, summary1 = (
        f"{frame[var].mean():.2f} ± {frame[var].std():.2f}"
        for frame in (data, group0, group1)
    )

    # One table row; the percentage columns do not apply to continuous data.
    row = {
        'Variable': var,
        'Overall_Mean/Count': summary_overall,
        'Overall_%': np.nan,
        'Group0_Mean/Count': summary0,
        'Group0_%': np.nan,
        'Group1_Mean/Count': summary1,
        'Group1_%': np.nan,
        'p_value': p,
        'Test': "Student's t-test",
    }
    for column, value in row.items():
        results[column].append(value)

# Non-parametric continuous variables: median (Q1–Q3) per column, compared
# between the two groups with the Mann-Whitney U test.
def _median_iqr(series):
    """Format a Series as 'median (Q1–Q3)' with two decimals."""
    middle = series.median()
    lower = series.quantile(0.25)
    upper = series.quantile(0.75)
    return f"{middle:.2f} ({lower:.2f}–{upper:.2f})"


for var in continuous_vars['nonparametric']:
    # Test on the non-missing observations of each group.
    u, p = mannwhitneyu(group0[var].dropna(), group1[var].dropna())

    # One table row; the percentage columns do not apply to continuous data.
    row = {
        'Variable': var,
        'Overall_Mean/Count': _median_iqr(data[var]),
        'Overall_%': np.nan,
        'Group0_Mean/Count': _median_iqr(group0[var]),
        'Group0_%': np.nan,
        'Group1_Mean/Count': _median_iqr(group1[var]),
        'Group1_%': np.nan,
        'p_value': p,
        'Test': "Mann-Whitney U",
    }
    for column, value in row.items():
        results[column].append(value)

# 5. Turn the collected column lists into a table
results_df = pd.DataFrame(results)

# 6. Put the sample size of each column group into the header.
#    Columns are renamed by position, so the order here must match `results`.
header = ['Variable']
for label, frame in (('Overall', data), ('1yearegfr=0', group0), ('1yearegfr=1', group1)):
    header.append(f'{label} (n={len(frame)})_Mean/Count')
    header.append(f'{label} (n={len(frame)})_%')
header.extend(['p_value', 'Test'])
results_df.columns = header

# Output directory (relative to the working directory); change as needed.
output_dir = 'results'

# Create the directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# Full path of the workbook to write.
output_path = os.path.join(output_dir, '1_baseline.xlsx')

# Write the table without the index column.
results_df.to_excel(output_path, index=False)

print(f"结果已保存至: {output_path}")

print("分析完成！结果已保存为Excel文件。")