In [26]:
# 文件名: 1_preprocess_data.py
import os
import re

import numpy as np
import pandas as pd

def preprocess_core_final(male_raw_df, female_raw_df):
    """对男胎和女胎数据执行最终的核心预处理操作。"""
    processed_dfs = {}
    for name, df in [('男胎', male_raw_df.copy()), ('女胎', female_raw_df.copy())]:
        df.drop(columns=['序号', '末次月经'], inplace=True, errors='ignore')
        df['检测日期'] = pd.to_datetime(df['检测日期'], format='%Y%m%d')
        group_keys = ['孕妇代码', '检测抽血次数']
        df['检测抽血次数'] = pd.to_numeric(df['检测抽血次数'], errors='coerce').astype('Int64')
        df.dropna(subset=group_keys, inplace=True)
        
        cols_to_average = [
            '身高', '体重', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', 
            '唯一比对的读段数', 'GC含量', '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 
            'X染色体的Z值', 'Y染色体的Z值', 'Y染色体浓度', 'X染色体浓度', '13号染色体的GC含量', 
            '18号染色体的GC含量', '21号染色体的GC含量', '被过滤掉读段数的比例'
        ]
        agg_dict = {col: 'first' for col in df.columns if col not in group_keys}
        for col in cols_to_average:
            if col in agg_dict:
                agg_dict[col] = 'mean'
        if '染色体的非整倍体' in df.columns:
             agg_dict['染色体的非整倍体'] = lambda s: s.dropna().unique()[0] if not s.dropna().empty else np.nan
        df = df.groupby(group_keys, as_index=False).agg(agg_dict)
        
        df.sort_values(by=['孕妇代码', '检测日期'], inplace=True)
        
        # 修复孕周提取逻辑 - 支持两种格式: 数字w+数字 和 数字w
        def extract_pregnancy_weeks(week_str):
            if pd.isna(week_str):
                return np.nan
            week_str = str(week_str)
            # 匹配 数字w+数字 格式 (如 22w+6)
            match1 = pd.Series(week_str).str.extract(r'(\d+)w\+(\d+)')
            if not match1.isna().any().any():
                return float(match1.iloc[0, 0]) + float(match1.iloc[0, 1]) / 7
            
            # 匹配 数字w 格式 (如 13w)
            match2 = pd.Series(week_str).str.extract(r'(\d+)w')
            if not match2.isna().any().any():
                return float(match2.iloc[0, 0])
            
            return np.nan
        
        df['孕周'] = df['检测孕周'].apply(extract_pregnancy_weeks)
        
        df['怀孕次数'] = df['怀孕次数'].replace({'>=3': '≥3'}).astype(str)
        dummies_preg = pd.get_dummies(df['怀孕次数'], prefix='怀孕次数')
        df = pd.concat([df, dummies_preg], axis=1)
        
        df.rename(columns={'IVF妊娠': '妊娠方式'}, inplace=True)
        dummies_ivf = pd.get_dummies(df['妊娠方式'], prefix='妊娠方式')
        df = pd.concat([df, dummies_ivf], axis=1)

        if '胎儿是否健康' in df.columns:
            df['胎儿是否健康'] = df['胎儿是否健康'].astype(str).map({'是': 1, '否': 0, '1.0':1, '0.0':0}).astype('Int64')
        
        df['计算序次'] = df.groupby('孕妇代码').cumcount() + 1
        df['检测抽血次数'] = df['检测抽血次数'].astype(int)
        if not df['检测抽血次数'].equals(df['计算序次']):
            df.drop(columns=['检测抽血次数'], inplace=True)
            df.rename(columns={'计算序次': '检测抽血次数'}, inplace=True)
        else:
            df.drop(columns=['计算序次'], inplace=True)

        df['距首次检测天数'] = (df['检测日期'] - df.groupby('孕妇代码')['检测日期'].transform('min')).dt.days

        if name == '男胎':
            df['Y浓度是否达标'] = (df['Y染色体浓度'] >= 0.04).astype(int)
        if name == '女胎':
            df['染色体的非整倍体'] = df['染色体的非整倍体'].fillna('正常')
            df['is_T13'] = df['染色体的非整倍体'].str.contains('13').astype(int)
            df['is_T18'] = df['染色体的非整倍体'].str.contains('18').astype(int)
            df['is_T21'] = df['染色体的非整倍体'].str.contains('21').astype(int)

        cols_to_round = df.select_dtypes(include='float64').columns
        df[cols_to_round] = df[cols_to_round].round(2)

        base_cols = ['孕妇代码', '检测抽血次数', '年龄', '身高', '体重', '孕妇BMI', '怀孕次数_1', '怀孕次数_2', '怀孕次数_≥3', '生产次数', '妊娠方式_IUI（人工授精）', '妊娠方式_IVF（试管婴儿）', '妊娠方式_自然受孕']
        test_info_cols = ['检测日期', '孕周', '距首次检测天数']
        concentration_cols = ['Y染色体浓度', 'X染色体浓度', 'Y浓度是否达标']
        z_score_cols = ['13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'Y染色体的Z值']
        quality_cols = ['原始读段数', '在参考基因组上比对的比例', '重复读段的比例', '唯一比对的读段数', 'GC含量', '13号染色体的GC含量', '18号染色体的GC含量', '21号染色体的GC含量', '被过滤掉读段数的比例']
        outcome_cols = ['染色体的非整倍体', 'is_T13', 'is_T18', 'is_T21', '胎儿是否健康']
        
        final_col_order = base_cols + test_info_cols
        if name == '男胎':
            final_col_order += concentration_cols + z_score_cols
        else:
            final_col_order += ['X染色体浓度'] + [z for z in z_score_cols if 'Y' not in z]
        final_col_order += quality_cols + outcome_cols
        
        final_cols_exist = [col for col in final_col_order if col in df.columns]
        df = df[final_cols_exist]
        if name == '女胎':
            df.dropna(axis=1, how='all', inplace=True)
            
        processed_dfs[name] = df
    
    return processed_dfs.get('男胎'), processed_dfs.get('女胎')

if __name__ == '__main__':
    excel_file_path = '../../Data/附件.xlsx'
    RESULT_DIR = 'Result'
    os.makedirs(RESULT_DIR, exist_ok=True)
    
    male_df_raw = pd.read_excel(excel_file_path, sheet_name='男胎检测数据')
    female_df_raw = pd.read_excel(excel_file_path, sheet_name='女胎检测数据')
    
    male_df_processed, female_df_processed = preprocess_core_final(male_df_raw, female_df_raw)

    male_processed_path = os.path.join(RESULT_DIR, '男胎_预处理后数据.csv')
    female_processed_path = os.path.join(RESULT_DIR, '女胎_预处理后数据.csv')
    male_df_processed.to_csv(male_processed_path, index=False, encoding='utf-8-sig')
    female_df_processed.to_csv(female_processed_path, index=False, encoding='utf-8-sig')
    
    print(f"预处理完成，数据已保存至 '{RESULT_DIR}' 文件夹。")

预处理完成，数据已保存至 'Result' 文件夹。


In [27]:
# 文件名: 2_visualize_data_final_v2.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tabulate import tabulate

# --- 视觉风格定义 (全局统一) ---
# 1. 中文字体设置
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# 2. 极致美观的配色方案
# 色系一: 深海宝石 (Deep Ocean Gem) - 适用于分布图
PALETTE_DEEP = ['#003f5c', '#2f4b7c', '#665191', '#a05195', '#d45087', '#f95d6a', '#ff7c43', '#ffa600']
# 色系二: 莫兰迪高级 (Morandi Premium) - 适用于分类对比
PALETTE_MORANDI = ['#8a9b8f', '#a6b1a8', '#c3cac3', '#8c7b6c', '#a69e93', '#bfb5a7']
# 色系三: 春日花园 (Spring Garden) - 适用于强调数据
PALETTE_SPRING = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#f9c74f', '#90be6d', '#43aa8b']
# 色系四: 高级珠宝 (Luxury Jewel) - 特殊强调色
PALETTE_JEWEL = ['#ffd166', '#06d6a0', '#118ab2', '#ef476f', '#073b4c']

# 3. 字体与布局设置
plt.rcParams.update({
    'figure.figsize': (12, 9), 'axes.grid': True, 'grid.linestyle': ':', 'grid.alpha': 0.3,
    'axes.facecolor': '#f8f9fa', 'axes.edgecolor': '#dee2e6', 'axes.labelcolor': '#2b2d42',
    'xtick.color': '#2b2d42', 'ytick.color': '#2b2d42', 'font.size': 22,
    'axes.labelsize': 24, 'xtick.labelsize': 20, 'ytick.labelsize': 20, 'legend.fontsize': 20,
    'figure.facecolor': '#ffffff', 'savefig.facecolor': '#ffffff',
})

def save_table_to_file(table_data, filename, title):
    """将表格保存为文本文件"""
    table_path = os.path.join(RESULT_DIR, filename)
    with open(table_path, 'w', encoding='utf-8') as f:
        f.write(f"{title}\n")
        f.write("=" * 50 + "\n\n")
        f.write(table_data)
        f.write("\n\n" + "=" * 50)
    print(f"表格已保存: {table_path}")

def run_final_visualization():
    """加载已处理数据并执行所有美化后的可视化操作。"""
    print("==============================================================")
    print("           开始执行极致美观版数据可视化流程")
    print("==============================================================")
    
    # --- 加载数据 ---
    male_df = pd.read_csv(MALE_PROCESSED_PATH)
    female_df = pd.read_csv(FEMALE_PROCESSED_PATH)
    male_df_raw = pd.read_excel(RAW_EXCEL_PATH, sheet_name='男胎检测数据')
    
    # --- 表格输出 ---
    # 1. 妊娠方式统计表格 (输出为文件)
    preg_methods_cols = ['妊娠方式_自然受孕', '妊娠方式_IUI（人工授精）', '妊娠方式_IVF（试管婴儿）']
    existing_preg_methods = [col for col in preg_methods_cols if col in male_df.columns]
    if existing_preg_methods:
        method_counts = male_df[existing_preg_methods].sum().reset_index()
        method_counts.columns = ['妊娠方式', '样本数量']
        method_counts['妊娠方式'] = method_counts['妊娠方式'].str.replace('妊娠方式_', '')
        method_counts['比例'] = (method_counts['样本数量'] / method_counts['样本数量'].sum() * 100).round(2)
        
        # 生成美观的表格文本并保存为文件
        table_text = tabulate(method_counts, headers='keys', tablefmt='grid', showindex=False, numalign="center")
        save_table_to_file(table_text, '表1_妊娠方式统计.txt', '表1: 妊娠方式统计')
        
        print("\n--- 表格: 妊娠方式统计 ---")
        print(table_text)

    # --- 可视化图表 ---
    # 图1: 技术重复处理效果对比 (使用深海宝石色系)
    plt.figure(figsize=(10, 8))
    bars = plt.bar(['处理前', '处理后'], [len(male_df_raw), len(male_df)], 
                   color=[PALETTE_DEEP[0], PALETTE_DEEP[4]], 
                   edgecolor='white', linewidth=2, alpha=0.9)
    
    # 添加数值标签
    for i, bar in enumerate(bars):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 5,
                f'{int(height)}', ha='center', va='bottom', fontsize=18, fontweight='bold')
    
    plt.ylabel('样本记录数', fontsize=20, fontweight='bold')
    plt.title('技术重复处理效果对比', fontsize=22, fontweight='bold', pad=20)
    plt.grid(True, alpha=0.3, linestyle=':')
    fig_path = os.path.join(RESULT_DIR, '图1_技术重复处理效果.png')
    plt.savefig(fig_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"\n已生成图表: {fig_path}")

    # 图2: 男胎关键数值变量分布 (使用春日花园色系)
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))
    variables = ['孕周', '孕妇BMI', '年龄', 'Y染色体浓度']
    colors = [PALETTE_SPRING[0], PALETTE_SPRING[1], PALETTE_SPRING[2], PALETTE_SPRING[3]]
    
    for i, (var, color) in enumerate(zip(variables, colors)):
        ax = axes[i//2, i%2]
        sns.histplot(male_df[var], kde=True, ax=ax, color=color, bins=20, 
                    alpha=0.8, edgecolor='white', linewidth=1.5)
        ax.set_xlabel(var, fontsize=18, fontweight='bold')
        ax.set_ylabel('频数', fontsize=18, fontweight='bold')
        ax.grid(True, alpha=0.3, linestyle=':')
        
        # 添加统计信息
        stats_text = f'均值: {male_df[var].mean():.2f}\n标准差: {male_df[var].std():.2f}'
        ax.text(0.95, 0.95, stats_text, transform=ax.transAxes, 
               fontsize=14, verticalalignment='top', horizontalalignment='right',
               bbox=dict(boxstyle="round,pad=0.3", facecolor='white', alpha=0.8))
    
    plt.suptitle('男胎关键数值变量分布', fontsize=24, fontweight='bold', y=0.98)
    plt.tight_layout()
    fig_path = os.path.join(RESULT_DIR, '图2_男胎关键数值变量分布.png')
    plt.savefig(fig_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"已生成图表: {fig_path}")

    # 图3: 男胎关键分类变量统计 (使用莫兰迪高级色系)
    fig, axes = plt.subplots(1, 2, figsize=(20, 10))
    
    # 子图1: Y浓度达标统计
    sns.countplot(x='Y浓度是否达标', data=male_df, ax=axes[0], 
                  palette=[PALETTE_MORANDI[0], PALETTE_MORANDI[3]],
                  edgecolor='white', linewidth=2, alpha=0.9)
    axes[0].set_xlabel('Y浓度是否达标 (0:否, 1:是)', fontsize=18, fontweight='bold')
    axes[0].set_ylabel('样本数量', fontsize=18, fontweight='bold')
    
    # 子图2: 怀孕次数统计
    preg_cols = ['怀孕次数_1', '怀孕次数_2', '怀孕次数_≥3']
    preg_counts = male_df[preg_cols].idxmax(axis=1)
    order_logic = ['怀孕次数_1', '怀孕次数_2', '怀孕次数_≥3']
    sns.countplot(y=preg_counts, ax=axes[1], 
                  palette=PALETTE_MORANDI[3:6],
                  edgecolor='white', linewidth=2, alpha=0.9, order=order_logic)
    axes[1].set_xlabel('样本数量', fontsize=18, fontweight='bold')
    axes[1].set_ylabel('怀孕次数', fontsize=18, fontweight='bold')
    axes[1].set_yticklabels(['1次', '2次', '≥3次'], fontsize=16)
    
    plt.suptitle('男胎关键分类变量统计', fontsize=22, fontweight='bold', y=0.98)
    plt.tight_layout()
    fig_path = os.path.join(RESULT_DIR, '图3_男胎关键分类变量统计.png')
    plt.savefig(fig_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"已生成图表: {fig_path}")

    # 图4: 女胎染色体异常统计 (使用高级珠宝色系)
    female_abnormal_counts = female_df[['is_T13', 'is_T18', 'is_T21']].sum().reset_index()
    female_abnormal_counts.columns = ['异常类型', '样本数']
    female_abnormal_counts['异常类型'] = female_abnormal_counts['异常类型'].str.replace('is_', 'T')
    
    plt.figure(figsize=(12, 8))
    bars = plt.bar(female_abnormal_counts['异常类型'], female_abnormal_counts['样本数'],
                  color=[PALETTE_JEWEL[1], PALETTE_JEWEL[2], PALETTE_JEWEL[3]],
                  edgecolor='white', linewidth=2, alpha=0.9)
    
    # 添加数值标签
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
                f'{int(height)}', ha='center', va='bottom', fontsize=18, fontweight='bold')
    
    plt.xlabel('染色体异常类型', fontsize=20, fontweight='bold')
    plt.ylabel('检测出的样本数', fontsize=20, fontweight='bold')
    plt.title('女胎染色体异常统计', fontsize=22, fontweight='bold', pad=20)
    plt.grid(True, alpha=0.3, linestyle=':')
    fig_path = os.path.join(RESULT_DIR, '图4_女胎染色体异常统计.png')
    plt.savefig(fig_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"已生成图表: {fig_path}")

    print("\n所有可视化任务执行完毕，极致美观的图表和表格已生成！")

# Entry point: generate the table and all figures when this module is run directly.
if __name__ == '__main__':
    run_final_visualization()

           开始执行极致美观版数据可视化流程
表格已保存: Result\表1_妊娠方式统计.txt

--- 表格: 妊娠方式统计 ---
+-----------------+------------+--------+
| 妊娠方式        |  样本数量  |  比例  |
+=================+============+========+
| 自然受孕        |    1005    | 98.43  |
+-----------------+------------+--------+
| IUI（人工授精） |     8      |  0.78  |
+-----------------+------------+--------+
| IVF（试管婴儿） |     8      |  0.78  |
+-----------------+------------+--------+

已生成图表: Result\图1_技术重复处理效果.png
已生成图表: Result\图2_男胎关键数值变量分布.png
已生成图表: Result\图3_男胎关键分类变量统计.png
已生成图表: Result\图4_女胎染色体异常统计.png

所有可视化任务执行完毕，极致美观的图表和表格已生成！
