In [42]:
# 文件名: 1_data_preprocessing_final.py

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

# --- 辅助函数 ---
def flag_outliers_iqr(df, column_name):
    """Flag IQR outliers of one column in a new 0/1 indicator column.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``. The indicator is
    written to ``f'{column_name}_is_outlier'``. ``df`` is modified in place
    and is also returned; the number of flagged rows is printed.
    """
    values = df[column_name]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)

    below_fence = values < first_quartile - whisker
    above_fence = values > third_quartile + whisker

    indicator_name = f'{column_name}_is_outlier'
    df[indicator_name] = (below_fence | above_fence).astype(int)
    print(f"    - 在 '{column_name}' 中检测到 {df[indicator_name].sum()} 个异常值。")
    return df

def robust_convert_gestational_week(series):
    """Convert gestational-age values to fractional weeks.

    Each element is handled as follows:

    * missing values become NaN;
    * anything ``float()`` accepts (numbers, numeric strings) is used as is;
    * strings such as ``'12w+3'``, ``'12W + 3'`` or ``'12+3'`` become
      ``weeks + days / 7`` (the week marker is matched case-insensitively);
    * anything else prints a warning and becomes NaN.

    Args:
        series: pandas Series of raw gestational-age values.

    Returns:
        A float Series carrying the same index as ``series``.
    """
    import re  # function-scope import: `re` is not imported at file level

    # Compiled once per call instead of building a one-element Series and
    # running `.str.extract` for every single value.
    week_pattern = re.compile(r'(\d+\.?\d*)\s*[wW]?\s*(?:\+\s*(\d+))?')

    def _parse_one(item):
        """Return one raw value as fractional weeks, or NaN when unusable."""
        if pd.isna(item):
            return np.nan
        try:
            return float(item)
        except (ValueError, TypeError):
            pass  # not a plain number; fall through to the pattern
        match = week_pattern.search(str(item))
        if match is None:
            print(f"    - 警告: 无法解析孕周值 '{item}'，已置为缺失值。")
            return np.nan
        weeks = float(match.group(1))
        days = float(match.group(2)) if match.group(2) is not None else 0.0
        return weeks + days / 7.0

    # An explicit dtype keeps the result float even for an empty input.
    return pd.Series(
        [_parse_one(item) for item in series],
        index=series.index,
        dtype=float,
    )

# --- 主处理函数 ---
def run_data_processing(male_raw_df, female_raw_df):
    """Run the full preprocessing pipeline on the male and female sheets.

    Both inputs go through the same stages, each on its own copy (the
    caller's frames are not modified):

    0. Report columns that contain missing values.
    1. Basic cleaning: drop '序号' / '末次月经', parse '检测日期' as
       ``%Y%m%d``, derive numeric '孕周', back-fill missing '孕妇BMI' from
       height and weight, aggregate rows sharing ('孕妇代码', '检测抽血次数')
       and renumber '检测抽血次数' by date order within each woman.
    2. Quality flags: IQR outlier indicators for '年龄', '孕妇BMI', '孕周',
       plus indicators for every 'Z值' column using IQR bounds computed on
       the rows labelled '正常'.
    3. Feature engineering: target columns (male: 'Y浓度是否达标'; female:
       '是否异常', 'is_T13', 'is_T18', 'is_T21'), '距首次检测天数',
       '风险等级', one-hot encoding of '怀孕次数' / '生产次数', removal of
       'IVF妊娠'; for the female frame only, zero-variance removal, median
       imputation and standard scaling of the numeric predictors.
    4. Drop any column whose name starts with 'Unnamed'.

    Progress and warnings are printed along the way.

    Args:
        male_raw_df: raw DataFrame of the male-fetus sheet.
        female_raw_df: raw DataFrame of the female-fetus sheet.

    Returns:
        Tuple ``(male_processed, female_processed)`` of DataFrames.
    """
    processed_dfs = {}
    for name, df_raw in [('男胎', male_raw_df), ('女胎', female_raw_df)]:
        df = df_raw.copy()
        print(f"\n--- 开始处理 {name} 数据 ---")
        print(f"原始数据维度: {df.shape}")

        # === Stage 0: missing-value report ===
        print("执行缺失值检测...")
        missing_summary = df.isnull().sum()
        missing_summary = missing_summary[missing_summary > 0]
        if not missing_summary.empty:
            print("原始数据中发现以下缺失值：")
            print(missing_summary.to_string())
        else:
            print("原始数据中无缺失值。")

        # === Stage 1: basic cleaning and restructuring ===
        df.drop(columns=['序号', '末次月经'], inplace=True, errors='ignore')
        df['检测日期'] = pd.to_datetime(df['检测日期'], format='%Y%m%d')
        df['孕周'] = robust_convert_gestational_week(df['检测孕周'])
        
        # Back-fill missing BMI as weight / (height / 100) ** 2 when both
        # source columns exist.
        if '孕妇BMI' in df.columns:
            bmi_missing_count = df['孕妇BMI'].isnull().sum()
            if bmi_missing_count > 0 and '身高' in df.columns and '体重' in df.columns:
                print(f"发现 {bmi_missing_count} 个缺失的BMI值，尝试通过身高体重计算...")
                df['孕妇BMI'] = df['孕妇BMI'].fillna(df['体重'] / ((df['身高'] / 100) ** 2))
                filled_count = bmi_missing_count - df['孕妇BMI'].isnull().sum()
                print(f"    - 成功计算并填充了 {filled_count} 个BMI值。")
        
        # Rows are grouped on these two keys; rows missing either key are
        # dropped before aggregation.
        group_keys = ['孕妇代码', '检测抽血次数']
        df['检测抽血次数'] = pd.to_numeric(df['检测抽血次数'], errors='coerce').astype('Int64')
        df.dropna(subset=group_keys, inplace=True)
        
        # Columns averaged across duplicate rows of one group.
        # NOTE(review): '唯一比对的读段数  ' carries two trailing spaces;
        # presumably this matches the spreadsheet header -- confirm.
        cols_to_average = [
            '身高', '体重', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例', '重复读段的比例', 
            '唯一比对的读段数  ', 'GC含量', '13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值', 
            'X染色体的Z值', 'Y染色体的Z值', 'Y染色体浓度', 'X染色体浓度', '13号染色体的GC含量', 
            '18号染色体的GC含量', '21号染色体的GC含量', '被过滤掉读段数的比例'
        ]
        
        # Every other non-key column keeps its first value per group; the
        # aneuploidy label takes the first non-null unique value (NaN when
        # the whole group is null).
        agg_dict = {col: 'first' for col in df.columns if col not in group_keys and col not in cols_to_average}
        for col in cols_to_average:
            if col in df.columns:
                agg_dict[col] = 'mean'
        if '染色体的非整倍体' in df.columns:
            agg_dict['染色体的非整倍体'] = lambda s: s.dropna().unique()[0] if not s.dropna().empty else np.nan
        
        df = df.groupby(group_keys, as_index=False).agg(agg_dict)
        print(f"聚合重复检测后维度: {df.shape}")
        
        # Overwrite '检测抽血次数' with a 1-based sequence number following
        # the test-date order within each woman.
        df.sort_values(by=['孕妇代码', '检测日期'], inplace=True)
        df['计算序次'] = df.groupby('孕妇代码').cumcount() + 1
        df['检测抽血次数'] = df['计算序次']
        df.drop(columns=['计算序次'], inplace=True)

        # === Stage 2: data-quality checks and outlier flags ===
        print("执行数据质量检测...")
        
        # IQR flagging is deliberately restricted to these three features.
        iqr_cols = ['年龄', '孕妇BMI', '孕周']
        for col in iqr_cols:
            if col in df.columns:
                df = flag_outliers_iqr(df, col)
        
        # Z-value columns: bounds come from the rows labelled '正常' only,
        # but the flag is computed for every row. Missing labels are filled
        # with '正常' first.
        if '染色体的非整倍体' in df.columns:
            df['染色体的非整倍体'] = df['染色体的非整倍体'].fillna('正常')
            normal_samples = df[df['染色体的非整倍体'] == '正常']
            z_score_cols = [col for col in df.columns if 'Z值' in col]
            for col in z_score_cols:
                if col in df.columns:
                    Q1 = normal_samples[col].quantile(0.25)
                    Q3 = normal_samples[col].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - 1.5 * IQR
                    upper_bound = Q3 + 1.5 * IQR
                    df[f'{col}_is_outlier'] = ((df[col] < lower_bound) | (df[col] > upper_bound)).astype(int)

        # === Stage 3: feature engineering and model adaptation ===
        print("执行特征工程...")
        
        # Male frame: 1 when the Y-chromosome concentration is >= 0.04.
        if name == '男胎':
            df['Y浓度是否达标'] = (df['Y染色体浓度'] >= 0.04).astype(int)
        # Female frame: overall abnormality flag plus one flag per label
        # substring ('13', '18', '21').
        if name == '女胎' and '染色体的非整倍体' in df.columns:
            df['是否异常'] = (df['染色体的非整倍体'] != '正常').astype(int)
            df['is_T13'] = df['染色体的非整倍体'].str.contains('13').astype(int)
            df['is_T18'] = df['染色体的非整倍体'].str.contains('18').astype(int)
            df['is_T21'] = df['染色体的非整倍体'].str.contains('21').astype(int)

        # Days elapsed since each woman's earliest test date.
        df['距首次检测天数'] = (df['检测日期'] - df.groupby('孕妇代码')['检测日期'].transform('min')).dt.days
        # Right-closed bins on gestational week: (0, 12], (12, 27], (27, 100].
        risk_bins = [0, 12, 27, 100]
        risk_labels = [1, 2, 3] # 1: low risk, 2: high risk, 3: very high risk
        df['风险等级'] = pd.cut(df['孕周'], bins=risk_bins, labels=risk_labels, right=True)

        # Normalise the '>=3' spelling, then one-hot encode the categorical
        # columns that are present.
        df['怀孕次数'] = df['怀孕次数'].replace({'>=3': '≥3'}).astype(str)
        categorical_cols = ['怀孕次数', '生产次数']
        df = pd.get_dummies(df, columns=[col for col in categorical_cols if col in df.columns], dtype=int)
        
        if 'IVF妊娠' in df.columns:
            df.drop(columns=['IVF妊娠'], inplace=True)
        
        # Scaling is applied to the female frame only.
        if name == '女胎':
            print("对女胎数据执行特征缩放...")
            numeric_predictors = df.select_dtypes(include=np.number).columns.tolist()
            # Keys, targets and indicator columns are left unscaled. The
            # '_is_abnormal' test is already covered by the '_is_' test.
            cols_to_exclude = [
                '孕妇代码', '检测抽血次数', '是否异常', 'is_T13', 'is_T18', 'is_T21'
            ] + [col for col in df.columns if '_is_' in col or '_is_abnormal' in col]
            numeric_predictors = [col for col in numeric_predictors if col not in cols_to_exclude]

            # Columns with at most one distinct non-null value are removed
            # from the frame before scaling.
            zero_variance_cols = [col for col in numeric_predictors if df[col].nunique() <= 1]
            if zero_variance_cols:
                print(f"    - 警告: 发现并移除零方差特征: {zero_variance_cols}")
                df.drop(columns=zero_variance_cols, inplace=True)
                numeric_predictors = [col for col in numeric_predictors if col not in zero_variance_cols]

            # Remaining gaps in the predictors are filled with the column
            # median.
            missing_before_impute = df[numeric_predictors].isnull().sum()
            missing_cols = missing_before_impute[missing_before_impute > 0]
            if not missing_cols.empty:
                print("    - 警告: 为确保模型运行，对以下仍存在缺失值的特征执行中位数填充：")
                print(missing_cols.to_string())
                df[numeric_predictors] = df[numeric_predictors].fillna(df[numeric_predictors].median())
            
            scaler = StandardScaler()
            if numeric_predictors:
                df[numeric_predictors] = scaler.fit_transform(df[numeric_predictors])
                print("    - 特征缩放完成。")

        # === Stage 4: final cleanup ===
        # Drop columns whose name starts with 'Unnamed'.
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        print(f"处理完成，最终维度: {df.shape}")
        processed_dfs[name] = df
    
    return processed_dfs.get('男胎'), processed_dfs.get('女胎')

if __name__ == '__main__':
    # Script entry point: read both sheets, preprocess them and write one CSV
    # per sheet into RESULT_DIR.
    INPUT_FILE_PATH = '../../Data/附件.xlsx'
    RESULT_DIR = 'Result'
    os.makedirs(RESULT_DIR, exist_ok=True)

    male_df_raw = pd.read_excel(INPUT_FILE_PATH, sheet_name='男胎检测数据')
    female_df_raw = pd.read_excel(INPUT_FILE_PATH, sheet_name='女胎检测数据')

    # Fix: the second argument used to be the undefined name `female_raw_df`,
    # which raises NameError unless another notebook cell happened to define
    # it first.
    male_df_processed, female_df_processed = run_data_processing(male_df_raw, female_df_raw)

    male_processed_path = os.path.join(RESULT_DIR, '男胎-预处理后数据.csv')
    female_processed_path = os.path.join(RESULT_DIR, '女胎-预处理后数据.csv')

    # utf-8-sig writes a BOM at the start of each CSV.
    male_df_processed.to_csv(male_processed_path, index=False, encoding='utf-8-sig')
    female_df_processed.to_csv(female_processed_path, index=False, encoding='utf-8-sig')

    print(f"\n预处理流程结束，数据已保存至 '{RESULT_DIR}' 文件夹。")


--- 开始处理 男胎 数据 ---
原始数据维度: (1082, 31)
执行缺失值检测...
原始数据中发现以下缺失值：
末次月经         12
染色体的非整倍体    956
聚合重复检测后维度: (1021, 30)
执行数据质量检测...
    - 在 '年龄' 中检测到 19 个异常值。
    - 在 '孕妇BMI' 中检测到 26 个异常值。
    - 在 '孕周' 中检测到 0 个异常值。
执行特征工程...
处理完成，最终维度: (1021, 45)

--- 开始处理 女胎 数据 ---
原始数据维度: (605, 31)
执行缺失值检测...
原始数据中发现以下缺失值：
孕妇BMI            1
Unnamed: 20    605
Unnamed: 21    605
染色体的非整倍体       538
发现 1 个缺失的BMI值，尝试通过身高体重计算...
    - 成功计算并填充了 1 个BMI值。
聚合重复检测后维度: (554, 30)
执行数据质量检测...
    - 在 '年龄' 中检测到 17 个异常值。
    - 在 '孕妇BMI' 中检测到 22 个异常值。
    - 在 '孕周' 中检测到 0 个异常值。
执行特征工程...
对女胎数据执行特征缩放...
    - 警告: 发现并移除零方差特征: ['Unnamed: 20', 'Unnamed: 21']
    - 特征缩放完成。
处理完成，最终维度: (554, 44)

预处理流程结束，数据已保存至 'Result' 文件夹。


In [43]:
# 文件名: 2_visualization_final.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Global plotting settings ---
# Use the SimHei font (the figures carry Chinese labels) and turn off the
# Unicode minus sign -- presumably because that font lacks the glyph.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Default text sizes; individual plots override the axis-label size.
plt.rcParams['font.size'] = 18
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14
plt.rcParams['legend.fontsize'] = 14
# Default seaborn colour palette for all figures.
sns.set_palette("cividis") 

# --- 为绘图准备数据的辅助函数 (保持不变) ---
def prepare_unscaled_data(raw_df):
    """Rebuild the cleaned, aggregated but unscaled table used for plotting.

    Steps, all applied to a copy of ``raw_df``:

    * drop '序号' and '末次月经' when present;
    * parse '检测日期' as ``%Y%m%d``;
    * derive numeric '孕周' from '检测孕周' as ``weeks + days / 7``;
    * back-fill missing '孕妇BMI' as weight / (height / 100) ** 2;
    * coerce '检测抽血次数' to nullable integers and drop rows missing either
      grouping key;
    * aggregate rows sharing ('孕妇代码', '检测抽血次数'): mean for the
      measurement columns, first value for the rest, first non-null unique
      value for '染色体的非整倍体'.

    Args:
        raw_df: raw DataFrame of one sheet.

    Returns:
        The aggregated DataFrame (one row per key pair).
    """
    df = raw_df.copy()
    df.drop(columns=['序号', '末次月经'], inplace=True, errors='ignore')
    if '检测日期' in df.columns:
        df['检测日期'] = pd.to_datetime(df['检测日期'], format='%Y%m%d')
    if '检测孕周' in df.columns:
        # The pattern accepts decimal weeks, optional whitespace and an
        # upper- or lower-case week marker, in line with the preprocessing
        # script's parser. The previous `(\d+)` form truncated values such
        # as '13.5' to 13.
        extracted = (
            df['检测孕周']
            .astype(str)
            .str.extract(r'(\d+\.?\d*)\s*[wW]?\s*(?:\+\s*(\d+))?')
            .astype(float)
        )
        df['孕周'] = extracted[0] + extracted[1].fillna(0) / 7
    if '孕妇BMI' in df.columns and '身高' in df.columns and '体重' in df.columns:
        df['孕妇BMI'] = df['孕妇BMI'].fillna(df['体重'] / ((df['身高'] / 100) ** 2))

    group_keys = ['孕妇代码', '检测抽血次数']
    if '检测抽血次数' in df.columns:
        df['检测抽血次数'] = pd.to_numeric(df['检测抽血次数'], errors='coerce').astype('Int64')
    df.dropna(subset=group_keys, inplace=True)

    # Columns averaged across duplicate rows of one group.
    # NOTE(review): '唯一比对的读段数  ' carries two trailing spaces;
    # presumably this matches the spreadsheet header -- confirm.
    cols_to_average = [
        '身高', '体重', '孕妇BMI', '原始读段数', '在参考基因组上比对的比例',
        '重复读段的比例', '唯一比对的读段数  ', 'GC含量', '13号染色体的Z值',
        '18号染色体的Z值', '21号染色体的Z值', 'X染色体的Z值', 'Y染色体的Z值',
        'Y染色体浓度', 'X染色体浓度', '13号染色体的GC含量',
        '18号染色体的GC含量', '21号染色体的GC含量', '被过滤掉读段数的比例',
    ]

    # Insertion order matters: it determines the output column order.
    agg_dict = {
        col: 'first'
        for col in df.columns
        if col not in group_keys and col not in cols_to_average
    }
    for col in cols_to_average:
        if col in df.columns:
            agg_dict[col] = 'mean'
    if '染色体的非整倍体' in df.columns:
        # First non-null unique label of the group, NaN when all are null.
        agg_dict['染色体的非整倍体'] = lambda s: s.dropna().unique()[0] if not s.dropna().empty else np.nan

    df = df.groupby(group_keys, as_index=False).agg(agg_dict)
    return df

# --- 可视化函数 ---

def plot_step1_raw_distributions(df, result_dir):
    """Draw three side-by-side histograms (with KDE) of the core features.

    The panels show '年龄', '孕妇BMI' and '孕周' with nulls dropped. The
    figure is saved as '图1-核心特征分布.png' inside ``result_dir`` and then
    closed.
    """
    # (column, bar colour, x-axis label) for each panel, left to right.
    panels = [
        ('年龄', '#3b528b', '年龄'),
        ('孕妇BMI', '#21918c', '孕妇BMI'),
        ('孕周', '#5ec962', '数值化孕周'),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(24, 7))
    for position, (column, colour, x_label) in enumerate(panels):
        panel = axes[position]
        sns.histplot(df[column].dropna(), kde=True, ax=panel, color=colour, bins=20)
        panel.set_xlabel(x_label, fontsize=20)
        # Only the left-most panel carries the shared y-axis label.
        if position == 0:
            panel.set_ylabel('频数', fontsize=20)
        else:
            panel.set_ylabel('')
    plt.tight_layout()
    save_path = os.path.join(result_dir, '图1-核心特征分布.png')
    plt.savefig(save_path, dpi=300)
    print(f"图表已保存: {save_path}")
    plt.close()

def plot_step3_aggregation_effect(raw_df, aggregated_df, result_dir):
    """Compare per-woman record counts before and after aggregation.

    Left panel: how many women have k raw rows (counted via '检测日期').
    Right panel: how many women have k rows after aggregation (counted via
    '检测抽血次数'). Both y-axes are logarithmic. The figure is saved as
    '图2-聚合效果对比.png' inside ``result_dir`` and then closed.
    """
    fig, axes = plt.subplots(1, 2, figsize=(18, 7))
    before_ax, after_ax = axes[0], axes[1]

    records_per_woman = raw_df.groupby('孕妇代码')['检测日期'].count()
    sns.countplot(x=records_per_woman, ax=before_ax, palette='cividis')
    before_ax.set_xlabel('聚合前样本记录数', fontsize=20)
    before_ax.set_ylabel('孕妇人数 (对数尺度)', fontsize=20)
    before_ax.set_yscale('log')

    tests_per_woman = aggregated_df.groupby('孕妇代码')['检测抽血次数'].count()
    sns.countplot(x=tests_per_woman, ax=after_ax, palette='cividis')
    after_ax.set_xlabel('聚合后有效检测次数', fontsize=20)
    after_ax.set_ylabel('')
    after_ax.set_yscale('log')

    plt.tight_layout()
    save_path = os.path.join(result_dir, '图2-聚合效果对比.png')
    plt.savefig(save_path, dpi=300)
    print(f"图表已保存: {save_path}")
    plt.close()
    
def plot_step4_iqr_outlier_analysis(df, result_dir):
    """Plot IQR outliers for age, BMI and gestational week only.

    Each of the three panels overlays a violin plot with a jittered strip
    plot whose points are red where ``f'{col}_is_outlier'`` equals 1 and gray
    otherwise. The figure is saved as '图3-IQR离群点分析.png' inside
    ``result_dir`` and then closed.
    """
    cols = ['年龄', '孕妇BMI', '孕周']
    violin_colours = sns.color_palette("viridis", 3)
    point_colours = {False: 'gray', True: 'red'}
    fig, axes = plt.subplots(1, 3, figsize=(24, 8))  # 1x3 layout

    for position, col in enumerate(cols):
        panel = axes[position]
        sns.violinplot(y=df[col], ax=panel, inner='box', color=violin_colours[position], linewidth=2.5)

        is_outlier = df[f'{col}_is_outlier'] == 1
        df_plot = pd.DataFrame({'value': df[col], '离群点': is_outlier})
        sns.stripplot(data=df_plot, y='value', hue='离群点', ax=panel, palette=point_colours, size=6, jitter=0.2, alpha=0.7)

        panel.set_xlabel(col, fontsize=20)
        # Only the left-most panel carries the shared y-axis label.
        y_label = '数值分布' if position == 0 else ''
        panel.set_ylabel(y_label, fontsize=20)
        panel.legend(title='是否为离群点', loc='upper right')
        panel.set_xticks([])

    plt.tight_layout()
    save_path = os.path.join(result_dir, '图3-IQR离群点分析.png')
    plt.savefig(save_path, dpi=300)
    print(f"图表已保存: {save_path}")
    plt.close()

def plot_step5_gc_content_distribution(df, result_dir):
    """Draw a 30-bin histogram (with KDE) of the non-null 'GC含量' values.

    The figure is saved as '图4-GC含量分布探索.png' inside ``result_dir`` and
    then closed.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    gc_values = df['GC含量'].dropna()
    sns.histplot(gc_values, kde=True, color='#440154', bins=30, ax=ax)
    ax.set_xlabel('GC含量 (%)', fontsize=20)
    ax.set_ylabel('频数', fontsize=20)
    ax.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    save_path = os.path.join(result_dir, '图4-GC含量分布探索.png')
    plt.savefig(save_path, dpi=300)
    print(f"图表已保存: {save_path}")
    plt.close()

def plot_step6_abnormality_distribution(df_female, result_dir):
    """Bar-chart the number of samples per aneuploidy flag.

    Bars are the column sums of 'is_T13', 'is_T18' and 'is_T21'. When
    '是否异常' exists, a '正常' bar counts the rows where it equals 0. Bars
    are sorted in descending order and annotated with their integer height.
    The figure is saved as '图5-染色体异常类型分布.png' inside
    ``result_dir`` and then closed.
    """
    plt.figure(figsize=(12, 8))

    counts = df_female[['is_T13', 'is_T18', 'is_T21']].sum()
    if '是否异常' in df_female.columns:
        counts['正常'] = (df_female['是否异常'] == 0).sum()
    counts = counts.sort_values(ascending=False)

    ax = sns.barplot(x=counts.index, y=counts.values, palette='plasma')
    ax.set_xlabel('样本类型', fontsize=20)
    ax.set_ylabel('样本数量', fontsize=20)

    # Write each bar's count 10 points above its top centre.
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{int(bar_height)}',
            (bar_centre, bar_height),
            ha='center',
            va='center',
            xytext=(0, 10),
            textcoords='offset points',
            fontsize=16,
        )

    plt.tight_layout()
    save_path = os.path.join(result_dir, '图5-染色体异常类型分布.png')
    plt.savefig(save_path, dpi=300)
    print(f"图表已保存: {save_path}")
    plt.close()
    
def plot_step7_feature_scaling_effect(df_scaled, df_unscaled, result_dir):
    """Overlay before/after-scaling density curves for each numeric feature.

    Features are the numeric columns of ``df_scaled`` that also exist in
    ``df_unscaled``, excluding the key columns and every indicator/target
    column (names containing '_is_', 'is_T' or '是否异常'). They are sorted
    by name and capped at 15. Each panel shows the unscaled density on the
    left y-axis and the scaled density on a twin right y-axis. The figure is
    saved as '图6-所有数值特征缩放效果对比.png' inside ``result_dir`` and then
    closed.

    When no feature qualifies, a warning is printed and nothing is drawn or
    saved.

    Args:
        df_scaled: DataFrame holding the scaled features.
        df_unscaled: DataFrame holding the same features before scaling.
        result_dir: directory the PNG is written to.
    """
    numeric_features = df_scaled.select_dtypes(include=np.number).columns.tolist()
    exclude_cols_base = ['孕妇代码', '检测抽血次数']
    # '_is_' already covers names containing '_is_abnormal'.
    exclude_cols_flags = [
        col for col in df_scaled.columns
        if '_is_' in col or 'is_T' in col or '是否异常' in col
    ]
    exclude_cols = set(exclude_cols_base + exclude_cols_flags)
    features_to_plot = sorted(
        col for col in numeric_features
        if col not in exclude_cols and col in df_unscaled.columns
    )

    if not features_to_plot:
        # A zero-row grid cannot be created, so stop before touching pyplot.
        print("    - 警告: 没有可供对比的数值特征，已跳过缩放效果图。")
        return

    if len(features_to_plot) > 15:
        print(f"    - 警告: 待可视化缩放特征数量过多({len(features_to_plot)}个)，仅展示前15个。")
        features_to_plot = features_to_plot[:15]

    n_features = len(features_to_plot)
    n_cols = 4
    n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D array of axes.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, features_to_plot):
        sns.kdeplot(df_unscaled[feature].dropna(), ax=ax, color='#3b528b', fill=True, alpha=0.7, label='缩放前')
        ax.set_ylabel('密度 (缩放前)', fontsize=14, color='#3b528b')
        ax.tick_params(axis='y', labelcolor='#3b528b')
        # Twin axis: the two densities live on very different scales.
        ax2 = ax.twinx()
        sns.kdeplot(df_scaled[feature].dropna(), ax=ax2, color='#21918c', fill=True, alpha=0.6, label='缩放后')
        ax2.set_ylabel('密度 (缩放后)', fontsize=14, color='#21918c')
        ax2.tick_params(axis='y', labelcolor='#21918c')
        ax.set_xlabel(feature, fontsize=16)

    # Remove the grid cells that received no feature.
    for unused_ax in axes[n_features:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    save_path = os.path.join(result_dir, '图6-所有数值特征缩放效果对比.png')
    plt.savefig(save_path, dpi=300)
    print(f"图表已保存: {save_path}")
    plt.close()

# --- 主流程 ---
if __name__ == '__main__':
    # Script entry point: load the preprocessed CSVs and the raw sheets, then
    # produce every figure in processing order.
    RESULT_DIR = 'Result'
    INPUT_FILE_PATH = '../../Data/附件.xlsx'

    male_processed_path = os.path.join(RESULT_DIR, '男胎-预处理后数据.csv')
    female_processed_path = os.path.join(RESULT_DIR, '女胎-预处理后数据.csv')

    if os.path.exists(male_processed_path) and os.path.exists(female_processed_path):
        print("--- 开始生成最终版可视化报告 ---")

        male_df = pd.read_csv(male_processed_path)
        female_df_scaled = pd.read_csv(female_processed_path)

        male_raw_df = pd.read_excel(INPUT_FILE_PATH, sheet_name='男胎检测数据')
        female_raw_df = pd.read_excel(INPUT_FILE_PATH, sheet_name='女胎检测数据')

        male_cleaned = prepare_unscaled_data(male_raw_df)
        female_cleaned = prepare_unscaled_data(female_raw_df)
        combined_cleaned = pd.concat([male_cleaned, female_cleaned], ignore_index=True)

        # The female CSV is scaled, so the outlier plot needs the unscaled
        # values combined with the outlier flags stored in that CSV.
        join_keys = ['孕妇代码', '检测抽血次数']
        flag_cols = [col for col in female_df_scaled.columns if '_is_outlier' in col]
        female_df_scaled_indexed = female_df_scaled.set_index(join_keys)

        # The preprocessing script replaces '检测抽血次数' with a 1-based
        # sequence number in test-date order per woman. Reproduce that
        # numbering on a copy so both tables share the same keys; joining on
        # the raw numbering could leave flags missing or misassigned.
        female_renumbered = female_cleaned.sort_values(by=['孕妇代码', '检测日期']).copy()
        female_renumbered['检测抽血次数'] = female_renumbered.groupby('孕妇代码').cumcount() + 1

        female_cleaned_with_flags = female_renumbered.set_index(join_keys)
        for flag_col in flag_cols:
            # Index-aligned assignment: each flag lands on its key pair.
            female_cleaned_with_flags[flag_col] = female_df_scaled_indexed[flag_col]
        female_cleaned_with_flags.reset_index(inplace=True)

        df_for_iqr_plot = pd.concat([male_df, female_cleaned_with_flags], ignore_index=True)

        # --- Call the plotting functions in processing-step order ---
        plot_step1_raw_distributions(combined_cleaned, RESULT_DIR)
        plot_step3_aggregation_effect(pd.concat([male_raw_df, female_raw_df]), pd.concat([male_df, female_df_scaled]), RESULT_DIR)
        plot_step4_iqr_outlier_analysis(df_for_iqr_plot, RESULT_DIR)
        plot_step5_gc_content_distribution(combined_cleaned, RESULT_DIR)
        plot_step6_abnormality_distribution(female_df_scaled, RESULT_DIR)
        plot_step7_feature_scaling_effect(female_df_scaled, female_cleaned, RESULT_DIR)

        print("\n--- 所有可视化图表已生成完毕 ---")
    else:
        print("错误: 未找到预处理后的数据文件。请确保 '1_data_preprocessing_final.py' 已成功运行。")

--- 开始生成最终版可视化报告 ---
图表已保存: Result\图1-核心特征分布.png
图表已保存: Result\图2-聚合效果对比.png
图表已保存: Result\图3-IQR离群点分析.png
图表已保存: Result\图4-GC含量分布探索.png
图表已保存: Result\图5-染色体异常类型分布.png
    - 警告: 待可视化缩放特征数量过多(19个)，仅展示前15个。
图表已保存: Result\图6-所有数值特征缩放效果对比.png

--- 所有可视化图表已生成完毕 ---
