In [31]:
import pandas as pd
from scipy.stats import pearsonr
import argparse
import os
from statsmodels.stats.multitest import fdrcorrection

metadata_df = pd.read_csv('../data/IBD/IBD1/metadata.tsv', sep='\t', header=0)

In [32]:
# 增强版元数据统计函数
def enhanced_column_stats(df):
    results = []
    
    for col in df.columns:
        # 基本信息
        na_count = df[col].isna().sum()
        na_percentage = (na_count / len(df)) * 100
        dtype = df[col].dtype
        
        # 确定数据类型
        if pd.api.types.is_numeric_dtype(dtype):
            unique_values = df[col].dropna().unique()
            if len(unique_values) <= 10:
                data_type = 'Categorical'
            else:
                data_type = 'Numerical'
        else:
            data_type = 'Categorical'
            
        # 数据分布统计
        if data_type == 'Numerical':
            # 数值变量统计
            mean_val = df[col].mean()
            median_val = df[col].median()
            std_val = df[col].std()
            min_val = df[col].min()
            max_val = df[col].max()
            
            # 检查数据偏斜度
            from scipy import stats as scipystats
            skewness = scipystats.skew(df[col].dropna())
            
            # 检查数据范围和分布，帮助识别可能需要转换的变量
            summary = f"Mean={mean_val:.2f}, Median={median_val:.2f}, SD={std_val:.2f}, Range={min_val:.2f}-{max_val:.2f}, Skew={skewness:.2f}"
            
            # 计算变异系数(CV)，帮助判断变量的离散程度
            cv = std_val / mean_val if mean_val != 0 else float('inf')
            
            # 如果CV很大，可能不适合作为直接的混杂因素，可能需要转换
            cv_comment = "High Variability" if abs(cv) > 1 else "Normal Variability"
            
        else:
            # 分类变量统计
            value_counts = df[col].value_counts()
            n_categories = len(value_counts)
            most_common = value_counts.index[0] if not value_counts.empty else "N/A"
            most_common_count = value_counts.iloc[0] if not value_counts.empty else 0
            most_common_pct = (most_common_count / df[col].count()) * 100
            
            # 类别平衡性检查 - 不平衡的分类变量可能导致模型问题
            balance_ratio = value_counts.min() / value_counts.max() if len(value_counts) > 1 and value_counts.max() > 0 else 0
            balance_comment = "Imbalanced" if balance_ratio < 0.1 else "Balanced"
            
            summary = f"Categories={n_categories}, Most common='{most_common}' ({most_common_pct:.1f}%), Balance={balance_comment}"
            cv = None
            cv_comment = None
        
        # 添加相关性分析 - 只对数值型变量执行
        correlations = {}
        if data_type == 'Numerical':
            for other_col in df.select_dtypes(include=['number']).columns:
                if other_col != col:
                    # 计算与其他数值变量的相关性
                    correlation = df[[col, other_col]].dropna().corr().iloc[0, 1]
                    if abs(correlation) > 0.3:  # 只保存中等及以上相关性
                        correlations[other_col] = correlation
        
        # 是否推荐作为混杂因素的判断标准
        recommended = False
        recommendation_reason = []
        
        # 缺失值不能太多
        if na_percentage > 30:
            recommendation_reason.append(f"High missing data ({na_percentage:.1f}%)")
        
        # 数值变量评估
        elif data_type == 'Numerical':
            # 如果变异系数过大，可能需要转换
            if abs(cv) > 3:
                recommendation_reason.append("Extreme variability (consider transformation)")
            # 如果偏斜度过大，可能需要转换
            elif abs(skewness) > 3:
                recommendation_reason.append("Highly skewed (consider transformation)")
            else:
                recommended = True
        
        # 分类变量评估
        elif data_type == 'Categorical':
            # 太多类别的分类变量不适合
            if n_categories > 20:
                recommendation_reason.append(f"Too many categories ({n_categories})")
            # 极度不平衡的分类
            elif balance_ratio < 0.01 and n_categories > 1:
                recommendation_reason.append("Extremely imbalanced")
            # 单一类别占比过高
            elif most_common_pct > 95:
                recommendation_reason.append(f"Dominant category ({most_common_pct:.1f}%)")
            else:
                recommended = True
        
        if not recommendation_reason:
            recommendation_reason.append("Suitable as confounder")
        
        results.append({
            'Column': col,
            'Data Type': data_type,
            'NA Count': na_count,
            'NA Percentage': na_percentage,
            'Summary': summary,
            'Strong Correlations': str(correlations) if correlations else "None",
            'Recommended as Confounder': recommended,
            'Recommendation Reason': "; ".join(recommendation_reason)
        })
    
    return pd.DataFrame(results)

def process_metadata_files(df, outpath=None):
    """
    处理元数据并输出统计信息，打印推荐的混杂因素
    
    参数:
    df: 包含元数据的DataFrame
    outpath: 可选，如果提供则将统计结果保存到该路径
    
    返回:
    stats: 包含统计信息的DataFrame
    """
    # 获取增强的统计信息
    stats = enhanced_column_stats(df)
    stats = stats[stats['NA Count'] != len(df)]  # 移除全部为NA的列
    
    # 如果提供了输出路径，保存统计结果
    if outpath:
        stats.to_csv(outpath, sep='\t', index=False)
    
    # 获取推荐的混杂因素列表
    recommended_confounders = stats[stats['Recommended as Confounder'] == True]['Column'].tolist()
    recommended_numerical = stats[(stats['Recommended as Confounder'] == True) & 
                                 (stats['Data Type'] == 'Numerical')]['Column'].tolist()
    recommended_categorical = stats[(stats['Recommended as Confounder'] == True) & 
                                   (stats['Data Type'] == 'Categorical')]['Column'].tolist()
    
    # 打印推荐的混杂因素
    print("\n===== RECOMMENDED CONFOUNDERS =====")
    print(f"\nAll recommended confounders ({len(recommended_confounders)}):")
    print(", ".join(recommended_confounders))
    
    print(f"\nRecommended numerical confounders ({len(recommended_numerical)}):")
    print(", ".join(recommended_numerical))
    
    print(f"\nRecommended categorical confounders ({len(recommended_categorical)}):")
    print(", ".join(recommended_categorical))
    
    # 打印不推荐的变量及原因
    not_recommended = stats[stats['Recommended as Confounder'] == False]
    print(f"\nVariables NOT recommended as confounders ({len(not_recommended)}):")
    for _, row in not_recommended.iterrows():
        print(f"- {row['Column']}: {row['Recommendation Reason']}")
    
    return stats

In [33]:
stat = process_metadata_files(metadata_df)


===== RECOMMENDED CONFOUNDERS =====

All recommended confounders (12):
study_condition, disease, age, age_category, gender, country, number_reads, number_bases, minimum_read_length, median_read_length, BMI, mgs_richness

Recommended numerical confounders (6):
age, number_reads, number_bases, median_read_length, BMI, mgs_richness

Recommended categorical confounders (6):
study_condition, disease, age_category, gender, country, minimum_read_length

Variables NOT recommended as confounders (12):
- study_name: Dominant category (100.0%)
- sample_id: Too many categories (393)
- subject_id: Too many categories (316)
- body_site: Dominant category (100.0%)
- non_westernized: Dominant category (100.0%)
- sequencing_platform: Dominant category (100.0%)
- PMID: Dominant category (100.0%)
- NCBI_accession: Too many categories (393)
- curator: Dominant category (100.0%)
- days_from_first_collection: Highly skewed (consider transformation)
- disease_subtype: High missing data (62.8%)
- ferm_milk_p

In [34]:
stat

Unnamed: 0,Column,Data Type,NA Count,NA Percentage,Summary,Strong Correlations,Recommended as Confounder,Recommendation Reason
0,study_name,Categorical,0,0.0,"Categories=1, Most common='NielsenHB_2014' (10...",,False,Dominant category (100.0%)
1,sample_id,Categorical,0,0.0,"Categories=393, Most common='MH0001' (0.3%), B...",,False,Too many categories (393)
2,subject_id,Categorical,0,0.0,"Categories=316, Most common='O2_UC24' (0.5%), ...",,False,Too many categories (316)
3,body_site,Categorical,0,0.0,"Categories=1, Most common='stool' (100.0%), Ba...",,False,Dominant category (100.0%)
5,study_condition,Categorical,0,0.0,"Categories=2, Most common='control' (62.8%), B...",,True,Suitable as confounder
6,disease,Categorical,0,0.0,"Categories=2, Most common='Health' (62.8%), Ba...",,True,Suitable as confounder
7,age,Numerical,2,0.508906,"Mean=47.69, Median=49.00, SD=12.75, Range=19.0...","{'minimum_read_length': 0.542394161007958, 'BM...",True,Suitable as confounder
9,age_category,Categorical,0,0.0,"Categories=2, Most common='adult' (94.9%), Bal...",,True,Suitable as confounder
10,gender,Categorical,0,0.0,"Categories=2, Most common='female' (57.0%), Ba...",,True,Suitable as confounder
11,country,Categorical,0,0.0,"Categories=2, Most common='ESP' (55.0%), Balan...",,True,Suitable as confounder


In [35]:
stat[stat["Recommended as Confounder"]]

Unnamed: 0,Column,Data Type,NA Count,NA Percentage,Summary,Strong Correlations,Recommended as Confounder,Recommendation Reason
5,study_condition,Categorical,0,0.0,"Categories=2, Most common='control' (62.8%), B...",,True,Suitable as confounder
6,disease,Categorical,0,0.0,"Categories=2, Most common='Health' (62.8%), Ba...",,True,Suitable as confounder
7,age,Numerical,2,0.508906,"Mean=47.69, Median=49.00, SD=12.75, Range=19.0...","{'minimum_read_length': 0.542394161007958, 'BM...",True,Suitable as confounder
9,age_category,Categorical,0,0.0,"Categories=2, Most common='adult' (94.9%), Bal...",,True,Suitable as confounder
10,gender,Categorical,0,0.0,"Categories=2, Most common='female' (57.0%), Ba...",,True,Suitable as confounder
11,country,Categorical,0,0.0,"Categories=2, Most common='ESP' (55.0%), Balan...",,True,Suitable as confounder
16,number_reads,Numerical,0,0.0,"Mean=56663647.93, Median=56991169.00, SD=19442...","{'number_bases': 0.9784825159176457, 'median_r...",True,Suitable as confounder
17,number_bases,Numerical,0,0.0,"Mean=4091226718.01, Median=3994482577.00, SD=1...","{'number_reads': 0.9784825159176457, 'median_r...",True,Suitable as confounder
18,minimum_read_length,Categorical,0,0.0,"Categories=6, Most common='30' (55.0%), Balanc...",,True,Suitable as confounder
19,median_read_length,Numerical,0,0.0,"Mean=72.75, Median=74.00, SD=8.80, Range=44.00...","{'number_reads': 0.5027573829522167, 'number_b...",True,Suitable as confounder


In [36]:
eigensp_df = pd.read_csv('../result/large_scale_cohort/IBD/IBD1/eigenspecies/IBD1.eigenspecies.csv',index_col=0,header=0,sep="\t")

In [37]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import pearsonr, skew
import warnings
warnings.filterwarnings('ignore')

def enhanced_column_stats(df):
    """
    为DataFrame中的每一列计算增强统计信息，判断是否适合作为混杂因素
    
    参数:
    df: 包含元数据的DataFrame
    
    返回:
    stats_df: 包含统计信息的DataFrame
    """
    results = []
    
    for col in df.columns:
        # 基本信息
        na_count = df[col].isna().sum()
        na_percentage = (na_count / len(df)) * 100
        dtype = df[col].dtype
        
        # 确定数据类型
        if pd.api.types.is_numeric_dtype(dtype):
            unique_values = df[col].dropna().unique()
            if len(unique_values) <= 10:
                data_type = 'Categorical'
            else:
                data_type = 'Numerical'
        else:
            data_type = 'Categorical'
            
        # 数据分布统计
        if data_type == 'Numerical':
            # 数值变量统计
            non_na_values = df[col].dropna()
            if len(non_na_values) > 0:
                mean_val = non_na_values.mean()
                median_val = non_na_values.median()
                std_val = non_na_values.std()
                min_val = non_na_values.min()
                max_val = non_na_values.max()
                
                # 检查数据偏斜度
                skewness = skew(non_na_values) if len(non_na_values) > 2 else 0
                
                # 检查数据范围和分布
                summary = f"Mean={mean_val:.2f}, Median={median_val:.2f}, SD={std_val:.2f}, Range={min_val:.2f}-{max_val:.2f}, Skew={skewness:.2f}"
                
                # 计算变异系数(CV)
                cv = std_val / abs(mean_val) if mean_val != 0 else float('inf')
                cv_comment = "High Variability" if abs(cv) > 1 else "Normal Variability"
            else:
                summary = "No valid data"
                cv = float('inf')
                cv_comment = "No data"
                skewness = 0
                
        else:
            # 分类变量统计
            value_counts = df[col].dropna().value_counts()
            n_categories = len(value_counts)
            
            if not value_counts.empty:
                most_common = value_counts.index[0]
                most_common_count = value_counts.iloc[0]
                most_common_pct = (most_common_count / df[col].count()) * 100
                
                # 类别平衡性检查
                balance_ratio = value_counts.min() / value_counts.max() if len(value_counts) > 1 and value_counts.max() > 0 else 0
                balance_comment = "Imbalanced" if balance_ratio < 0.1 else "Balanced"
                
                summary = f"Categories={n_categories}, Most common='{most_common}' ({most_common_pct:.1f}%), Balance={balance_comment}"
            else:
                summary = "No valid data"
                balance_ratio = 0
                
            cv = None
            cv_comment = None
            skewness = None
        
        # 添加相关性分析 - 只对数值型变量执行
        correlations = {}
        if data_type == 'Numerical':
            for other_col in df.select_dtypes(include=['number']).columns:
                if other_col != col:
                    # 计算与其他数值变量的相关性
                    corr_data = df[[col, other_col]].dropna()
                    if len(corr_data) > 5:  # 确保有足够的数据点
                        correlation = corr_data.corr().iloc[0, 1]
                        if abs(correlation) > 0.3:  # 只保存中等及以上相关性
                            correlations[other_col] = correlation
        
        # 是否推荐作为混杂因素的判断标准
        recommended = False
        recommendation_reason = []
        
        # 排除样本ID和一些特定的非混杂列
        exclude_cols = ['sample_id', 'subject_id', 'NCBI_accession', 'PMID', 'curator']
        if col in exclude_cols:
            recommendation_reason.append("Identifier or metadata column")
        
        # 缺失值不能太多
        elif na_percentage > 30:
            recommendation_reason.append(f"High missing data ({na_percentage:.1f}%)")
        
        # 数值变量评估
        elif data_type == 'Numerical':
            # 如果变异系数过大或数据极度偏斜，可能需要转换
            if cv is not None and abs(cv) > 3:
                recommendation_reason.append("Extreme variability")
            elif skewness is not None and abs(skewness) > 3:
                recommendation_reason.append("Highly skewed")
            else:
                recommended = True
        
        # 分类变量评估
        elif data_type == 'Categorical':
            # 太多类别的分类变量不适合
            if n_categories > 20:
                recommendation_reason.append(f"Too many categories ({n_categories})")
            # 极度不平衡的分类
            elif balance_ratio < 0.01 and n_categories > 1:
                recommendation_reason.append("Extremely imbalanced")
            # 单一类别占比过高
            elif 'most_common_pct' in locals() and most_common_pct > 95:
                recommendation_reason.append(f"Dominant category ({most_common_pct:.1f}%)")
            else:
                recommended = True
        
        if not recommendation_reason:
            recommendation_reason.append("Suitable as confounder")
        
        results.append({
            'Column': col,
            'Data Type': data_type,
            'NA Count': na_count,
            'NA Percentage': na_percentage,
            'Summary': summary,
            'Strong Correlations': str(correlations) if correlations else "None",
            'Recommended as Confounder': recommended,
            'Recommendation Reason': "; ".join(recommendation_reason)
        })
    
    return pd.DataFrame(results)


def analyze_target_with_confounders(eigensp_df, metadata_df, stats_df):
    """
    分析每个目标因素与eigenspecies的关系，同时控制其他混杂因素
    
    参数:
    eigensp_df: DataFrame，包含模块和eigenspecies信息，至少包含'module', 'sample', 'eigensp'列
    metadata_df: DataFrame，包含元数据信息
    stats_df: DataFrame，包含元数据统计信息，包含'Column', 'Recommended as Confounder', 'Data Type'列
    
    返回:
    results_df: DataFrame，包含分析结果
    """
    # 获取所有推荐的混杂因素
    confounders = stats_df[stats_df["Recommended as Confounder"] == True]["Column"].tolist()
    
    # 获取数值型和分类型混杂因素
    numerical_confounders = stats_df[(stats_df["Recommended as Confounder"] == True) & 
                                    (stats_df["Data Type"] == "Numerical")]["Column"].tolist()
    categorical_confounders = stats_df[(stats_df["Recommended as Confounder"] == True) & 
                                      (stats_df["Data Type"] == "Categorical")]["Column"].tolist()
    
    # 准备存储结果的列表
    all_results = []
    
    # 对每个混杂因素，轮流将其作为目标因素，其余作为控制变量
    for target_col in confounders:
        # 确定当前目标的数据类型
        target_type = stats_df[stats_df["Column"] == target_col]["Data Type"].values[0]
        
        # 排除当前目标，其余作为混杂因素
        current_confounders = [col for col in confounders if col != target_col]
        
        # 对每个模块进行分析
        for module in eigensp_df['module'].unique():
            module_df = eigensp_df[eigensp_df['module'] == module]
            
            # 合并eigenspecies和元数据
            merged_df = module_df.merge(metadata_df, left_on='sample', right_on='sample_id', how='inner')
            
            # 准备模型数据，确保包含所有需要的列
            model_cols = [target_col] + current_confounders + ['eigensp']
            available_cols = [col for col in model_cols if col in merged_df.columns]
            
            # 检查是否有足够的数据
            if len(available_cols) < len(model_cols) - 5:  # 允许缺少少量列
                continue
                
            model_data = merged_df[available_cols].copy()
            
            # 处理缺失值
            model_data = model_data.dropna()
            
            if len(model_data) < 20:  # 确保有足够的样本
                continue
            
            # 1. 首先计算简单相关系数（未调整混杂因素）
            try:
                if target_type == "Numerical":
                    # 数值型目标 - 使用Pearson相关
                    simple_corr, simple_p = pearsonr(model_data[target_col], model_data['eigensp'])
                else:
                    # 分类型目标 - 只计算均值差异
                    simple_corr = None
                    simple_p = None
            except Exception:
                simple_corr = None
                simple_p = None
            
            # 2. 使用回归模型控制混杂因素
            try:
                # 准备分类变量 - 创建哑变量
                cat_cols = [col for col in available_cols if col in categorical_confounders]
                if cat_cols:
                    model_data_encoded = pd.get_dummies(model_data, columns=cat_cols, drop_first=True)
                else:
                    model_data_encoded = model_data.copy()
                
                # 为模型准备自变量（特征）和因变量
                y = model_data_encoded['eigensp']
                X_cols = [col for col in model_data_encoded.columns if col != 'eigensp']
                X = model_data_encoded[X_cols]
                
                # 添加常数项（截距）
                X = sm.add_constant(X)
                
                # 拟合线性模型
                model = sm.OLS(y, X).fit()
                
                # 提取目标变量的系数和p值
                if target_type == "Numerical":
                    # 数值型目标 - 直接获取系数
                    coef = model.params.get(target_col, np.nan)
                    p_value = model.pvalues.get(target_col, np.nan)
                    std_error = model.bse.get(target_col, np.nan)
                else:
                    # 分类型目标 - 可能有多个哑变量，获取第一个
                    target_cols = [col for col in model.params.index if col.startswith(f"{target_col}_")]
                    if target_cols:
                        coef = model.params[target_cols[0]]
                        p_value = model.pvalues[target_cols[0]]
                        std_error = model.bse[target_cols[0]]
                    else:
                        coef = np.nan
                        p_value = np.nan
                        std_error = np.nan
                
                # 保存结果
                result = {
                    'module': module,
                    'target': target_col,
                    'target_type': target_type,
                    'sample_size': len(model_data),
                    
                    # 简单相关结果
                    'simple_correlation': simple_corr,
                    'simple_p_value': simple_p,
                    
                    # 调整后的结果
                    'adjusted_coefficient': coef,
                    'adjusted_p_value': p_value,
                    'adjusted_std_error': std_error,
                    'adjusted_r2': model.rsquared_adj,
                    
                    # 记录使用的混杂因素
                    'confounders_used': ','.join([col for col in current_confounders if col in available_cols])
                }
                
                all_results.append(result)
                
            except Exception:
                # 静默跳过错误
                continue
    
    # 创建结果DataFrame
    if all_results:
        results_df = pd.DataFrame(all_results)
        
        # 应用FDR校正
        # 分别对简单p值和调整后p值进行校正
        if 'simple_p_value' in results_df.columns and not results_df['simple_p_value'].isna().all():
            _, simple_fdr = fdrcorrection(results_df['simple_p_value'].fillna(1))
            results_df['simple_fdr_p_value'] = simple_fdr
            
        if 'adjusted_p_value' in results_df.columns and not results_df['adjusted_p_value'].isna().all():
            _, adjusted_fdr = fdrcorrection(results_df['adjusted_p_value'].fillna(1))
            results_df['adjusted_fdr_p_value'] = adjusted_fdr
        
        return results_df
    else:
        return pd.DataFrame()  # 返回空DataFrame



In [38]:
# 使用示例
# 1. 首先计算元数据统计信息
stats_df = enhanced_column_stats(metadata_df)
# 
# 2. 分析目标因素与eigenspecies的关系
# results_df = analyze_target_with_confounders(eigensp_df, metadata_df, stats_df)

In [39]:
results_df = analyze_target_with_confounders(eigensp_df, metadata_df, stats_df)

In [43]:
results_df[results_df['adjusted_fdr_p_value']<0.05]

Unnamed: 0,module,target,target_type,sample_size,simple_correlation,simple_p_value,adjusted_coefficient,adjusted_p_value,adjusted_std_error,adjusted_r2,confounders_used,simple_fdr_p_value,adjusted_fdr_p_value
4,S1_C16,study_condition,Categorical,387,,,-0.976409,0.002198904,0.316661,0.163391,"disease,age,age_category,gender,country,number...",1.0,0.04557729
14,S2_C5,study_condition,Categorical,387,,,-1.49889,0.001437527,0.466788,0.132213,"disease,age,age_category,gender,country,number...",1.0,0.04557729
56,S6_C3,age,Numerical,387,0.062811,0.2176277,0.007155,0.001531584,0.002241,0.033773,"study_condition,disease,age_category,gender,co...",0.8576629,0.04557729
83,S1_C20,gender,Categorical,387,,,3.113597,0.000102045,0.792642,0.167277,"study_condition,disease,age,age_category,count...",1.0,0.007755419
93,S6_C1,gender,Categorical,387,,,0.353672,0.002139037,0.114388,0.033993,"study_condition,disease,age,age_category,count...",1.0,0.04557729
95,S1_C1,country,Categorical,387,,,1.559894,0.003069237,0.523407,0.162111,"study_condition,disease,age,age_category,gende...",1.0,0.04998472
164,S1_C8,minimum_read_length,Categorical,387,,,1.286592,0.003057447,0.431529,0.053819,"study_condition,disease,age,age_category,gende...",1.0,0.04998472
210,S1_C10,mgs_richness,Numerical,387,-0.160683,0.001516945,-0.002358,0.001997803,0.000757,0.02954,"study_condition,disease,age,age_category,gende...",0.02882195,0.04557729
215,S1_C2,mgs_richness,Numerical,387,0.19753,9.156617e-05,0.015322,0.002043205,0.004933,0.072938,"study_condition,disease,age,age_category,gende...",0.002609636,0.04557729
216,S1_C20,mgs_richness,Numerical,387,0.332334,1.969769e-11,0.052098,6.011119e-09,0.008748,0.167277,"study_condition,disease,age,age_category,gende...",4.278259e-09,6.852676e-07


In [19]:
confounders = stats_df[stats_df["Recommended as Confounder"] == True]["Column"].tolist()
confounders

['study_condition',
 'disease',
 'DNA_extraction_kit',
 'number_reads',
 'number_bases',
 'days_from_first_collection',
 'location',
 'visit_number',
 'disease_subtype']

In [22]:
numerical_confounders = stats_df[(stats_df["Recommended as Confounder"] == True) & 
                                (stats_df["Data Type"] == "Numerical")]["Column"].tolist()
categorical_confounders = stats_df[(stats_df["Recommended as Confounder"] == True) & 
                                    (stats_df["Data Type"] == "Categorical")]["Column"].tolist()
numerical_confounders


['number_reads', 'number_bases', 'days_from_first_collection', 'visit_number']

In [21]:
categorical_confounders

['study_condition',
 'disease',
 'DNA_extraction_kit',
 'location',
 'disease_subtype']

In [23]:
# 第1步：导入所需库
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings('ignore')

# 第2步：准备数据 - 假设您已经有了eigensp_df和metadata_df
# eigensp_df: 包含'module', 'sample', 'eigensp'列
# metadata_df: 包含元数据，包括'disease_subtype'列
# 确保这些数据已经正确加载

# 第3步：分析指定目标变量与eigenspecies的关系
def analyze_specific_target(eigensp_df, metadata_df, target_col, stats_df=None):
    """
    分析特定目标变量与eigenspecies的关系，同时控制其他混杂因素
    
    参数:
    eigensp_df: DataFrame，包含模块和eigenspecies信息
    metadata_df: DataFrame，包含元数据
    target_col: 要分析的目标变量名
    stats_df: 可选，元数据统计DataFrame，如果提供则使用其推荐的混杂因素
    
    返回:
    results_df: DataFrame，包含分析结果
    """
    # 如果提供了stats_df，使用其推荐的混杂因素，否则使用默认列表
    if stats_df is not None:
        # 获取推荐的混杂因素
        numerical_confounders = stats_df[(stats_df["Recommended as Confounder"] == True) & 
                                        (stats_df["Data Type"] == "Numerical")]["Column"].tolist()
        categorical_confounders = stats_df[(stats_df["Recommended as Confounder"] == True) & 
                                          (stats_df["Data Type"] == "Categorical")]["Column"].tolist()
        
        # 从混杂因素中排除目标变量
        if target_col in numerical_confounders:
            numerical_confounders.remove(target_col)
        if target_col in categorical_confounders:
            categorical_confounders.remove(target_col)
    else:
        # 使用默认的混杂因素列表 - 可根据您的数据调整
        numerical_confounders = ['age', 'BMI', 'number_reads', 'number_bases', 'mgs_richness']
        categorical_confounders = ['gender', 'country', 'non_westernized', 'sequencing_platform']
    
    # 打印使用的混杂因素
    print(f"Target variable: {target_col}")
    print(f"Numerical confounders ({len(numerical_confounders)}): {', '.join(numerical_confounders)}")
    print(f"Categorical confounders ({len(categorical_confounders)}): {', '.join(categorical_confounders)}")
    
    # 确定目标变量的类型
    if target_col in metadata_df.columns:
        # 检查目标变量是数值型还是分类型
        if pd.api.types.is_numeric_dtype(metadata_df[target_col]) and metadata_df[target_col].nunique() > 10:
            target_type = "Numerical"
        else:
            target_type = "Categorical"
        print(f"Target type detected: {target_type}")
    else:
        print(f"Error: Target column '{target_col}' not found in metadata")
        return pd.DataFrame()
    
    # 所有混杂因素的列表
    all_confounders = numerical_confounders + categorical_confounders
    
    # 准备存储结果的列表
    all_results = []
    
    # 对每个模块进行分析
    for module in eigensp_df['module'].unique():
        print(f"\nAnalyzing module: {module}")
        
        # 获取该模块的eigenspecies数据
        module_df = eigensp_df[eigensp_df['module'] == module]
        print(f"  Module samples: {len(module_df)}")
        
        # 合并eigenspecies和元数据
        merged_df = module_df.merge(metadata_df, left_on='sample', right_on='sample_id', how='inner')
        print(f"  Merged samples: {len(merged_df)}")
        
        # 准备模型数据，包含目标变量、混杂因素和eigensp
        model_cols = [target_col] + all_confounders + ['eigensp']
        available_cols = [col for col in model_cols if col in merged_df.columns]
        missing_cols = [col for col in model_cols if col not in merged_df.columns]
        
        if missing_cols:
            print(f"  Missing columns: {', '.join(missing_cols)}")
        
        if target_col not in available_cols:
            print(f"  Error: Target column '{target_col}' not available after merge")
            continue
            
        # 创建工作数据集的副本
        model_data = merged_df[available_cols].copy()
        
        # 处理缺失值
        before_dropna = len(model_data)
        model_data = model_data.dropna()
        after_dropna = len(model_data)
        
        print(f"  Complete cases: {after_dropna}/{before_dropna} ({after_dropna/before_dropna*100:.1f}%)")
        
        if after_dropna < 20:  # 确保有足够的样本
            print(f"  Warning: Not enough complete cases ({after_dropna}), skipping module")
            continue
        
        # 步骤4：计算简单相关（未调整混杂因素）
        try:
            if target_type == "Numerical":
                # 数值型目标 - 使用Pearson相关
                simple_corr, simple_p = pearsonr(model_data[target_col], model_data['eigensp'])
                print(f"  Simple correlation: r={simple_corr:.3f}, p={simple_p:.5f}")
            else:
                # 分类型目标 - 展示每个类别的均值
                print("  Category means:")
                for category, group in model_data.groupby(target_col):
                    if len(group) >= 5:  # 只显示有足够样本的类别
                        print(f"    {category}: mean={group['eigensp'].mean():.3f}, n={len(group)}")
                simple_corr = None
                simple_p = None
        except Exception as e:
            print(f"  Error calculating simple correlation: {e}")
            simple_corr = None
            simple_p = None
        
        # 步骤5：使用回归模型控制混杂因素
        try:
            # 获取当前可用的混杂因素
            avail_num_conf = [col for col in numerical_confounders if col in available_cols]
            avail_cat_conf = [col for col in categorical_confounders if col in available_cols]
            
            print(f"  Available numerical confounders: {len(avail_num_conf)}")
            print(f"  Available categorical confounders: {len(avail_cat_conf)}")
            
            # 准备分类变量 - 创建哑变量
            cat_cols = avail_cat_conf
            if target_type == "Categorical":
                cat_cols.append(target_col)
                
            if cat_cols:
                print(f"  Creating dummy variables for: {', '.join(cat_cols)}")
                model_data_encoded = pd.get_dummies(model_data, columns=cat_cols, drop_first=True)
            else:
                model_data_encoded = model_data.copy()
            
            # 显示编码后的列数
            print(f"  Columns after encoding: {len(model_data_encoded.columns)}")
            
            # 为模型准备自变量（特征）和因变量
            y = model_data_encoded['eigensp']
            X_cols = [col for col in model_data_encoded.columns if col != 'eigensp']
            X = model_data_encoded[X_cols]
            
            # 添加常数项（截距）
            X = sm.add_constant(X)
            
            # 拟合线性模型
            print("  Fitting linear model...")
            model = sm.OLS(y, X).fit()
            
            # 提取结果
            print(f"  Model summary: R²={model.rsquared:.3f}, Adj. R²={model.rsquared_adj:.3f}")
            
            # 提取目标变量的系数和p值
            if target_type == "Numerical":
                # 数值型目标 - 直接获取系数
                coef = model.params.get(target_col, np.nan)
                p_value = model.pvalues.get(target_col, np.nan)
                std_error = model.bse.get(target_col, np.nan)
                
                print(f"  Adjusted result: coef={coef:.3f}, p={p_value:.5f}, se={std_error:.3f}")
            else:
                # 分类型目标 - 可能有多个哑变量
                target_cols = [col for col in model.params.index if col.startswith(f"{target_col}_")]
                print(f"  Target dummy variables: {len(target_cols)}")
                
                # 显示每个类别的系数
                for col in target_cols:
                    coef = model.params.get(col, np.nan)
                    p_value = model.pvalues.get(col, np.nan)
                    std_error = model.bse.get(col, np.nan)
                    
                    category = col.replace(f"{target_col}_", "")
                    print(f"    {category}: coef={coef:.3f}, p={p_value:.5f}, se={std_error:.3f}")
                
                # 使用第一个哑变量的结果（如果有）
                if target_cols:
                    coef = model.params[target_cols[0]]
                    p_value = model.pvalues[target_cols[0]]
                    std_error = model.bse[target_cols[0]]
                    dummy_var = target_cols[0]
                else:
                    coef = np.nan
                    p_value = np.nan
                    std_error = np.nan
                    dummy_var = None
            
            # 保存结果
            result = {
                'module': module,
                'target': target_col,
                'target_type': target_type,
                'sample_size': len(model_data),
                
                # 简单相关结果
                'simple_correlation': simple_corr,
                'simple_p_value': simple_p,
                
                # 调整后的结果
                'adjusted_coefficient': coef,
                'adjusted_p_value': p_value,
                'adjusted_std_error': std_error,
                'adjusted_r2': model.rsquared_adj,
                
                # 记录使用的混杂因素
                'numerical_confounders': ','.join(avail_num_conf),
                'categorical_confounders': ','.join(avail_cat_conf)
            }
            
            # 如果是分类变量，添加使用的哑变量信息
            if target_type == "Categorical" and dummy_var:
                result['dummy_variable'] = dummy_var
            
            all_results.append(result)
            
        except Exception as e:
            print(f"  Error in regression model: {e}")
    
    # 步骤6：创建结果DataFrame
    if all_results:
        results_df = pd.DataFrame(all_results)
        print("\nAnalysis complete. Found results for:")
        
        # 显示每个模块的结果
        for module, group in results_df.groupby('module'):
            significant = sum(group['adjusted_p_value'] < 0.05)
            print(f"  Module {module}: {len(group)} results, {significant} significant (p<0.05)")
        
        # 应用FDR校正
        if 'adjusted_p_value' in results_df.columns and not results_df['adjusted_p_value'].isna().all():
            _, adjusted_fdr = fdrcorrection(results_df['adjusted_p_value'].fillna(1))
            results_df['adjusted_fdr_p_value'] = adjusted_fdr
            
            significant_fdr = sum(results_df['adjusted_fdr_p_value'] < 0.05)
            print(f"\nAfter FDR correction: {significant_fdr} significant results (q<0.05)")
        
        return results_df
    else:
        print("\nNo valid results found")
        return pd.DataFrame()

# 步骤7：运行分析
# 假设stats_df已经由enhanced_column_stats函数生成
# results_df = analyze_specific_target(eigensp_df, metadata_df, 'disease_subtype', stats_df)

# 步骤8：查看显著结果
# significant_results = results_df[results_df['adjusted_fdr_p_value'] < 0.05].sort_values('adjusted_fdr_p_value')
# display(significant_results)

In [24]:
results_df = analyze_specific_target(eigensp_df, metadata_df, 'disease_subtype', stats_df)


Target variable: disease_subtype
Numerical confounders (4): number_reads, number_bases, days_from_first_collection, visit_number
Categorical confounders (4): study_condition, disease, DNA_extraction_kit, location
Target type detected: Categorical

Analyzing module: S1_C1
  Module samples: 393
  Merged samples: 0


ZeroDivisionError: division by zero

In [25]:
print("eigensp_df sample IDs (前5个):", eigensp_df['sample'].iloc[:5].tolist())
print("metadata_df sample IDs (前5个):", metadata_df['sample_id'].iloc[:5].tolist())

eigensp_df sample IDs (前5个): ['MH0001', 'MH0002', 'MH0003', 'MH0004', 'MH0005']
metadata_df sample IDs (前5个): ['SKST006_6_G102964', 'SKST006_7_G102965', 'SKST006_4_G102962', 'SKST006_5_G102963', 'SKST006_2_G102960']


In [28]:
common_samples = set(eigensp_df['sample']).intersection(set(metadata_df['sample_id']))
common_samples

set()

In [29]:
eigensp_df['sample']

0          MH0001
1          MH0002
2          MH0003
3          MH0004
4          MH0005
          ...    
7462    V1_UC54_0
7463    V1_UC55_0
7464    V1_UC55_4
7465    V1_UC56_0
7466    V1_UC58_0
Name: sample, Length: 7467, dtype: object

In [30]:
set(metadata_df['sample_id'])

{'SKST006_10_G102994',
 'SKST006_1_G102959',
 'SKST006_2_G102960',
 'SKST006_3_G102961',
 'SKST006_4_G102962',
 'SKST006_5_G102963',
 'SKST006_6_G102964',
 'SKST006_7_G102965',
 'SKST006_9_G103014',
 'SKST007_1_G102966',
 'SKST007_2_G102967',
 'SKST007_3_G102949',
 'SKST007_4_G102948',
 'SKST007_6_G102995',
 'SKST007_7_G103016',
 'SKST007_8_G102999',
 'SKST010_1_G102968',
 'SKST010_2_G102969',
 'SKST010_3_G102970',
 'SKST010_4_G102956',
 'SKST010_5_G102990',
 'SKST010_6_G103004',
 'SKST010_7_G103003',
 'SKST010_8_G102998',
 'SKST011_1_G102971',
 'SKST011_2_G102972',
 'SKST011_3_G102973',
 'SKST011_4_G102952',
 'SKST011_5_G102993',
 'SKST011_6_G103002',
 'SKST011_7_G103017',
 'SKST011_8_G103000',
 'SKST012_1_G102974',
 'SKST012_2_G102975',
 'SKST012_3_G102950',
 'SKST012_4_G102954',
 'SKST012_5_G102992',
 'SKST012_6_G102997',
 'SKST014_1_G102976',
 'SKST014_2_G102977',
 'SKST014_3_G102978',
 'SKST014_4_G102979',
 'SKST014_5_G102980',
 'SKST014_6_G102951',
 'SKST014_7_G102940',
 'SKST023