In [1]:
import os
import sys
import pandas as pd
import numpy as np
from scipy.stats import skew
import warnings
warnings.filterwarnings('ignore')

def enhanced_column_stats(df):
    """
    Profile every column of a metadata table and decide whether it is suitable as a confounder

    A column with a numeric dtype and more than 10 distinct non-missing values is
    treated as 'Numerical'; every other column is treated as 'Categorical'.

    A column is rejected (in this order) when it is a known identifier column,
    has more than 30% missing values, or:
      - Numerical: |coefficient of variation| > 3, or |skewness| > 3
      - Categorical: no observed values, more than 20 categories, a min/max
        category-count ratio below 0.01, or a single category covering > 95%

    Parameters:
    df: DataFrame containing metadata (one row per sample, one column per variable)

    Returns:
    stats_df: DataFrame with one row per input column and the columns
              'Column', 'Data Type', 'NA Count', 'NA Percentage', 'Summary',
              'Strong Correlations', 'Recommended as Confounder',
              'Recommendation Reason'
    """
    # Identifier / bookkeeping columns are never offered as confounders.
    exclude_cols = {'sample_id', 'subject_id', 'NCBI_accession', 'PMID', 'curator'}

    total_rows = len(df)
    # Loop-invariant: the candidate partners for the correlation scan.
    numeric_cols = df.select_dtypes(include=['number']).columns

    results = []

    for col in df.columns:
        series = df[col]
        non_na_values = series.dropna()
        na_count = series.isna().sum()
        # An empty frame would give 0/0 = NaN here, and NaN silently passes the
        # "> 30% missing" test below, so it is reported as 0% instead.
        na_percentage = (na_count / total_rows) * 100 if total_rows else 0.0

        # Numeric columns with few distinct values are treated as coded categories.
        if pd.api.types.is_numeric_dtype(series.dtype) and non_na_values.nunique() > 10:
            data_type = 'Numerical'
        else:
            data_type = 'Categorical'

        # Per-column state is reset explicitly so that nothing computed for a
        # previous column can influence the decision for this one.
        cv = None
        skewness = None
        n_categories = 0
        balance_ratio = 0
        most_common_pct = None
        correlations = {}

        if data_type == 'Numerical':
            # A Numerical column always has more than 10 observed values.
            mean_val = non_na_values.mean()
            median_val = non_na_values.median()
            std_val = non_na_values.std()
            min_val = non_na_values.min()
            max_val = non_na_values.max()
            skewness = skew(non_na_values) if len(non_na_values) > 2 else 0

            summary = (
                f"Mean={mean_val:.2f}, Median={median_val:.2f}, SD={std_val:.2f}, "
                f"Range={min_val:.2f}-{max_val:.2f}, Skew={skewness:.2f}"
            )

            # Coefficient of variation; undefined for a zero mean, reported as inf.
            cv = std_val / abs(mean_val) if mean_val != 0 else float('inf')

            # Record medium-or-stronger correlations with the other numeric columns.
            for other_col in numeric_cols:
                if other_col == col:
                    continue
                corr_data = df[[col, other_col]].dropna()
                if len(corr_data) > 5:  # Ensure enough paired data points
                    correlation = corr_data.corr().iloc[0, 1]
                    # A NaN correlation (constant partner) fails this comparison.
                    if abs(correlation) > 0.3:
                        correlations[other_col] = correlation
        else:
            value_counts = non_na_values.value_counts()
            n_categories = len(value_counts)

            if n_categories:
                # value_counts() is sorted by descending frequency.
                most_common = value_counts.index[0]
                most_common_pct = (value_counts.iloc[0] / series.count()) * 100

                # Ratio of the rarest to the most frequent category (0 for a single category).
                if n_categories > 1 and value_counts.max() > 0:
                    balance_ratio = value_counts.min() / value_counts.max()
                balance_comment = "Imbalanced" if balance_ratio < 0.1 else "Balanced"

                summary = (
                    f"Categories={n_categories}, Most common='{most_common}' "
                    f"({most_common_pct:.1f}%), Balance={balance_comment}"
                )
            else:
                summary = "No valid data"

        # Criteria for recommending as a confounder
        recommended = False
        recommendation_reason = []

        if col in exclude_cols:
            recommendation_reason.append("Identifier or metadata column")
        elif na_percentage > 30:
            recommendation_reason.append(f"High missing data ({na_percentage:.1f}%)")
        elif data_type == 'Numerical':
            # Extreme spread or skew suggests the variable needs a transformation first.
            if abs(cv) > 3:
                recommendation_reason.append("Extreme variability")
            elif abs(skewness) > 3:
                recommendation_reason.append("Highly skewed")
            else:
                recommended = True
        else:
            if n_categories == 0:
                # Nothing observed at all: cannot be used as a covariate.
                recommendation_reason.append("No valid data")
            elif n_categories > 20:
                recommendation_reason.append(f"Too many categories ({n_categories})")
            elif balance_ratio < 0.01 and n_categories > 1:
                recommendation_reason.append("Extremely imbalanced")
            elif most_common_pct > 95:
                recommendation_reason.append(f"Dominant category ({most_common_pct:.1f}%)")
            else:
                recommended = True

        if not recommendation_reason:
            recommendation_reason.append("Suitable as confounder")

        results.append({
            'Column': col,
            'Data Type': data_type,
            'NA Count': na_count,
            'NA Percentage': na_percentage,
            'Summary': summary,
            'Strong Correlations': str(correlations) if correlations else "None",
            'Recommended as Confounder': recommended,
            'Recommendation Reason': "; ".join(recommendation_reason)
        })

    return pd.DataFrame(results)

def detect_duplicate_variables(df, stats_df, threshold=0.9):
    """
    Detect potential duplicate variables (semantically identical but differently encoded)

    For every ordered pair (var1, var2) of categorical variables, each value of
    var1 is mapped to the var2 value it co-occurs with most often. A var1 value
    counts as "consistently mapped" when that most frequent var2 value covers at
    least `threshold` of its rows. The pair is reported when the rows covered by
    consistent mappings make up at least `threshold` of all complete rows.

    Pairs are then merged transitively, so every variable ends up in at most
    one group.

    NOTE(review): the check is one-directional (var1 -> var2, in stats_df
    order). A constant var2, or a var1 with one distinct value per row, will
    therefore pair with anything. Confirm whether a two-way check is intended.

    Parameters:
    df: Metadata DataFrame
    stats_df: Statistics DataFrame with 'Column' and 'Data Type' columns
    threshold: Detection threshold, applied per value and to the overall coverage

    Returns:
    duplicate_groups: List of duplicate variable groups (each a list of column names)
    """
    # Get all categorical variables
    categorical_vars = stats_df[stats_df['Data Type'] == 'Categorical']['Column'].tolist()

    # Variables with more than 30% missing values are never compared; filtering
    # them once avoids recomputing the NA fraction for every pair.
    eligible_vars = [var for var in categorical_vars if not (df[var].isna().mean() > 0.3)]

    duplicate_pairs = []

    for i, var1 in enumerate(eligible_vars):
        for var2 in eligible_vars[i + 1:]:
            valid_data = df[[var1, var2]].dropna()
            if len(valid_data) < 10:  # Too few samples
                continue

            try:
                crosstab = pd.crosstab(valid_data[var1], valid_data[var2])

                # For each var1 value: rows in total and rows in its most common var2 value.
                row_totals = crosstab.sum(axis=1)
                row_max = crosstab.max(axis=1)
                # A zero-total row yields NaN, which fails the comparison and is ignored.
                consistent = (row_max / row_totals) >= threshold

                if consistent.any():
                    # Share of all complete rows explained by consistent mappings.
                    overall_consistency = row_max[consistent].sum() / len(valid_data)
                    if overall_consistency >= threshold:
                        duplicate_pairs.append((var1, var2, overall_consistency))

            except Exception as e:
                # Best effort: one bad pair must not abort the whole scan.
                print(f"Error checking {var1} and {var2}: {str(e)}")

    # Merge duplicate pairs into disjoint groups.
    duplicate_groups = []

    for var1, var2, _ in duplicate_pairs:
        # Groups are kept disjoint, so at most two of them can be hit by one pair.
        touched = [group for group in duplicate_groups if var1 in group or var2 in group]

        if not touched:
            duplicate_groups.append([var1, var2])
            continue

        target = touched[0]
        # A pair bridging two existing groups fuses them into the earlier one.
        for other in touched[1:]:
            newcomers = [var for var in other if var not in target]
            target.extend(newcomers)
            duplicate_groups.remove(other)

        for var in (var1, var2):
            if var not in target:
                target.append(var)

    return duplicate_groups

def process_cohort(disease, cohort, data_dir, output_dir):
    """
    Build the confounder summary files for one cohort

    Reads <data_dir>/<disease>/<cohort>/metadata.tsv and writes, into
    <output_dir>/<disease>/<cohort>/phenotype/:
      - confounder.stats.tsv         per-column statistics
      - recommended_confounders.txt  one recommended column name per line
      - duplicate_variables.tsv      only when duplicate groups were found
      - cofounder.summary.tsv        one-row cohort summary

    The output directory is created before the metadata file is looked up, so
    it exists even when processing fails.

    Parameters:
    disease: Disease name
    cohort: Cohort name
    data_dir: Data directory
    output_dir: Output directory

    Returns:
    success: True when all files were written, False on any failure
             (failures are printed, never raised)
    """
    label = f"{disease}/{cohort}"

    try:
        metadata_path = os.path.join(data_dir, disease, cohort, 'metadata.tsv')
        output_path = os.path.join(output_dir, disease, cohort, 'phenotype')
        os.makedirs(output_path, exist_ok=True)

        # Guard clause: nothing to do without a metadata table.
        if not os.path.exists(metadata_path):
            print(f"Metadata file not found for {label}: {metadata_path}")
            return False

        try:
            metadata_df = pd.read_csv(metadata_path, sep='\t', header=0)
            print(f"Processing {label}: {len(metadata_df)} samples in metadata")
        except Exception as read_error:
            print(f"Error reading metadata for {label}: {read_error}")
            return False

        # Per-column statistics, then groups of redundantly encoded variables.
        stats_df = enhanced_column_stats(metadata_df)
        duplicate_groups = detect_duplicate_variables(metadata_df, stats_df)

        is_recommended = stats_df['Recommended as Confounder']
        summary_dict = {
            'disease': disease,
            'cohort': cohort,
            'samples': len(metadata_df),
            'metadata_variables': len(metadata_df.columns),
            'recommended_confounders': sum(is_recommended),
            'duplicate_variable_groups': len(duplicate_groups)
        }
        summary_df = pd.DataFrame([summary_dict])

        def out_file(filename):
            return os.path.join(output_path, filename)

        # Full statistics table
        stats_df.to_csv(out_file('confounder.stats.tsv'), sep='\t', index=False)

        # Plain list of recommended column names
        recommended_confounders = stats_df[is_recommended]['Column'].tolist()
        with open(out_file('recommended_confounders.txt'), 'w') as handle:
            handle.write('\n'.join(recommended_confounders))

        # Duplicate groups are only written when at least one exists
        if duplicate_groups:
            group_rows = {
                'Group_ID': range(1, len(duplicate_groups) + 1),
                'Variables': [', '.join(members) for members in duplicate_groups],
                'Count': [len(members) for members in duplicate_groups]
            }
            pd.DataFrame(group_rows).to_csv(out_file('duplicate_variables.tsv'), sep='\t', index=False)

        # One-row cohort summary
        summary_df.to_csv(out_file('cofounder.summary.tsv'), sep='\t', index=False)

        print(f"Successfully processed {label}: {summary_dict['recommended_confounders']} recommended confounders")
        return True

    except Exception as error:
        print(f"Error processing {label}: {error}")
        return False

In [2]:
# Diseases whose cohort directories are scanned
plist = [
    'ACVD', 'BD', 'CRC', 'IBD', 'IGT', 'CFS', 'STH', 'T2D',
    'adenoma', 'asthma', 'carcinoma_surgery_history', 'hypertension',
    'migraine', 'schizofrenia'
]

# Input (metadata) and output (results) roots
data_dir = '../data'
output_dir = '../result/large_scale_cohort'

# Counters: cohorts seen vs. cohorts processed without error
total_cohorts = 0
processed_cohorts = 0

# Walk <data_dir>/<disease>/<cohort>/ and process every cohort directory found
for disease in plist:
    disease_dir = os.path.join(data_dir, disease)

    if os.path.isdir(disease_dir):
        for cohort in os.listdir(disease_dir):
            cohort_dir = os.path.join(disease_dir, cohort)

            # Only sub-directories are cohorts; stray files are ignored
            if os.path.isdir(cohort_dir):
                total_cohorts += 1
                success = process_cohort(disease, cohort, data_dir, output_dir)
                if success:
                    processed_cohorts += 1
    else:
        print(f"Disease directory not found: {disease_dir}")

print(f"\nProcessing complete. Successfully processed {processed_cohorts}/{total_cohorts} cohorts.")


# Diseases whose result directories are scanned
plist = [
    'ACVD', 'BD', 'CRC', 'IBD', 'IGT', 'CFS', 'STH', 'T2D',
    'adenoma', 'asthma', 'carcinoma_surgery_history', 'hypertension',
    'migraine', 'schizofrenia'
]

# Root of the per-cohort result folders
output_dir = '../result/large_scale_cohort'

# One entry per cohort whose recommended-confounder list could be read
all_results = []

for disease in plist:
    disease_dir = os.path.join(output_dir, disease)

    if not os.path.isdir(disease_dir):
        print(f"Disease directory not found: {disease_dir}")
        continue

    for cohort in os.listdir(disease_dir):
        cohort_dir = os.path.join(disease_dir, cohort)

        # Skip plain files sitting next to the cohort directories
        if not os.path.isdir(cohort_dir):
            continue

        conf_file = os.path.join(cohort_dir, 'phenotype', 'recommended_confounders.txt')

        if not os.path.exists(conf_file):
            print(f"Recommended confounders file not found for {disease}/{cohort}: {conf_file}")
            continue

        try:
            # One confounder name per line; blank lines are dropped
            with open(conf_file, 'r') as f:
                confounders = [name for name in map(str.strip, f) if name]

            all_results.append({
                'Disease': disease,
                'Cohort': cohort,
                'Confounders': confounders,
                'Count': len(confounders)
            })

        except Exception as e:
            print(f"Error reading recommended confounders for {disease}/{cohort}: {e}")

# Order the table by disease, then by cohort name
all_results.sort(key=lambda entry: (entry['Disease'], entry['Cohort']))

# Per-cohort table
print(f"\n{'='*100}")
print(f"{'Disease':<15} {'Cohort':<15} {'Count':<7} {'Recommended Confounders'}")
print(f"{'-'*100}")

for result in all_results:
    conf_str = ', '.join(result['Confounders'])
    # Keep the last column to at most 60 characters
    if len(conf_str) > 60:
        conf_str = conf_str[:57] + '...'

    print(f"{result['Disease']:<15} {result['Cohort']:<15} {result['Count']:<7} {conf_str}")

print(f"{'='*100}")

# Summary statistics across cohorts
total_cohorts = len(all_results)
total_unique_confounders = {
    conf for result in all_results for conf in result['Confounders']
}

# Number of cohorts in which each confounder is recommended
confounder_counts = {}
for result in all_results:
    for conf in result['Confounders']:
        if conf in confounder_counts:
            confounder_counts[conf] += 1
        else:
            confounder_counts[conf] = 1

# Twenty most frequent confounders (ties keep first-seen order)
top_confounders = sorted(
    confounder_counts.items(), key=lambda item: item[1], reverse=True
)[:20]

print(f"\nTotal cohorts analyzed: {total_cohorts}")
print(f"Total unique confounders across all cohorts: {len(total_unique_confounders)}")

print("\nTop 20 most common confounders:")
print(f"{'Confounder':<30} {'Count':<8} {'Percentage':<10}")
print(f"{'-'*50}")
for conf, count in top_confounders:
    percentage = (count / total_cohorts) * 100
    print(f"{conf:<30} {count:<8} {percentage:.1f}%")

# Recommended columns that are excluded from the downstream analysis
discard_phenotype = [
    'study_condition',      # same as the disease label
    'number_reads',         # related to number_bases
    'age_category',         # related to age
    'median_read_length',   # not related to host factors
    'minimum_read_length',  # not related to host factors
    'disease_subtype',      # related to the disease
    'study_name'            # usually identical within one cohort
]

Processing ACVD/ACVD1: 385 samples in metadata
Successfully processed ACVD/ACVD1: 11 recommended confounders
Processing BD/BD1: 65 samples in metadata
Successfully processed BD/BD1: 7 recommended confounders
Processing CRC/CRC1: 55 samples in metadata
Successfully processed CRC/CRC1: 10 recommended confounders
Processing CRC/CRC2: 60 samples in metadata
Successfully processed CRC/CRC2: 11 recommended confounders
Processing CRC/CRC3: 80 samples in metadata
Successfully processed CRC/CRC3: 8 recommended confounders
Processing CRC/CRC4: 83 samples in metadata
Successfully processed CRC/CRC4: 16 recommended confounders
Processing CRC/CRC5: 110 samples in metadata
Successfully processed CRC/CRC5: 10 recommended confounders
Processing CRC/CRC6: 114 samples in metadata
Successfully processed CRC/CRC6: 11 recommended confounders
Processing CRC/CRC7: 125 samples in metadata
Successfully processed CRC/CRC7: 10 recommended confounders
Processing CRC/CRC8: 504 samples in metadata
Successfully proc

In [3]:
def _classify_confounders(metadata_df, confounders):
    """Label each confounder present in the metadata as "Numerical" or "Categorical"."""
    data_types = {}
    for col in confounders:
        if col not in metadata_df.columns:
            continue
        series = metadata_df[col]
        # Numeric columns with at most 10 distinct values are treated as coded categories.
        if pd.api.types.is_numeric_dtype(series) and series.nunique() > 10:
            data_types[col] = "Numerical"
        else:
            data_types[col] = "Categorical"
    return data_types


def _find_sample_id_column(eigensp_df):
    """Return the name of the sample-ID column of the eigenspecies table, or raise ValueError."""
    for candidate in ('sample', 'sample_id', 'ID'):
        if candidate in eigensp_df.columns:
            return candidate

    if len(eigensp_df.columns) >= 3:
        # Fall back to the first column that is neither 'module' nor 'eigensp'.
        other_cols = [col for col in eigensp_df.columns if col not in ('module', 'eigensp')]
        sample_id_col = other_cols[0]
        print(f"Using '{sample_id_col}' as sample ID column")
        return sample_id_col

    raise ValueError("Cannot determine sample ID column in eigensp_df")


def _simple_association(model_data, target_col, target_type):
    """Return (effect size, p-value) for the target vs. 'eigensp' without any adjustment.

    Numerical target: Pearson r and its p-value.
    Categorical target: eta-squared (between-group SS / total SS) and the
    one-way ANOVA p-value (None when it cannot be computed).
    """
    from scipy.stats import f_oneway, pearsonr

    values = model_data['eigensp']

    if target_type == "Numerical":
        r_value, p_value = pearsonr(model_data[target_col], values)
        return r_value, p_value

    # sort=False: group order is irrelevant here and mixed-type labels need not be sortable.
    grouped = values.groupby(model_data[target_col], sort=False)
    overall_mean = values.mean()
    between_ss = (grouped.size() * (grouped.mean() - overall_mean) ** 2).sum()
    total_ss = ((values - overall_mean) ** 2).sum()
    eta_squared = between_ss / total_ss if total_ss > 0 else 0

    groups = [group.values for _, group in grouped if len(group) > 0]
    p_value = None
    if len(groups) >= 2:
        try:
            _, p_value = f_oneway(*groups)
        except Exception:
            # Best effort: keep the effect size even if the test itself fails.
            p_value = None

    return eta_squared, p_value


def _fit_adjusted_model(model_data, target_col, target_type, covariates, categorical_covariates):
    """Fit 'eigensp ~ target + covariates' by OLS.

    Returns (coefficient, p-value, standard error, adjusted R^2) for the target
    term. For a categorical target the first level-contrast parameter of the
    target is reported; all three statistics are NaN when no target parameter
    exists in the fitted model.
    """
    import statsmodels.formula.api as smf

    def as_term(name, is_categorical):
        return f"C({name})" if is_categorical else name

    terms = [as_term(target_col, target_type == "Categorical")]
    terms.extend(as_term(conf, conf in categorical_covariates) for conf in covariates)
    formula = 'eigensp ~ ' + ' + '.join(terms)

    model = smf.ols(formula, data=model_data).fit()

    if target_type == "Numerical":
        param = target_col if target_col in model.params.index else None
    else:
        # Match the target's own term exactly. A plain substring test could pick
        # up a covariate whose name merely contains the target's name.
        prefix = f"C({target_col})["
        param = next((name for name in model.params.index if name.startswith(prefix)), None)

    if param is None:
        return np.nan, np.nan, np.nan, model.rsquared_adj

    return model.params[param], model.pvalues[param], model.bse[param], model.rsquared_adj


def _add_fdr_columns(results_df):
    """Add FDR-corrected p-value columns and the 'significant' flag to the results table."""
    from statsmodels.stats.multitest import fdrcorrection

    for raw_col, fdr_col in (
        ('simple_p_value', 'simple_fdr_p_value'),
        ('adjusted_p_value', 'adjusted_fdr_p_value'),
    ):
        p_values = results_df[raw_col].astype(float)
        if p_values.isna().all():
            continue
        # Missing p-values take part in the correction as p = 1.
        _, corrected = fdrcorrection(p_values.fillna(1))
        results_df[fdr_col] = corrected

    # When no adjusted p-value could be computed the column is still created,
    # so the significance flag (and downstream code) never hits a missing column.
    if 'adjusted_fdr_p_value' not in results_df.columns:
        results_df['adjusted_fdr_p_value'] = np.nan

    results_df['significant'] = results_df['adjusted_fdr_p_value'] < 0.05
    return results_df


def analyze_target_with_confounders(eigensp_df, metadata_df, recommended_confounders, discard_list=None):
    """
    Analyze the relationship between each target factor and eigenspecies, while controlling for other confounding factors

    Every retained confounder takes a turn as the target. For each
    (target, module) combination with at least 10 merged samples and at least
    20 complete rows, two statistics are computed:
      - unadjusted: Pearson r (numerical target) or eta-squared with a one-way
        ANOVA p-value (categorical target)
      - adjusted: OLS of eigensp on the target plus all other confounders; for
        a categorical target only the first level contrast is reported
    Both sets of p-values are FDR-corrected across all rows of the result, with
    missing p-values counted as 1.

    Parameters:
    eigensp_df: DataFrame containing modules and eigenspecies information
                (needs 'module' and 'eigensp' columns plus a sample ID column)
    metadata_df: DataFrame containing metadata information
    recommended_confounders: list of recommended confounding factors
    discard_list: list of confounding factors to exclude

    Returns:
    results_df: DataFrame containing analysis results (empty when nothing could be analyzed)

    Raises:
    ValueError: when no sample ID column can be determined in eigensp_df
    """
    # Filter out unwanted confounders
    discarded = set(discard_list) if discard_list else set()
    confounders = [col for col in recommended_confounders if col not in discarded]

    print(f"Using {len(confounders)} confounders after filtering: {confounders}")

    # Determine data types (only for confounders that exist in the metadata)
    data_types = _classify_confounders(metadata_df, confounders)
    numerical_confounders = [col for col, dtype in data_types.items() if dtype == "Numerical"]
    categorical_confounders = [col for col, dtype in data_types.items() if dtype == "Categorical"]

    print(f"Numerical confounders: {numerical_confounders}")
    print(f"Categorical confounders: {categorical_confounders}")

    sample_id_col = _find_sample_id_column(eigensp_df)

    # Join key on the metadata side: a 'sample_id' column, else a column named
    # like the eigenspecies sample column, else the metadata index.
    if 'sample_id' in metadata_df.columns:
        merge_kwargs = {'right_on': 'sample_id'}
    elif sample_id_col in metadata_df.columns:
        merge_kwargs = {'right_on': sample_id_col}
    else:
        merge_kwargs = {'right_index': True}

    all_results = []

    # For each confounder, take turns making it the target factor with the others as control variables
    for target_col in confounders:
        target_type = data_types.get(target_col)
        if target_type is None:
            print(f"Warning: Target column '{target_col}' not found in metadata. Skipping.")
            continue

        print(f"\nAnalyzing target: {target_col} ({target_type})")

        # Exclude current target, use rest as confounders
        current_confounders = [col for col in confounders if col != target_col and col in data_types]

        for module in eigensp_df['module'].unique():
            module_df = eigensp_df[eigensp_df['module'] == module]

            # Merge eigenspecies and metadata
            try:
                merged_df = pd.merge(
                    module_df,
                    metadata_df,
                    left_on=sample_id_col,
                    how='inner',
                    **merge_kwargs
                )
            except Exception as e:
                print(f"  Error merging data for module {module}: {str(e)}")
                continue

            if len(merged_df) < 10:
                print(f"  Module {module}: Too few samples after merging ({len(merged_df)}). Skipping.")
                continue

            # The merge renames columns present in both tables; without the
            # target or the response nothing can be computed for this module.
            if target_col not in merged_df.columns or 'eigensp' not in merged_df.columns:
                print(f"  Module {module}: Required column missing after merging. Skipping.")
                continue

            model_cols = [target_col] + current_confounders + ['eigensp']
            available_cols = [col for col in model_cols if col in merged_df.columns]

            if len(available_cols) < len(model_cols) - 5:  # Allow missing a few columns
                print(f"  Module {module}: Too many missing columns. Skipping.")
                continue

            # Complete cases only
            model_data = merged_df[available_cols].dropna()

            if len(model_data) < 20:  # Ensure enough samples
                print(f"  Module {module}: Too few samples after removing NA values ({len(model_data)}). Skipping.")
                continue

            if target_type == "Categorical" and model_data[target_col].nunique() < 2:
                print(f"  Module {module}: Target '{target_col}' has only one category. Skipping.")
                continue

            # 1. Unadjusted association (failure here does not prevent the regression)
            try:
                simple_corr, simple_p = _simple_association(model_data, target_col, target_type)
            except Exception as e:
                print(f"  Error calculating simple correlation for module {module}, target {target_col}: {str(e)}")
                simple_corr = None
                simple_p = None

            # 2. Regression model controlling for the remaining confounders
            covariates_used = [col for col in current_confounders if col in model_data.columns]
            try:
                coef, p_value, std_error, adjusted_r2 = _fit_adjusted_model(
                    model_data, target_col, target_type, covariates_used, categorical_confounders
                )

                print(f"  Module {module}: n={len(model_data)}, coef={coef:.3f}, p={p_value:.3e}")

                all_results.append({
                    'module': module,
                    'target': target_col,
                    'target_type': target_type,
                    'sample_size': len(model_data),

                    # Simple correlation results
                    'simple_correlation': simple_corr,
                    'simple_p_value': simple_p,

                    # Adjusted results
                    'adjusted_coefficient': coef,
                    'adjusted_p_value': p_value,
                    'adjusted_std_error': std_error,
                    'adjusted_r2': adjusted_r2,

                    # Record confounders used
                    'confounders_used': ','.join(covariates_used)
                })

            except Exception as e:
                print(f"  Error in regression for module {module}, target {target_col}: {str(e)}")
                continue

    if not all_results:
        print("No results generated.")
        return pd.DataFrame()  # Return empty DataFrame

    return _add_fdr_columns(pd.DataFrame(all_results))

In [4]:
def process_cohort(disease, cohort, data_dir, output_dir, discard_list):
    """
    Process data for a single cohort

    Reads the cohort metadata, its eigenspecies table and the list of
    recommended confounders, runs analyze_target_with_confounders and writes
    the resulting tables into <output_dir>/<disease>/<cohort>/phenotype.

    Parameters:
    disease: Disease name
    cohort: Cohort name
    data_dir: Data directory
    output_dir: Output directory
    discard_list: List of confounding factors to exclude

    Returns:
    success: True if the analysis produced a non-empty result table that was
             saved; False if no results were produced or any step raised
    """
    try:
        print(f"\n{'='*80}\nProcessing {disease}/{cohort}\n{'='*80}")

        # Build file paths
        metadata_path = os.path.join(data_dir, disease, cohort, 'metadata.tsv')
        eigensp_path = os.path.join(output_dir, disease, cohort, 'eigenspecies', f'{cohort}.eigenspecies.csv')
        confounders_path = os.path.join(output_dir, disease, cohort, 'phenotype', 'recommended_confounders.txt')
        output_path = os.path.join(output_dir, disease, cohort, 'phenotype')

        # Ensure output directory exists
        os.makedirs(output_path, exist_ok=True)

        metadata_df = pd.read_csv(metadata_path, sep='\t', header=0)
        print(f"Read metadata: {len(metadata_df)} samples, {len(metadata_df.columns)} variables")

        # NOTE(review): the file is named *.csv but is parsed as tab-separated;
        # presumably the upstream step writes tabs - confirm against the writer.
        eigensp_df = pd.read_csv(eigensp_path, index_col=0, header=0, sep="\t")
        print(f"Read eigenspecies: {len(eigensp_df)} rows, modules: {eigensp_df['module'].nunique()}")

        # Read recommended confounders (one name per line, blank lines ignored)
        with open(confounders_path, 'r') as f:
            recommended_confounders = [line.strip() for line in f if line.strip()]
        print(f"Read {len(recommended_confounders)} recommended confounders")

        # Filter out unwanted confounders
        filtered_confounders = [col for col in recommended_confounders if col not in discard_list]
        print(f"After filtering: {len(filtered_confounders)} confounders")

        # Check if these columns exist in the metadata
        available_confounders = [col for col in filtered_confounders if col in metadata_df.columns]
        if len(available_confounders) < len(filtered_confounders):
            missing = set(filtered_confounders) - set(available_confounders)
            print(f"Warning: {len(missing)} confounders not found in metadata: {missing}")

        # Perform analysis
        results = analyze_target_with_confounders(eigensp_df, metadata_df, available_confounders, discard_list)

        if results.empty:
            print("No results generated for this cohort")
            return False

        # Save results
        results_path = os.path.join(output_path, 'eigenspecies_target_analysis.tsv')
        results.to_csv(results_path, sep='\t', index=False)
        print(f"Results saved to: {results_path}")

        # Save summary of significant results
        if 'significant' in results.columns:
            sig_results = results[results['significant']]
            if not sig_results.empty:
                sig_path = os.path.join(output_path, 'significant_associations.tsv')
                sig_results.to_csv(sig_path, sep='\t', index=False)
                print(f"Significant associations saved to: {sig_path}")
                print(f"Found {len(sig_results)} significant associations")

        return True

    except Exception as e:
        # Single boundary handler: one failing cohort must not abort a batch
        # run. The original had two identical `except Exception` clauses, so
        # the second one (which named the cohort) could never execute.
        print(f"Error processing cohort {disease}/{cohort}: {str(e)}")
        return False

# Batch driver: run process_cohort once for every cohort folder found under
# each disease directory in data_dir. The original text contained this whole
# section twice in a row, which processed every cohort a second time and
# reset/recomputed the counters; a single pass is kept here.

# Disease folder names to scan (must match the directory names on disk)
plist = [
    'ACVD', 'BD', 'CRC', 'IBD', 'IGT', 'CFS', 'STH', 'T2D',
    'adenoma', 'asthma', 'carcinoma_surgery_history', 'hypertension',
    'migraine', 'schizofrenia'
]

# Directory paths
data_dir = '../data'
output_dir = '../result/large_scale_cohort'

# Statistics
total_cohorts = 0
processed_cohorts = 0

# Iterate through all diseases and cohorts
for disease in plist:
    disease_dir = os.path.join(data_dir, disease)

    # Check if disease directory exists
    if not os.path.isdir(disease_dir):
        print(f"Disease directory not found: {disease_dir}")
        continue

    # Iterate through all cohorts for this disease
    for cohort in os.listdir(disease_dir):
        cohort_dir = os.path.join(disease_dir, cohort)

        # Ensure it's a directory not a file
        if not os.path.isdir(cohort_dir):
            continue

        total_cohorts += 1
        # discard_phenotype is not defined in this section; it is expected to
        # already exist in the session when this code runs.
        success = process_cohort(disease, cohort, data_dir, output_dir, discard_phenotype)
        if success:
            processed_cohorts += 1

print(f"\nProcessing complete. Successfully processed {processed_cohorts}/{total_cohorts} cohorts.")


Processing ACVD/ACVD1
Read metadata: 385 samples, 130 variables
Read eigenspecies: 8085 rows, modules: 21
Read 11 recommended confounders
After filtering: 8 confounders
Using 8 confounders after filtering: ['disease', 'age', 'gender', 'number_bases', 'triglycerides', 'hdl', 'ldl', 'cholesterol']
Numerical confounders: ['age', 'number_bases', 'triglycerides', 'hdl', 'ldl', 'cholesterol']
Categorical confounders: ['disease', 'gender']

Analyzing target: disease (Categorical)
  Module S1_C1: n=336, coef=-2.491, p=2.322e-05
  Module S1_C10: n=336, coef=0.020, p=9.072e-01
  Module S1_C13: n=336, coef=-0.136, p=7.357e-01
  Module S1_C14: n=336, coef=0.154, p=8.200e-03
  Module S1_C15: n=336, coef=0.293, p=5.351e-01
  Module S1_C16: n=336, coef=-0.395, p=9.216e-02
  Module S1_C17: n=336, coef=-0.022, p=3.685e-01
  Module S1_C2: n=336, coef=-2.357, p=1.838e-02
  Module S1_C20: n=336, coef=-0.672, p=3.889e-01
  Module S1_C3: n=336, coef=6.539, p=1.113e-03
  Module S1_C4: n=336, coef=-0.036, p=

In [5]:
def _read_association_table(file_path):
    """Read one associations file, falling back from tab to comma separation for non-.tsv files."""
    if file_path.endswith('.tsv'):
        return pd.read_csv(file_path, sep='\t')

    try:
        df = pd.read_csv(file_path, sep='\t')
    except pd.errors.ParserError:
        return pd.read_csv(file_path, sep=',')

    # A comma-separated file parsed with sep='\t' does not raise: it comes
    # back as a single column whose header still contains the commas. Detect
    # that case explicitly, otherwise the comma fallback would never be used.
    if df.shape[1] == 1 and ',' in str(df.columns[0]):
        return pd.read_csv(file_path, sep=',')
    return df


def merge_significant_associations(output_dir='../result/large_scale_cohort', diseases=None):
    """
    Iterate through all cohort folders and merge all significant_associations.tsv files

    For every <output_dir>/<disease>/<cohort>/phenotype folder the function
    looks for 'significant_associations.tsv' and, if that is absent, for the
    alternative name 'significant_association.txt'. Files that cannot be read
    are reported and skipped.

    Parameters:
    output_dir: Root results directory (defaults to the project results folder)
    diseases: Iterable of disease folder names to scan; None uses the built-in list

    Returns:
    Merged DataFrame with 'disease' and 'cohort' as the first two columns,
    or None when no file could be read
    """
    if diseases is None:
        # Disease list
        diseases = [
            'ACVD', 'BD', 'CRC', 'IBD', 'IGT', 'CFS', 'STH', 'T2D',
            'adenoma', 'asthma', 'carcinoma_surgery_history', 'hypertension',
            'migraine', 'schizofrenia'
        ]

    # (disease, cohort, path) for every results file found
    all_files = []

    # Iterate through all diseases and cohorts
    for disease in diseases:
        disease_dir = os.path.join(output_dir, disease)

        # Check if disease directory exists
        if not os.path.isdir(disease_dir):
            print(f"Disease directory not found: {disease_dir}")
            continue

        # Sorted so the merge order does not depend on filesystem listing order
        for cohort in sorted(os.listdir(disease_dir)):
            cohort_dir = os.path.join(disease_dir, cohort)

            # Ensure it's a directory not a file
            if not os.path.isdir(cohort_dir):
                continue

            phenotype_dir = os.path.join(cohort_dir, 'phenotype')
            sig_file = os.path.join(phenotype_dir, 'significant_associations.tsv')
            alt_file = os.path.join(phenotype_dir, 'significant_association.txt')

            if os.path.exists(sig_file):
                all_files.append((disease, cohort, sig_file))
                print(f"Found: {disease}/{cohort}")
            elif os.path.exists(alt_file):
                # Alternative naming
                all_files.append((disease, cohort, alt_file))
                print(f"Found (alt): {disease}/{cohort}")

    print(f"\nFound {len(all_files)} files with significant associations")

    # Merge all files
    all_data = []

    for disease, cohort, file_path in all_files:
        try:
            df = _read_association_table(file_path)

            # Add disease and cohort information
            df['disease'] = disease
            df['cohort'] = cohort

            all_data.append(df)
            print(f"Added {len(df)} rows from {disease}/{cohort}")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge
            print(f"Error reading {file_path}: {str(e)}")

    # Check if there's any data
    if not all_data:
        print("No data found!")
        return None

    # Merge all data
    merged_df = pd.concat(all_data, ignore_index=True)

    # Ensure column order: identifiers first, everything else as read
    cols = ['disease', 'cohort']
    other_cols = [col for col in merged_df.columns if col not in cols]
    merged_df = merged_df[cols + other_cols]

    print(f"\nMerged data: {len(merged_df)} rows, {len(merged_df.columns)} columns")

    return merged_df

# Merge the per-cohort significant-association tables and print summary counts.
# The original text contained this whole section twice in a row, which
# re-scanned the results tree and printed the report a second time; a single
# pass is kept here.
merged_df = merge_significant_associations()

if merged_df is not None:
    # Save merged file (disabled)
    #output_path = '../result/large_scale_cohort/all_significant_associations.tsv'
    #merged_df.to_csv(output_path, sep='\t', index=False)
    #print(f"Results saved to: {output_path}")

    # Count significant associations for each disease and cohort
    summary = merged_df.groupby(['disease', 'cohort']).size().reset_index(name='count')
    summary = summary.sort_values(['disease', 'count'], ascending=[True, False])

    # Save summary (disabled)
    #summary_path = '../result/large_scale_cohort/significant_associations_summary.tsv'
    #summary.to_csv(summary_path, sep='\t', index=False)
    #print(f"Summary saved to: {summary_path}")

    # Print summary statistics
    print("\nSummary by disease:")
    disease_summary = merged_df.groupby('disease').size().reset_index(name='count')
    disease_summary = disease_summary.sort_values('count', ascending=False)
    for disease_name, n_assoc in zip(disease_summary['disease'], disease_summary['count']):
        print(f"{disease_name}: {n_assoc} significant associations")

    # Count target factors (ten most frequent)
    if 'target' in merged_df.columns:
        print("\nTop targets with significant associations:")
        target_summary = merged_df.groupby('target').size().reset_index(name='count')
        target_summary = target_summary.sort_values('count', ascending=False).head(10)
        for target_name, n_assoc in zip(target_summary['target'], target_summary['count']):
            print(f"{target_name}: {n_assoc} associations")

    # Count modules (ten most frequent)
    if 'module' in merged_df.columns:
        print("\nTop modules with significant associations:")
        module_summary = merged_df.groupby('module').size().reset_index(name='count')
        module_summary = module_summary.sort_values('count', ascending=False).head(10)
        for module_name, n_assoc in zip(module_summary['module'], module_summary['count']):
            print(f"{module_name}: {n_assoc} associations")

Found: ACVD/ACVD1
Found: CRC/CRC2
Found: CRC/CRC5
Found: CRC/CRC8
Found: CRC/CRC9
Found: IBD/IBD1
Found: IBD/IBD2
Found: IBD/IBD3
Found: IGT/IGT2
Found: T2D/T2D1
Found: T2D/T2D2
Found: adenoma/adenoma3
Found: asthma/asthma1
Found: carcinoma_surgery_history/carcinoma_surgery_history1
Found: migraine/migraine1
Found: schizofrenia/schizofrenia1

Found 16 files with significant associations
Added 2 rows from ACVD/ACVD1
Added 1 rows from CRC/CRC2
Added 3 rows from CRC/CRC5
Added 11 rows from CRC/CRC8
Added 2 rows from CRC/CRC9
Added 19 rows from IBD/IBD1
Added 21 rows from IBD/IBD2
Added 46 rows from IBD/IBD3
Added 22 rows from IGT/IGT2
Added 4 rows from T2D/T2D1
Added 2 rows from T2D/T2D2
Added 3 rows from adenoma/adenoma3
Added 1 rows from asthma/asthma1
Added 3 rows from carcinoma_surgery_history/carcinoma_surgery_history1
Added 1 rows from migraine/migraine1
Added 4 rows from schizofrenia/schizofrenia1

Merged data: 145 rows, 16 columns

Summary by disease:
IBD: 86 significant associati