Eigenspecies analysis of antibiotic cohort
- prepare a group file for each comparison pair (two groups per comparison)
- calculate the eigenspecies of all FRCs across all samples in the two groups
- construct an eigenspecies correlation network for each of the two groups
- compute the preservation matrix between the two groups' correlation matrices
- compare the differences between the two groups' eigenspecies networks


In [None]:
import pandas as pd
import os
import numpy as np


# Create group file for every comparison
os.makedirs("../result/Anti/eigenspecies", exist_ok=True)

anti_df = pd.read_csv("../data/Anti/Anti.group.tsv", sep="\t")

# Map every group label to the samples that carry it. A sample is registered
# under its 'disease_day' label and, when present, also under its 'ECC_day'
# label. The inner dict is used as an insertion-ordered set (value is always 1).
# Columns are iterated directly rather than via iterrows() so that values keep
# their per-column dtype instead of being coerced to a common row dtype.
all_dict = {}
for disease_day, ecc_day, sample_id in zip(
    anti_df['disease_day'], anti_df['ECC_day'], anti_df['sample_id']
):
    all_dict.setdefault(disease_day, {})[sample_id] = 1

    # ECC_day is optional; skip missing values
    if pd.notna(ecc_day):
        all_dict.setdefault(ecc_day, {})[sample_id] = 1

compare_df = pd.read_csv("../data/Anti/Anti.compare.list", sep="\t", header=None, names=['g1', 'g2'])

# Process each comparison: write one <g1>.<g2>.group.tsv listing the samples
# of both groups (g1 first, then g2; samples sorted within each group).
for g1, g2 in zip(compare_df['g1'], compare_df['g2']):
    output_data = []

    for g in (g1, g2):
        if g not in all_dict:
            # Make the omission visible instead of silently writing a
            # one-sided (or empty) group file.
            print(f"Warning: group {g} has no samples in the group table")
            continue

        output_data.extend(
            {'sample_id': sample, 'group': g} for sample in sorted(all_dict[g])
        )

    # Explicit columns keep the header present even when no sample was found,
    # so downstream readers can still parse the file and report the empty
    # groups cleanly.
    output_df = pd.DataFrame(output_data, columns=['sample_id', 'group'])
    output_path = f"../result/Anti/eigenspecies/{g1}.{g2}.group.tsv"
    output_df.to_csv(output_path, sep="\t", index=False)

    print(f"Created file: {output_path} with {len(output_df)} samples")

In [2]:
import seaborn as sns
from matplotlib import pyplot as plt
from eigenspecies_utils import calculate_eigenspecies, eigenspecies_correlation_network, get_preserv_matrix, compare_eigenspecies_networks, calculate_eigenspecies_together

def _plot_preservation_matrix(preserv_matrix, g1, g2, out_path):
    """Save a clustered heatmap of the preservation matrix, or warn and skip."""
    # Clustering needs at least two rows and two columns
    if preserv_matrix.shape[0] <= 1 or preserv_matrix.shape[1] <= 1:
        print(f"Warning: Preservation matrix for {g1} vs {g2} is too small for clustering. Skipping visualization.")
        return

    # NaNs or an all-zero matrix are treated as unsuitable for clustering
    if preserv_matrix.isnull().values.any() or (preserv_matrix.to_numpy() == 0).all():
        print(f"Warning: Preservation matrix for {g1} vs {g2} contains invalid values. Skipping visualization.")
        return

    try:
        # clustermap is figure-level: it creates its own figure, so the size
        # and title must be set through the call and the returned grid rather
        # than through a separate plt.figure()/plt.title().
        grid = sns.clustermap(preserv_matrix, annot=True, cmap='YlOrRd', figsize=(12, 10))
        grid.fig.suptitle('Eigenspecies Matrix')
        grid.savefig(out_path)
    except Exception as e:
        # Plotting is best-effort; the numeric results are still returned
        print(f"Error creating clustermap for {g1} vs {g2}: {str(e)}")


def _preservation_density(preserv_matrix):
    """Return the mean off-diagonal value of the preservation matrix (0 if n <= 1)."""
    n = preserv_matrix.shape[0]
    if n <= 1:  # Avoid division by zero
        return 0
    off_diagonal_sum = preserv_matrix.sum().sum() - np.trace(preserv_matrix.to_numpy())
    return off_diagonal_sum / (n * (n - 1))


def analyze_eigenspecies(g1, g2, base_path="../result/Anti/eigenspecies", expr_df=None, species_FRC=None):
    """
    Analyze eigenspecies for two groups, generate networks, and compare them.

    Reads ``{base_path}/{g1}.{g2}.group.tsv`` and writes every result file
    next to it using the same ``{base_path}/{g1}.{g2}`` prefix.

    Parameters:
    -----------
    g1 : str
        Name of the first group
    g2 : str
        Name of the second group
    base_path : str, optional
        Base path for input/output files
    expr_df : pandas.DataFrame, optional
        Expression dataframe with sample ids as columns (must be provided)
    species_FRC : object, optional
        Species FRC object, passed through to
        ``calculate_eigenspecies_together`` (must be provided)

    Returns:
    --------
    dict or None
        Dictionary containing results of the analysis with keys
        'eigenspecies_df', 'g1_network', 'g2_network',
        'g1_sample_cluster_matrix', 'g2_sample_cluster_matrix',
        'preserv_matrix', 'density' and 'comparison_results'.
        None is returned (after printing a message) when a required input is
        missing, when either group has no samples, or when any step raises.
    """
    # The defaults exist only for keyword-style calls; both inputs are required
    if expr_df is None or species_FRC is None:
        print(f"Error in analyze_eigenspecies for {g1} vs {g2}: expr_df and species_FRC must be provided")
        return None

    try:
        # Set up paths
        prefix = f'{base_path}/{g1}.{g2}'

        # Load metadata
        meta_df = pd.read_csv(f'{prefix}.group.tsv', sep='\t')

        # Get sample lists
        g1_samples = meta_df.loc[meta_df['group'] == g1, 'sample_id'].tolist()
        g2_samples = meta_df.loc[meta_df['group'] == g2, 'sample_id'].tolist()

        # Check if we have samples in both groups
        if not g1_samples or not g2_samples:
            print(f"Warning: Missing samples for {g1} or {g2}. Skipping analysis.")
            return None

        # Filter expression data; a sample id absent from expr_df raises
        # KeyError here, which is reported by the handler below
        all_samples = g1_samples + g2_samples
        filtered_expr_df = expr_df.loc[:, all_samples]

        # Calculate eigenspecies
        eigenspecies_results = calculate_eigenspecies_together(
            filtered_expr_df, species_FRC, meta_df, g1_samples, g2_samples, g1, g2
        )
        eigenspecies_df = pd.DataFrame(eigenspecies_results)
        # NOTE: tab-separated despite the .csv extension; the name is kept
        # because other tools may read this path
        eigenspecies_df.to_csv(f"{prefix}.eigenspecies.csv", sep='\t')

        # Create g1 network
        g1_network, g1_sample_cluster_matrix = eigenspecies_correlation_network(
            eigenspecies_df, g1, prefix
        )
        g1_network.to_csv(f"{prefix}.eigenspecies_cor.{g1}.tsv", sep='\t')

        # Create g2 network
        g2_network, g2_sample_cluster_matrix = eigenspecies_correlation_network(
            eigenspecies_df, g2, prefix
        )
        g2_network.to_csv(f"{prefix}.eigenspecies_cor.{g2}.tsv", sep='\t')

        # Preservation matrix
        preserv_matrix = get_preserv_matrix(g1_network, g2_network)
        preserv_matrix = preserv_matrix.astype(float)
        preserv_matrix.to_csv(f"{prefix}.preserv_matrix.tsv", sep='\t')

        # Plot preservation matrix (skipped with a warning when unsuitable)
        _plot_preservation_matrix(preserv_matrix, g1, g2, f"{prefix}.preserv_matrix.png")

        # Calculate density
        density = _preservation_density(preserv_matrix)
        print(f"Density of preservation of eigenspecies of {g1} and {g2} is {density}")

        # Compare networks; a failure here degrades to an empty result table
        try:
            results = compare_eigenspecies_networks(g1_sample_cluster_matrix, g2_sample_cluster_matrix)
            results.to_csv(f"{prefix}.compare_eigenspecies_networks.tsv", sep='\t')
        except Exception as e:
            print(f"Error comparing eigenspecies networks for {g1} vs {g2}: {str(e)}")
            results = pd.DataFrame()

        # Close all figures to prevent memory issues
        plt.close('all')

        # Return all relevant results
        return {
            'eigenspecies_df': eigenspecies_df,
            'g1_network': g1_network,
            'g2_network': g2_network,
            'g1_sample_cluster_matrix': g1_sample_cluster_matrix,
            'g2_sample_cluster_matrix': g2_sample_cluster_matrix,
            'preserv_matrix': preserv_matrix,
            'density': density,
            'comparison_results': results
        }

    except Exception as e:
        # Batch-friendly boundary: report and signal failure with None so the
        # caller can continue with the next comparison
        print(f"Error in analyze_eigenspecies for {g1} vs {g2}: {str(e)}")
        plt.close('all')
        return None

In [None]:
# Inputs shared by every comparison
species_FRC = pd.read_csv('../result/GCN_fix_tree/leaves_cluster.tsv', sep='\t')
expr_df = pd.read_csv('../data/Anti/abd.tsv', sep='\t', index_col=0)
# Keep only the last '|'-separated field of each row label
expr_df.index = expr_df.index.str.split('|').str[-1]

# Bookkeeping of (g1, g2) pairs by outcome
successful_analyses = []
failed_analyses = []

compare_df = pd.read_csv("../data/Anti/Anti.compare.list", sep="\t", header=None, names=['g1', 'g2'])

# Run the analysis for each listed pair; one failing pair never stops the rest
for _, (g1, g2) in compare_df.iterrows():
    pair = (g1, g2)

    try:
        result = analyze_eigenspecies(g1=g1, g2=g2, expr_df=expr_df, species_FRC=species_FRC)
    except Exception as e:
        print(f"Failed to analyze {g1} vs {g2}: {str(e)}")
        failed_analyses.append(pair)
    else:
        # A None result means the analysis reported its own failure
        bucket = failed_analyses if result is None else successful_analyses
        bucket.append(pair)
    finally:
        # Release any figures left open by the analysis
        plt.close('all')

print(f"Successfully analyzed {len(successful_analyses)} comparisons")
print(f"Failed to analyze {len(failed_analyses)} comparisons")

In [None]:
import sys
import os
import shutil

# check if Rscript exists
# Preferred location: next to the running Python executable (where a conda
# environment places it). If it is not there, fall back to a PATH lookup so
# that rscript_path does not keep pointing at a file that does not exist.
python_executable = sys.executable
python_dir = os.path.dirname(python_executable)
rscript_path = os.path.join(python_dir, 'Rscript')
if os.path.exists(rscript_path):
    print(f"Rscript path: {rscript_path}")
else:
    print(f"Could not find Rscript in Python directory: {rscript_path}")
    rscript_on_path = shutil.which('Rscript')
    if rscript_on_path is not None:
        rscript_path = rscript_on_path
        print(f"Rscript path: {rscript_path}")
    else:
        print("Please ensure R is installed in your conda environment")

In [None]:
#  visualization
# Runs run_preservation.R once per comparison, passing the '<g1>.<g2>' prefix
# as its only argument. Relies on rscript_path being defined beforehand.
import subprocess
compare_df = pd.read_csv("../data/Anti/Anti.compare.list", sep="\t", header=None, names=['g1', 'g2'])

for _, row in compare_df.iterrows():
    g1, g2 = row['g1'], row['g2']
    prefix = f'{g1}.{g2}'
    cmd_args = [rscript_path, "run_preservation.R", prefix]
    try:
        print(f"Executing: {' '.join(cmd_args)}")
        result = subprocess.run(cmd_args, capture_output=True, text=True)
    except Exception as e:
        # e.g. the Rscript executable could not be started; keep going with
        # the remaining comparisons
        print(f"Exception occurred while processing: {e}")
        continue

    # Output is captured, so a failing R script would otherwise go unnoticed:
    # surface the exit code and whatever it wrote to stderr.
    if result.returncode != 0:
        print(f"run_preservation.R failed for {prefix} (exit code {result.returncode})")
        if result.stderr:
            print(result.stderr.strip())
print("Done")

