Eigenspecies analysis of antibiotic cohort
- Prepare a group file for each comparison pair (two groups per comparison)
- Calculate the eigenspecies of all FRCs across all samples in the two groups
- Construct an eigenspecies correlation network for each of the two groups
- Compute the preservation matrix between the correlation matrices of the two groups
- Compare the differences between the eigenspecies networks of the two groups


In [1]:
import pandas as pd
import os
import numpy as np


# Create one group file per comparison listed in Anti.compare.list.
# Each output file has two tab-separated columns: 'sample_id' and 'group'.
os.makedirs("../result/Anti/eigenspecies", exist_ok=True)

anti_df = pd.read_csv("../data/Anti/Anti.group.tsv", sep="\t")

# Map every group label -> {sample_id: 1}. A sample is always registered under
# its 'disease_day' label and, when 'ECC_day' is not missing, under that label too.
all_dict = {}
for disease_day, ecc_day, sample_id in zip(
    anti_df['disease_day'], anti_df['ECC_day'], anti_df['sample_id']
):
    all_dict.setdefault(disease_day, {})[sample_id] = 1
    if pd.notna(ecc_day):
        all_dict.setdefault(ecc_day, {})[sample_id] = 1

# Two unnamed columns: the pair of group labels that form one comparison.
compare_df = pd.read_csv("../data/Anti/Anti.compare.list", sep="\t", header=None, names=['g1', 'g2'])

# Process each comparison
for g1, g2 in zip(compare_df['g1'], compare_df['g2']):
    output_data = []

    for g in [g1, g2]:
        if g not in all_dict:
            # Surface the problem instead of silently writing a file that is
            # missing one (or both) of the groups.
            print(f"Warning: group '{g}' not found in the group table; no samples added for it")
            continue

        # Samples are written in sorted order, g1's samples before g2's.
        output_data.extend(
            {'sample_id': sample, 'group': g} for sample in sorted(all_dict[g])
        )

    # Fix the column set explicitly so that an empty comparison still yields a
    # file with the 'sample_id'/'group' header that downstream cells index by.
    output_df = pd.DataFrame(output_data, columns=['sample_id', 'group'])
    output_path = f"../result/Anti/eigenspecies/{g1}.{g2}.group.tsv"
    output_df.to_csv(output_path, sep="\t", index=False)

    print(f"Created file: {output_path} with {len(output_df)} samples")

Created file: ../result/Anti/eigenspecies/Health_0.Health_7.group.tsv with 12 samples
Created file: ../result/Anti/eigenspecies/Health_0.Health_90.group.tsv with 12 samples
Created file: ../result/Anti/eigenspecies/Health_7.Health_90.group.tsv with 12 samples
Created file: ../result/Anti/eigenspecies/Exposed_0.Exposed_7.group.tsv with 36 samples
Created file: ../result/Anti/eigenspecies/Exposed_0.Exposed_90.group.tsv with 36 samples
Created file: ../result/Anti/eigenspecies/Exposed_7.Exposed_90.group.tsv with 36 samples
Created file: ../result/Anti/eigenspecies/EB_0.EB_7.group.tsv with 12 samples
Created file: ../result/Anti/eigenspecies/EB_0.EB_90.group.tsv with 12 samples
Created file: ../result/Anti/eigenspecies/EB_7.EB_90.group.tsv with 12 samples
Created file: ../result/Anti/eigenspecies/EN_0.EN_7.group.tsv with 24 samples
Created file: ../result/Anti/eigenspecies/EN_0.EN_90.group.tsv with 24 samples
Created file: ../result/Anti/eigenspecies/EN_7.EN_90.group.tsv with 24 samples
Cr

In [2]:
from eigenspecies_utils import calculate_eigenspecies, eigenspecies_correlation_network, get_preserv_matrix, compare_eigenspecies_networks, calculate_eigenspecies_together


# Load the inputs shared by the downstream cells.

# Cluster table read from leaves_cluster.tsv
# (presumably the species -> FRC assignment; confirm against eigenspecies_utils).
species_FRC = pd.read_csv('../result/GCN_fix_tree/leaves_cluster.tsv', sep='\t')

# Abundance table; the first column becomes the index, and each index label is
# reduced to the text after its last '|'.
expr_df = pd.read_csv('../data/Anti/abd.tsv', sep='\t', index_col=0)
expr_df.index = expr_df.index.str.rsplit('|', n=1).str[-1]

# The comparison analysed in the rest of the notebook, and its group file.
g1, g2 = 'Exposed_0', 'Exposed_90'
prefix = f'../result/Anti/eigenspecies/{g1}.{g2}'
meta_df = pd.read_csv(f'{prefix}.group.tsv', sep='\t')



In [3]:



# Sample IDs belonging to each group of the comparison, in group-file order.
group_labels = meta_df['group']
g1_samples = meta_df.loc[group_labels == g1, 'sample_id'].tolist()
g2_samples = meta_df.loc[group_labels == g2, 'sample_id'].tolist()

# Per-group abundance sub-tables (columns restricted to that group's samples).
g1_expr = expr_df.loc[:, g1_samples]
g2_expr = expr_df.loc[:, g2_samples]

# Compute eigenspecies for each group independently: g1 first, then g2.
eigenspecies_results = []
for label, group_expr in ((g1, g1_expr), (g2, g2_expr)):
    eigenspecies_results.extend(calculate_eigenspecies(group_expr, species_FRC, label))

# Collect the records into one table and save it next to the group file
# (tab-separated, despite the .csv extension).
eigenspecies_df = pd.DataFrame(eigenspecies_results)
eigenspecies_df.to_csv(f"{prefix}.eigenspecies.csv", sep='\t')


In [4]:



# Sample IDs for each group, in group-file order, and their concatenation.
g1_samples = meta_df[meta_df['group'] == g1]['sample_id'].tolist()
g2_samples = meta_df[meta_df['group'] == g2]['sample_id'].tolist()

all_samples = g1_samples + g2_samples

# Restrict the abundance table to the samples of this comparison. The subset is
# bound to a new name rather than overwriting `expr_df`, so the full table
# loaded earlier stays intact and other cells that read `expr_df` do not
# depend on whether this cell has already been run.
expr_both = expr_df.loc[:, all_samples]

# Compute eigenspecies on both groups jointly and save the result
# (tab-separated, despite the .csv extension).
eigenspecies_results = calculate_eigenspecies_together(expr_both, species_FRC, meta_df, g1_samples, g2_samples, g1, g2)
eigenspecies_df = pd.DataFrame(eigenspecies_results)
eigenspecies_df.to_csv(f"{prefix}.together.eigenspecies.csv", sep='\t')


In [5]:
# Build the eigenspecies correlation network for each group (g1 first, then g2).
# Each call returns a pair: the network and that group's sample-cluster matrix.
g1_result, g2_result = [
    eigenspecies_correlation_network(eigenspecies_df, group_label, prefix)
    for group_label in (g1, g2)
]

# Network / sample-cluster matrix for the first group
g1_network, g1_sample_cluster_matrix = g1_result

# Network / sample-cluster matrix for the second group
g2_network, g2_sample_cluster_matrix = g2_result

In [6]:
import seaborn as sns
from matplotlib import pyplot as plt

# Compute the Preserv(1,2) matrix between the g1 and g2 eigenspecies networks
preserv_matrix = get_preserv_matrix(g1_network, g2_network)

preserv_matrix = preserv_matrix.astype(float)
preserv_matrix.to_csv("{}.preserv_matrix.tsv".format(prefix),sep='\t')

# sns.clustermap always creates its own figure, so the size must be passed to it
# directly; a preceding plt.figure() would only leave an empty extra figure.
cluster_grid = sns.clustermap(preserv_matrix, annot=True, cmap='YlOrRd', figsize=(12, 10))
# Title the clustermap's own figure rather than whichever axes pyplot considers
# current (a clustermap has several: heatmap, dendrograms, colorbar).
cluster_grid.ax_heatmap.figure.suptitle('Eigenspecies Matrix')
cluster_grid.savefig("{}.preserv_matrix.png".format(prefix))

# Density = mean of the off-diagonal entries of the preservation matrix.
n = preserv_matrix.shape[0]
if n > 1:  # Avoid division by zero
    # Exclude diagonal elements (self-connections) from calculation
    off_diagonal_sum = preserv_matrix.sum().sum() - np.trace(preserv_matrix.to_numpy())
    density = off_diagonal_sum / (n * (n - 1))
else:
    density = 0

print(density)

0.8011442823528266


In [7]:

# Compare the two groups' eigenspecies networks using their sample-cluster matrices.
results = compare_eigenspecies_networks(g1_sample_cluster_matrix,g2_sample_cluster_matrix)

# Save the comparison table next to the other outputs of this comparison.
results.to_csv("{}.compare_eigenspecies_networks.tsv".format(prefix), sep='\t')

# A bare expression is only rendered when it is the last statement of the cell,
# so the display comes after the save.
results