Correlation between taxonomic diversity and eigenspecies of EN0 and EB0 samples

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

# Load the Simpson diversity table of the Antibiotic cohort
# (doi: 10.1038/ismej.2015.148, Supplementary Table 1).
# The first TSV column is read as the index and then exposed as a regular
# `sample_id` column. The axis is named explicitly rather than renaming the
# default 'index' label, so this keeps working even if that column has a header.
alpha_diversity = (
    pd.read_csv('../data/Anti/Antibiotic.diversity.Frederic.tsv', sep='\t', index_col=0)
    .rename_axis('sample_id')
    .reset_index()
)

# Load the eigenspecies of the EN_0 and EB_0 groups.
# NOTE: the file is tab-separated despite its .csv suffix.
eigenspecies_df = pd.read_csv(
    '../result/Anti/eigenspecies/EB_0.EN_0.eigenspecies.csv',
    index_col=0,
    header=0,
    sep="\t",
)


In [2]:
# Display the eigenspecies table (columns: sample, group, cluster, eigenspecies).
eigenspecies_df

Unnamed: 0,sample,group,cluster,eigenspecies
0,P11E0,EB_0,S1_C1,-0.089232
1,P12E0,EB_0,S1_C1,0.876658
2,P17E0,EB_0,S1_C1,-0.252746
3,P18E0,EB_0,S1_C1,0.375255
4,P20E0,EB_0,S1_C1,-0.347970
...,...,...,...,...
373,P2E0,EN_0,S6_C3,-0.069362
374,P3E0,EN_0,S6_C3,-0.051200
375,P4E0,EN_0,S6_C3,-0.042707
376,P5E0,EN_0,S6_C3,-0.078305


In [3]:
# Reorganize the diversity table from one row per sample into one row per
# individual, with one column per (timepoint, taxonomic level) pair named
# "<timepoint>_Diversity_<F|G|S>".

timepoints = ['E0', 'E7', 'E90']
# Maps the suffix of the source column `diversity_<level>` to its one-letter abbreviation.
level_abbreviations = {'family': 'F', 'genus': 'G', 'species': 'S'}

# Split each sample ID into its individual part (P<digits>) and its timepoint
# part (E<digits>); IDs without an E<digits> part get NaN and are filtered out below.
alpha_diversity['individual'] = alpha_diversity['sample_id'].str.extract(r'(P\d+)')[0]
alpha_diversity['timepoint'] = alpha_diversity['sample_id'].str.extract(r'(E\d+)')[0]
exposed_diversity = alpha_diversity[alpha_diversity['timepoint'].isin(timepoints)]

# Baseline (E0) rows only. Not referenced by the cells visible here; kept in
# case other cells rely on it.
e0_samples = exposed_diversity[exposed_diversity['timepoint'] == 'E0']

new_columns = [
    f"{timepoint}_Diversity_{abbr}"
    for timepoint in timepoints
    for abbr in level_abbreviations.values()
]

individuals = exposed_diversity['individual'].unique()

# Collect one record per individual and build the frame once at the end.
# Growing a DataFrame with pd.concat inside the loop is quadratic and, because
# the seed frame is empty with object dtype, can leave the numeric columns as
# object dtype (and triggers a pandas deprecation warning).
records = []
for individual in individuals:
    new_row = {'sample_id': individual}
    individual_samples = exposed_diversity[exposed_diversity['individual'] == individual]

    for timepoint in timepoints:
        time_samples = individual_samples[individual_samples['timepoint'] == timepoint]
        if time_samples.empty:
            # No sample at this timepoint: the columns stay missing (NaN).
            continue

        for level, abbr in level_abbreviations.items():
            # If several samples share the timepoint, the first one is used.
            new_row[f"{timepoint}_Diversity_{abbr}"] = time_samples[f"diversity_{level}"].iloc[0]

    records.append(new_row)

# Passing `columns` fixes the column order and creates NaN columns for any
# (timepoint, level) pair that no individual has.
reorganized_data = pd.DataFrame(records, columns=['sample_id'] + new_columns)

# Key each individual by its E0 sample ID (e.g. "P11" -> "P11E0") so the rows
# can be matched against the `sample` column of the eigenspecies table.
reorganized_data['sample_id'] = reorganized_data['sample_id'] + 'E0'
reorganized_data = reorganized_data.set_index('sample_id')
selected_metadata_df = reorganized_data

In [4]:
# Display the per-individual diversity table (indexed by E0 sample ID).
selected_metadata_df

Unnamed: 0_level_0,E0_Diversity_F,E0_Diversity_G,E0_Diversity_S,E7_Diversity_F,E7_Diversity_G,E7_Diversity_S,E90_Diversity_F,E90_Diversity_G,E90_Diversity_S
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P10E0,0.532544,0.49719,0.945785,0.396756,0.39213,0.94668,0.473491,0.467002,0.946319
P11E0,0.372712,0.365698,0.891092,0.575924,0.57234,0.907796,0.315245,0.307464,0.952618
P12E0,0.337907,0.331985,0.756375,0.570061,0.560863,0.483727,0.422769,0.417761,0.715459
P13E0,0.563132,0.562779,0.916413,0.301407,0.298947,0.792471,0.417627,0.414962,0.898541
P14E0,0.714458,0.717341,0.799351,0.700549,0.705697,0.882701,0.758534,0.762123,0.87884
P15E0,0.329417,0.323401,0.911315,0.128132,0.125824,0.833754,0.210885,0.20635,0.914693
P17E0,0.585333,0.58344,0.875106,0.376184,0.370515,0.950578,0.465318,0.464296,0.913252
P18E0,0.529631,0.529447,0.934943,0.211066,0.195079,0.766424,0.305622,0.305254,0.92394
P19E0,0.831092,0.837258,0.931509,0.844488,0.8284,0.93982,0.727317,0.733452,0.790334
P1E0,0.769141,0.775785,0.949256,0.728418,0.731629,0.941198,0.573098,0.568159,0.909619


In [5]:
# Pearson correlation between each cluster's eigenspecies values and every
# diversity column (timepoints E0, E7 and E90; family, genus and species level).

results = []
for cluster in eigenspecies_df['cluster'].unique():
    cluster_df = eigenspecies_df[eigenspecies_df['cluster'] == cluster]

    # Attach the diversity columns to this cluster's samples. `sample_id` is the
    # index level of selected_metadata_df; the inner join keeps only samples
    # present in both tables.
    merged_df = cluster_df.merge(selected_metadata_df, left_on='sample', right_on='sample_id', how='inner')

    for col in selected_metadata_df.columns.tolist():
        # Keep only complete pairs, and coerce to float so object-dtype columns
        # holding numeric values do not break the correlation.
        valid_rows = merged_df[[col, 'eigenspecies']].dropna().astype(float)

        if len(valid_rows) < 2:
            # pearsonr needs at least two paired observations; record the
            # correlation as undefined instead of aborting the whole loop.
            corr, p_value = np.nan, np.nan
        else:
            corr, p_value = pearsonr(valid_rows[col], valid_rows['eigenspecies'])

        results.append({
            'cluster': cluster,
            'column': col,
            'correlation': corr,
            'p_value': p_value
        })

# Collect the results into a DataFrame (nothing is written to disk here).
# Explicit columns keep the frame well-formed even when `results` is empty.
# NOTE: the p-values are raw, i.e. not adjusted for the number of tests performed.
result_df = pd.DataFrame(results, columns=['cluster', 'column', 'correlation', 'p_value'])

In [6]:
# Display all correlation results (one row per cluster x diversity column).
result_df

Unnamed: 0,cluster,column,correlation,p_value
0,S1_C1,E0_Diversity_F,0.018690,0.941322
1,S1_C1,E0_Diversity_G,-0.023363,0.926685
2,S1_C1,E0_Diversity_S,0.138732,0.583004
3,S1_C1,E7_Diversity_F,0.033525,0.894937
4,S1_C1,E7_Diversity_G,0.024048,0.924541
...,...,...,...,...
184,S6_C3,E7_Diversity_G,0.087482,0.729966
185,S6_C3,E7_Diversity_S,-0.106534,0.673945
186,S6_C3,E90_Diversity_F,-0.398957,0.100995
187,S6_C3,E90_Diversity_G,-0.394216,0.105492


In [7]:
# Rows whose raw (uncorrected) Pearson p-value is below 0.05.
result_df.loc[result_df['p_value'].lt(0.05)]

Unnamed: 0,cluster,column,correlation,p_value
9,S1_C10,E0_Diversity_F,0.489313,0.03931
10,S1_C10,E0_Diversity_G,0.47571,0.045998
12,S1_C10,E7_Diversity_F,0.489553,0.039199
13,S1_C10,E7_Diversity_G,0.480382,0.043611
15,S1_C10,E90_Diversity_F,0.560632,0.015511
16,S1_C10,E90_Diversity_G,0.565501,0.01445
21,S1_C14,E7_Diversity_F,-0.542335,0.020058
22,S1_C14,E7_Diversity_G,-0.538667,0.021085
35,S1_C15,E90_Diversity_S,0.529531,0.023821
54,S1_C2,E0_Diversity_F,-0.468921,0.049643
