In [1]:
# Import required libraries
from src.corr import CorrNetworkGraph
import pandas as pd

In [2]:
# Locations of the three input tables (relative to the notebook directory)
asv_path, amino_acid_path, tax_path = (
    "./data/top_percent_asv.csv",
    "./data/AAs.csv",
    "./data/Tax.csv",
)

# Read every table into its own DataFrame, using the first CSV column as the index
asv, amino_acid, tax = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (asv_path, amino_acid_path, tax_path)
)

In [3]:
# Remove columns that are zero in every row from both the ASV and amino acid matrices.
# A column is kept when at least one entry differs from zero. Testing the column
# sum instead would also discard columns whose positive and negative values
# cancel out, and would run a Python-level sum for every column.
# Missing values compare as "not equal to zero", so columns containing NaN are kept.
asv = asv.loc[:, asv.ne(0).any(axis=0)]
amino_acid = amino_acid.loc[:, amino_acid.ne(0).any(axis=0)]

The CorrNetworkGraph class allows you to compute the correlation between ASVs and amino acids.
By default, it uses the Pearson correlation method.
You can choose the correlation method with the `method` parameter:

"pearson" – Pearson correlation (default)

"spearman" – Spearman rank correlation

In [4]:
# Build the ASV / amino-acid correlation network with Spearman rank correlation
cng = CorrNetworkGraph(asv, amino_acid, method="spearman")
# Pull the edge table out of the network
edge = cng.edge()
# Annotate every edge with the genus of its source ASV.
# NOTE(review): assumes the genus is the second-to-last ';'-separated field of
# the taxonomy string and carries a 4-character prefix — confirm against Tax.csv.
edge['Genus'] = edge['Source'].map(
    lambda asv_id: tax.loc[asv_id, 'taxonomy'].split(";")[-2][4:]
)
edge

Unnamed: 0,Source,Target,Weight,Relevance,Significance,Color,Genus
0,B_ASV_24271,Trp,0.744330,0.744330,4.866774e-11,red,Weissella
1,B_ASV_48893,Asp,0.657012,0.657012,3.808966e-08,red,Virgibacillus
2,B_ASV_48893,Pro,0.604009,0.604009,8.282499e-07,red,Virgibacillus
3,B_ASV_48893,Phe,0.614922,0.614922,4.601117e-07,red,Virgibacillus
4,B_ASV_48893,Val,0.609956,0.609956,6.028812e-07,red,Virgibacillus
...,...,...,...,...,...,...,...
1231,F_ASV_705,Leu,0.734988,-0.734988,1.122650e-10,blue,unidentified
1232,F_ASV_705,Thr,0.722886,-0.722886,3.151782e-10,blue,unidentified
1233,F_ASV_705,Ser,0.719605,-0.719605,4.131144e-10,blue,unidentified
1234,F_ASV_705,Tyr,0.738057,-0.738057,8.564226e-11,blue,unidentified


In [5]:
# Shape tuple whose single entry is the number of distinct source ASVs
edge['Source'].drop_duplicates().shape

(162,)

In [6]:
# Preview the first edges whose source is the ASV of interest
edge[edge['Source'].eq('B_ASV_143173')].head()

Unnamed: 0,Source,Target,Weight,Relevance,Significance,Color,Genus
347,B_ASV_143173,Asp,0.661728,0.661728,2.811175e-08,red,Bacillus
348,B_ASV_143173,Met,0.70808,0.70808,1.037773e-09,red,Bacillus
349,B_ASV_143173,Phe,0.654301,0.654301,4.524482e-08,red,Bacillus
350,B_ASV_143173,Val,0.669707,0.669707,1.660924e-08,red,Bacillus
351,B_ASV_143173,His,0.679234,0.679234,8.67269e-09,red,Bacillus


In [7]:
# Edges per source ASV, most connected first.
# Each count is the number of amino acids linked to that ASV by an edge.
edge['Source'].value_counts(sort=True, ascending=False)

B_ASV_16357     17
B_ASV_15625     16
B_ASV_95119     16
B_ASV_30109     16
B_ASV_25067     16
                ..
F_ASV_3180       1
B_ASV_15337      1
B_ASV_11157      1
B_ASV_112542     1
B_ASV_24271      1
Name: Source, Length: 162, dtype: int64

The following analysis counts how often each amino acid appears as a target in the correlation network.
This reflects the number of ASVs that have a strong correlation with each modified amino acid.

The results show that some amino acids, such as isoleucine (Ile) and leucine (Leu), are highly connected (102 ASVs each), while others, such as cysteine (Cys), are absent from the counts because they show no strong correlation with any ASV.

In [8]:
# Edges per target amino acid, most connected first.
# Each count is the number of ASVs linked to that amino acid by an edge.
edge['Target'].value_counts(sort=True, ascending=False)

Ile    102
Leu    102
Val     97
Phe     95
Ser     91
Asp     91
Ala     91
Thr     90
Tyr     83
Pro     76
Glu     74
Trp     69
Asn     48
Met     47
Gly     32
His     27
Lys     10
Arg      6
Gln      5
Name: Target, dtype: int64

In [9]:
# Get all red (positive correlation) edges targeting "Trp"
edge.loc[(edge['Target'] == 'Trp') & (edge['Color'] == 'red'), :]

Unnamed: 0,Source,Target,Weight,Relevance,Significance,Color,Genus
0,B_ASV_24271,Trp,0.74433,0.74433,4.866774e-11,red,Weissella
98,B_ASV_143697,Trp,0.74282,0.74282,5.584394e-11,red,Lactococcus
115,B_ASV_128182,Trp,0.67546,0.67546,1.125091e-08,red,Weissella
118,B_ASV_84949,Trp,0.705889,0.705889,1.230164e-09,red,Pantoea
119,B_ASV_97128,Trp,0.615625,0.615625,4.426734e-07,red,Weissella
160,B_ASV_104881,Trp,0.600301,0.600301,1.006351e-06,red,Weissella
242,B_ASV_108878,Trp,0.661392,0.661392,2.873193e-08,red,Pantoea
259,B_ASV_51435,Trp,0.61431,0.61431,4.758023e-07,red,Lactococcus
289,B_ASV_107276,Trp,0.631793,0.631793,1.772549e-07,red,Pantoea
291,B_ASV_105111,Trp,0.76009,0.76009,1.091701e-11,red,Empedobacter


In [10]:
# Number of edges contributed by each genus, over the whole edge table
edge['Genus'].value_counts(sort=True, ascending=False)

unclassified_Fungi                 335
unidentified                       172
Bacillus                           107
Weissella                           94
Pantoea                             83
Saccharopolyspora                   35
unclassified_Enterobacteriaceae     30
Virgibacillus                       29
Pseudomonas                         28
Lactobacillus                       27
Clostridium_sensu_stricto_1         26
Lactococcus                         24
Aquabacterium                       21
Caulobacter                         17
Pseudonocardiaceae                  16
Pediococcus                         16
Asticcacaulis                       15
Thermoascus                         13
Zoogloea                            13
Nesterenkonia                       12
Aliihoeflea                         12
Halomonas                           12
Terrisporobacter                    12
Streptococcus                       11
Aspergillus                         10
Saccharomonospora        

In [11]:
# Shape tuple whose single entry is the number of distinct Bacillus source ASVs
edge['Source'][edge['Genus'].eq('Bacillus')].drop_duplicates().shape

(15,)

In [12]:
# Edges per genus, restricted to red (positive correlation) edges
edge['Genus'][edge['Color'].eq('red')].value_counts()

Bacillus                         107
Saccharopolyspora                 35
Virgibacillus                     29
Lactobacillus                     27
Clostridium_sensu_stricto_1       26
unclassified_Fungi                24
Aquabacterium                     21
Caulobacter                       17
Pediococcus                       16
Pseudonocardiaceae                16
Asticcacaulis                     15
Zoogloea                          13
unidentified                      13
Terrisporobacter                  12
Nesterenkonia                     12
Aliihoeflea                       12
Halomonas                         12
Streptococcus                     11
Saccharomonospora                 10
Aspergillus                       10
Pseudomonas                        9
Enterococcus                       8
Weissella                          8
Aeromonas                          7
Streptomyces                       5
Romboutsia                         4
Serratia                           4
P

In [13]:
# Edges per genus, restricted to blue (negative correlation) edges
edge['Genus'][edge['Color'].eq('blue')].value_counts()

unclassified_Fungi                 311
unidentified                       159
Weissella                           86
Pantoea                             80
unclassified_Enterobacteriaceae     30
Lactococcus                         22
Pseudomonas                         19
Thermoascus                         13
Kroppenstedtia                       2
unclassified_Bacillaceae             2
Acinetobacter                        2
unclassified_Lactobacillales         2
Scopulibacillus                      1
Byssochlamys                         1
Name: Genus, dtype: int64

In [14]:
# Genera of interest
target_genera = [
    'Bacillus',
    'Saccharopolyspora',
    'Virgibacillus',
    'Lactobacillus',
]
# Keep only edges whose genus is one of the above, then count edges per source ASV
target_asv = edge['Source'][edge['Genus'].isin(target_genera)].value_counts()
target_asv

B_ASV_15625     16
B_ASV_95119     16
B_ASV_143173    15
B_ASV_131779    14
B_ASV_135906    11
B_ASV_78240     11
B_ASV_137520    11
B_ASV_19290     11
B_ASV_125122    10
B_ASV_48893     10
B_ASV_83738     10
B_ASV_35282      9
B_ASV_98229      9
B_ASV_77896      8
B_ASV_102823     7
B_ASV_70040      6
B_ASV_68290      5
B_ASV_38043      4
B_ASV_79875      3
B_ASV_129877     3
B_ASV_17952      2
B_ASV_140470     2
B_ASV_100340     2
B_ASV_59582      2
B_ASV_142812     1
Name: Source, dtype: int64

In [15]:
# List the target ASV IDs in descending order.
# The IDs are strings, so the ordering is lexicographic rather than numeric.
sorted(target_asv.index.tolist(), reverse=True)

['B_ASV_98229',
 'B_ASV_95119',
 'B_ASV_83738',
 'B_ASV_79875',
 'B_ASV_78240',
 'B_ASV_77896',
 'B_ASV_70040',
 'B_ASV_68290',
 'B_ASV_59582',
 'B_ASV_48893',
 'B_ASV_38043',
 'B_ASV_35282',
 'B_ASV_19290',
 'B_ASV_17952',
 'B_ASV_15625',
 'B_ASV_143173',
 'B_ASV_142812',
 'B_ASV_140470',
 'B_ASV_137520',
 'B_ASV_135906',
 'B_ASV_131779',
 'B_ASV_129877',
 'B_ASV_125122',
 'B_ASV_102823',
 'B_ASV_100340']