#### Detect FRC/supercluster enriched/depleted KOs

In [1]:
import pandas as pd
# --- Analysis settings -------------------------------------------------------
# Label matched against the 'cluster' / 'supercluster' columns of cluster_df.
interested_cluster = 'S1_C8'
# Directory that receives the result tables written by later cells.
outdir = '../result/GCN_fix_tree'

# --- Input tables ------------------------------------------------------------
# Table indexed by its first column; later cells treat rows as species and
# columns as KOs.
gcn_df = pd.read_csv('../data/gcn2008.tsv', sep='\t', index_col=0)

# Per-leaf cluster assignments, indexed by the first column. Missing entries
# are replaced with the literal string 'NA' so that equality comparisons
# against a cluster label never see NaN.
cluster_df = pd.read_csv(
    '../result/GCN_fix_tree/leaves_cluster.tsv',
    sep='\t',
    index_col=0,
    header=0,
).fillna('NA')


In [2]:
# Select every index label of cluster_df whose 'cluster' OR 'supercluster'
# entry equals the label of interest, so the same code works whether
# interested_cluster names a cluster or a supercluster.
in_cluster = cluster_df['cluster'] == interested_cluster
in_supercluster = cluster_df['supercluster'] == interested_cluster
interested_species = cluster_df.index[in_cluster | in_supercluster].tolist()

interested_species_count = len(interested_species)
# The count has to be wrapped in braces to be interpolated; without them the
# f-string prints the variable name literally.
print(f"{interested_cluster} contains {interested_species_count} species.")


S1_C8 contains interested_species_count species.


In [3]:
# Summarise, for every KO (column of gcn_df), its presence rate and mean
# occurrence inside the cluster of interest and across all species (rows of
# gcn_df), then write the table to a TSV file.

# Count the total number of species
total_species_count = gcn_df.shape[0]

# Output column headers, keyed by a short internal name
column_names = {
    'KO': 'KO',
    'interested_present': f'{interested_cluster} Present',
    'interested_occurrence': f'{interested_cluster} Occurrence',
    'all_present': 'All Present',
    'all_occurrence': 'All Occurrence'
}

# Rows belonging to the cluster of interest, selected once instead of once
# per KO.
interested_gcn = gcn_df.loc[interested_species]

# Each column is produced by one vectorised reduction over the species axis.
# This replaces growing the frame one row at a time with pd.concat inside a
# loop, which copies the whole frame on every iteration (quadratic cost).
# "Present" = fraction of species with a value > 0 for the KO;
# "Occurrence" = sum of the KO's values divided by the number of species.
kos_df1 = pd.DataFrame({
    column_names['KO']: gcn_df.columns,
    column_names['interested_present']:
        (interested_gcn > 0).sum(axis=0).to_numpy() / interested_species_count,
    column_names['interested_occurrence']:
        interested_gcn.sum(axis=0).to_numpy() / interested_species_count,
    column_names['all_present']:
        (gcn_df > 0).sum(axis=0).to_numpy() / total_species_count,
    column_names['all_occurrence']:
        gcn_df.sum(axis=0).to_numpy() / total_species_count,
})

# Sort by occurrence rate in the cluster of interest (descending)
kos_df1 = kos_df1.sort_values(by=column_names['interested_occurrence'], ascending=False)

# Output to file
output_file1 = f'{outdir}/{interested_cluster}.kos_summary.tsv'
kos_df1.to_csv(output_file1, sep='\t', index=False)
print(f"KO occurrence statistics saved to: {output_file1}")


KO occurrence statistics saved to: ../result/GCN_fix_tree/S1_C8.kos_summary.tsv


In [4]:
# Fisher's exact test analysis
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import fdrcorrection

# For every KO (column of gcn_df), build a 2x2 presence/absence table
# (cluster of interest vs. all other species), run a two-sided Fisher's exact
# test, FDR-adjust the p-values and write the table to a TSV file.

# Output column headers, keyed by a short internal name
column_names2 = {
    'KO': 'KO',
    'interested_present': f'{interested_cluster} Present',
    'interested_absent': f'{interested_cluster} Absent',
    'non_interested_present': f'Non {interested_cluster} Present',
    'non_interested_absent': f'Non {interested_cluster} Absent',
    'odds_ratio': 'Odds Ratio',
    'pvalue': 'P-value',
    'adjusted_pvalue': 'Adjusted P-value'
}

# Loop-invariant work is done once up front: the presence matrix, the
# membership mask and the per-KO presence counts for both groups.
ko_present = gcn_df > 0
outside_cluster = ~ko_present.index.isin(interested_species)
interested_present_counts = ko_present.loc[interested_species].sum(axis=0).to_numpy()
non_interested_present_counts = ko_present.loc[outside_cluster].sum(axis=0).to_numpy()
non_interested_species_count = total_species_count - interested_species_count

# Rows are collected in a list and turned into a DataFrame once; appending
# with pd.concat inside the loop copies the whole frame on every iteration.
records = []
pvalues = []

for ko, n_in, n_out in zip(gcn_df.columns,
                           interested_present_counts,
                           non_interested_present_counts):
    # Species in the cluster of interest with and without this KO
    ko_interested_present = int(n_in)
    ko_interested_absent = interested_species_count - ko_interested_present

    # Species outside the cluster of interest with and without this KO
    ko_non_interested_present = int(n_out)
    ko_non_interested_absent = non_interested_species_count - ko_non_interested_present

    # Perform Fisher's exact test
    contingency_table = [
        [ko_interested_present, ko_interested_absent],
        [ko_non_interested_present, ko_non_interested_absent]
    ]
    oddsratio, pvalue = fisher_exact(contingency_table, alternative='two-sided')
    pvalues.append(pvalue)

    records.append({
        column_names2['KO']: ko,
        column_names2['interested_present']: ko_interested_present,
        column_names2['interested_absent']: ko_interested_absent,
        column_names2['non_interested_present']: ko_non_interested_present,
        column_names2['non_interested_absent']: ko_non_interested_absent,
        column_names2['odds_ratio']: oddsratio,
        column_names2['pvalue']: pvalue
    })

# Passing `columns` fixes the column order and keeps the headers even when
# there are no records; the adjusted p-value column is filled in just below.
kos_df2 = pd.DataFrame(records, columns=list(column_names2.values()))

# Apply FDR correction to p-values
_, adjusted_pvalues = fdrcorrection(pvalues, alpha=0.05, method='indep')
kos_df2[column_names2['adjusted_pvalue']] = adjusted_pvalues

# Sort by adjusted p-value (ascending)
kos_df2 = kos_df2.sort_values(by=column_names2['adjusted_pvalue'], ascending=True)

# Output to file
output_file2 = f'{outdir}/{interested_cluster}.kos_fisher.tsv'
kos_df2.to_csv(output_file2, sep='\t', index=False)
print(f"Fisher's exact test results saved to: {output_file2}")


Fisher's exact test results saved to: ../result/GCN_fix_tree/S1_C8.kos_fisher.tsv


In [5]:
# Split the Fisher results into enriched and depleted KOs. Both groups share
# the same significance filter (adjusted p-value < 0.05) and differ only in
# the odds-ratio cut-off: > 2 for enriched, < 0.5 (= 1/2) for depleted.
adjusted_p = kos_df2[column_names2['adjusted_pvalue']]
odds_ratio = kos_df2[column_names2['odds_ratio']]
is_significant = adjusted_p < 0.05

enriched_kos = kos_df2[is_significant & (odds_ratio > 2)]
depleted_kos = kos_df2[is_significant & (odds_ratio < 0.5)]

n_enriched = len(enriched_kos)
n_depleted = len(depleted_kos)

# Report how many KOs passed each filter
print(f"\nAnalysis Summary:")
print(f"After statistical filtering (adjusted p-value < 0.05, odds ratio > 2 or < 0.5):")
print(f"- Found {n_enriched} significantly enriched KOs (odds ratio > 2)")
print(f"- Found {n_depleted} significantly depleted KOs (odds ratio < 0.5)")
print(f"- Total {n_enriched + n_depleted} differentially abundant KOs")


Analysis Summary:
After statistical filtering (adjusted p-value < 0.05, odds ratio > 2 or < 0.5):
- Found 281 significantly enriched KOs (odds ratio > 2)
- Found 1388 significantly depleted KOs (odds ratio < 0.5)
- Total 1669 differentially abundant KOs
