In [23]:
import re
import os
from collections import defaultdict
from scipy.stats import fisher_exact


In [24]:
def load_population(file_path):
    """Load the background gene population from a text file.

    The file is read as one gene identifier per line. Surrounding whitespace
    is stripped and blank lines are ignored, so an empty string is never
    stored as a gene (it would otherwise inflate the population size that the
    enrichment statistics are based on).

    Args:
        file_path: Path to the population file.

    Returns:
        set[str]: The unique, non-empty gene identifiers found in the file.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return {gene for gene in stripped_lines if gene}


In [25]:
def load_gene2go(file_path):
    """Load the gene-to-GO mapping file into a dictionary.

    Each non-blank line is split on tabs: the first field is the gene
    identifier and the second field (if present) is a ';'-separated list of GO
    terms. Terms are stripped of surrounding whitespace, empty terms (e.g. from
    a trailing ';') are dropped, and repeated terms are kept only once so a
    gene cannot be counted twice for the same term.

    If a gene appears on several lines, the last line wins.

    Args:
        file_path: Path to the tab-separated gene-to-GO file.

    Returns:
        dict[str, list[str]]: Gene identifier -> list of its GO terms, in the
        order they first appear on the line. Genes without a second field map
        to an empty list.
    """
    gene2go = {}
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines would otherwise register a gene named ''.
                continue
            parts = line.split('\t')
            gene = parts[0].strip()
            raw_terms = parts[1].split(';') if len(parts) > 1 else []
            cleaned_terms = (term.strip() for term in raw_terms)
            # dict.fromkeys removes duplicates while keeping first-seen order.
            gene2go[gene] = list(dict.fromkeys(term for term in cleaned_terms if term))
    return gene2go


In [26]:
def load_communities(file_path):
    """Load the graph communities file into a dictionary.

    Only lines of the form ``Community <id>: <gene>,<gene>,...`` are used;
    every other line is ignored. Gene names are stripped of surrounding
    whitespace and empty entries (e.g. from a trailing comma) are dropped, so
    a list written as ``a, b`` yields ``['a', 'b']`` rather than ``['a', ' b']``.

    If the same community id appears more than once, the last line wins.

    Args:
        file_path: Path to the communities file.

    Returns:
        dict[int, list[str]]: Community id -> list of member gene names in
        file order.
    """
    communities = {}
    with open(file_path, 'r') as file:
        for line in file:
            match = re.match(r"Community (\d+): (.+)", line.strip())
            if not match:
                continue
            community_id = int(match.group(1))
            members = (gene.strip() for gene in match.group(2).split(','))
            communities[community_id] = [gene for gene in members if gene]
    return communities

In [27]:
def perform_enrichment_analysis(community_id, community_genes, population, gene2go, output_file, significant_genes, pval_threshold=0.05):
    """
    Perform Gene Ontology enrichment analysis for a single community.

    For every GO term annotated to at least one community gene, a one-sided
    (``alternative='greater'``) Fisher's exact test is run on the 2x2 table of
    disjoint gene counts:

                              has term    lacks term
        in community              a            b
        rest of population        c            d

    The community is part of the population, so the second row must hold the
    population genes *outside* the community; using whole-population totals
    there would count the community genes twice and distort both the p-value
    and the odds ratio.

    Args:
        community_id: Key under which significant genes are recorded.
        community_genes: Iterable of gene names in the community. Duplicates
            and genes that are not in ``population`` are ignored.
        population: Collection of all background gene names.
        gene2go: Mapping gene name -> iterable of GO terms.
        output_file: Path of the TSV file to write (overwritten). Columns:
            GO term, community genes with the term, population genes with the
            term, p-value, odds ratio.
        significant_genes: Mapping community id -> set of gene names; updated
            in place with every community gene annotated to a term whose
            p-value is <= ``pval_threshold``.
        pval_threshold: Significance cut-off applied to the raw p-value.

    Returns:
        None. Results are written to ``output_file`` and ``significant_genes``.
    """
    population = set(population)
    # Count each gene once and keep only genes that belong to the population,
    # otherwise the cells of the contingency table could become negative.
    community_genes = [gene for gene in dict.fromkeys(community_genes) if gene in population]

    # GO term -> number of population genes annotated with it
    go_population_count = defaultdict(int)
    for gene in population:
        for go_term in set(gene2go.get(gene, ())):
            go_population_count[go_term] += 1
    total_population_genes = len(population)

    # GO term -> community genes annotated with it (first-seen order)
    go_community_genes = defaultdict(list)
    for gene in community_genes:
        for go_term in dict.fromkeys(gene2go.get(gene, ())):
            go_community_genes[go_term].append(gene)
    total_community_genes = len(community_genes)
    total_outside_genes = total_population_genes - total_community_genes

    with open(output_file, 'w') as file:
        file.write("GO Term\tGenes in Community\tGenes in Population\tp-value\tOdds Ratio\n")
        for go_term, annotated_genes in go_community_genes.items():
            count_in_community = len(annotated_genes)
            count_in_population = go_population_count.get(go_term, 0)

            # Skip GO terms with insufficient data: a term carried by no
            # population gene or by every population gene cannot be enriched.
            if count_in_population == 0 or count_in_population == total_population_genes:
                continue

            community_without_term = total_community_genes - count_in_community
            outside_with_term = count_in_population - count_in_community
            outside_without_term = total_outside_genes - outside_with_term

            contingency_table = [
                [count_in_community, community_without_term],
                [outside_with_term, outside_without_term],
            ]

            # Perform Fisher's exact test (enrichment only, hence 'greater')
            _, p_value = fisher_exact(contingency_table, alternative='greater')

            # Sample odds ratio (a*d)/(b*c), computed here so the value does
            # not depend on how the installed scipy handles zero cells.
            numerator = count_in_community * outside_without_term
            denominator = community_without_term * outside_with_term
            if denominator > 0:
                odds_ratio = numerator / denominator
            elif numerator > 0:
                odds_ratio = float('inf')
            else:
                odds_ratio = float('nan')

            # Write results for this GO term
            file.write(f"{go_term}\t{count_in_community}\t{count_in_population}\t"
                       f"{p_value:.6f}\t{odds_ratio:.4f}\n")

            # Record the community genes behind every significant term
            if p_value <= pval_threshold:
                significant_genes.setdefault(community_id, set()).update(annotated_genes)

In [28]:
def save_significant_genes_per_community(significant_genes, output_file):
    """
    Save all significant genes for each community in a single text file.

    One line is written per community, in the iteration order of
    ``significant_genes``, formatted as
    ``community_<id>: <gene> <gene> ...``. Genes are written in sorted order:
    they are usually stored in sets, whose iteration order for strings varies
    between interpreter runs, and sorting keeps the file reproducible.

    Args:
        significant_genes: Mapping community id -> iterable of gene names.
        output_file: Path of the text file to write (overwritten).

    Returns:
        None. A confirmation message is printed once the file is written.
    """
    with open(output_file, 'w') as file:
        for community_id, genes in significant_genes.items():
            # Write community ID and the sorted list of significant genes
            file.write(f"community_{community_id}: {' '.join(sorted(genes))}\n")
    print(f"Significant genes saved in {output_file}")


In [29]:
def enrichment_analysis_per_community(population_file, gene2go_file, communities_file, output_dir, pval_threshold=0.05):
    """
    Perform enrichment analysis for all communities, generating individual output files per community.

    Args:
        population_file: Path passed to ``load_population``.
        gene2go_file: Path passed to ``load_gene2go``.
        communities_file: Path passed to ``load_communities``.
        output_dir: Directory that receives one
            ``community_<id>_enrichment.tsv`` file per analysed community;
            created if it does not exist.
        pval_threshold: Significance cut-off forwarded to
            ``perform_enrichment_analysis``.

    Returns:
        defaultdict(set): Community id -> set of genes recorded as significant
        by ``perform_enrichment_analysis``. Communities with no gene in the
        population are skipped and get no output file.
    """
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)

    # Load input data
    population = load_population(population_file)
    gene2go = load_gene2go(gene2go_file)
    communities = load_communities(communities_file)

    # Dictionary to hold significant genes for each community (using sets to avoid duplicates)
    significant_genes = defaultdict(set)

    # Process each community
    for community_id, community_genes in communities.items():
        # Keep only genes that are in the population, each of them once:
        # a gene listed twice would otherwise be counted twice in the statistics.
        community_genes = list(dict.fromkeys(
            gene for gene in community_genes if gene in population
        ))
        if not community_genes:
            continue

        # Output file for this community
        output_file = os.path.join(output_dir, f"community_{community_id}_enrichment.tsv")

        # Perform enrichment analysis
        perform_enrichment_analysis(community_id, community_genes, population, gene2go, output_file, significant_genes, pval_threshold)

    return significant_genes



## Graph Communities Enrichment

In [31]:
def main(population_file, gene2go_file, communities_file, individual_output_dir, summary_file, pval_threshold=0.05):
    """
    Run the enrichment workflow end to end and write the summary file.

    The per-community analysis is delegated to
    ``enrichment_analysis_per_community``; the mapping it returns is handed to
    ``save_significant_genes_per_community`` together with ``summary_file``.
    A completion message is printed at the end.
    """
    # Analyse every community, then persist the significant genes it produced
    save_significant_genes_per_community(
        enrichment_analysis_per_community(
            population_file,
            gene2go_file,
            communities_file,
            individual_output_dir,
            pval_threshold,
        ),
        summary_file,
    )

    print(f"Enrichment analysis completed. Summary saved in '{summary_file}'.")

# File paths for the graph-communities run (all relative to the working directory).
# population_file, gene2go_file and communities_file are the three inputs read by
# enrichment_analysis_per_community; individual_output_dir receives the
# per-community TSV files and summary_file the list of significant genes.
population_file = "Population.txt"
gene2go_file = "gene2golist2.txt"
communities_file = "graph_communities.txt"
individual_output_dir = "2_Graph_enrichment_results"
summary_file = "2_significant_graph_genes_summary_0.05.txt"


In [32]:
# Run the enrichment workflow on the graph communities, using the paths
# assigned in the previous cell and a p-value threshold of 0.05.
main(population_file, gene2go_file, communities_file, individual_output_dir, summary_file, pval_threshold=0.05)

Significant genes saved in 2_significant_graph_genes_summary_0.05.txt
Enrichment analysis completed. Summary saved in '2_significant_graph_genes_summary_0.05.txt'.


## HyperGraph Communities Enrichment

In [49]:
def main(population_file, gene2go_file, communities_file, individual_output_dir, summary_file, pval_threshold=0.05):
    """
    Run the enrichment analysis for one communities file and save the summary.

    Same contract as the ``main`` defined in the graph-communities section of
    this notebook; executing this cell simply rebinds the name.
    """
    # Per-community enrichment; returns community id -> significant genes
    genes_by_community = enrichment_analysis_per_community(
        population_file,
        gene2go_file,
        communities_file,
        individual_output_dir,
        pval_threshold,
    )

    # Write the collected genes to the summary file
    save_significant_genes_per_community(genes_by_community, summary_file)

    print(f"Enrichment analysis completed. Summary saved in '{summary_file}'.")

# File paths for the hypergraph-communities run (all relative to the working directory).
# These assignments rebind the same global names used for the graph run:
# the population and gene-to-GO inputs are unchanged, while the communities
# file, the per-community output directory and the summary file differ.
population_file = "Population.txt"
gene2go_file = "gene2golist2.txt"
communities_file = "hypergraph_communities.txt"
individual_output_dir = "2_HyperGraph_enrichment_results"
summary_file = "2_significant_Hypergraph_genes_summary_0.05.txt"


In [51]:
# Run the enrichment workflow on the hypergraph communities, using the paths
# assigned in the previous cell and a p-value threshold of 0.05.
main(population_file, gene2go_file, communities_file, individual_output_dir, summary_file, pval_threshold=0.05)

Significant genes saved in 2_significant_Hypergraph_genes_summary_0.05.txt
Enrichment analysis completed. Summary saved in '2_significant_Hypergraph_genes_summary_0.05.txt'.
