This script checks the enrichment of module completeness in each cluster/supercluster.

If Rscript fails, check whether the effsize package is installed in your R environment.

In [1]:
# load cluster annotation
import os
import pandas as pd
import sys
import subprocess

# Directory that holds the inputs and outputs of the signature-module analysis.
outdir = '../result/signature_modules'

# Leaf-to-cluster annotation table; the first column becomes the index.
# Missing values are replaced by the literal string 'NA' so that later
# string comparisons on the cluster columns work.
cluster_df = pd.read_csv(
    '../result/GCN_fix_tree/leaves_cluster.tsv',
    sep='\t',
    index_col=0,
    header=0,
).fillna('NA')


In [None]:
# Write the supercluster -> species mapping for all species.
# Output: <outdir>/super_species.tsv, two tab-separated columns without a
# header (supercluster label, species name with '-' replaced by '_').
sclusters = set(cluster_df['supercluster'])

# Build the table in one shot; growing a DataFrame row by row with
# .loc[shape[0]] gets slower with every added row.
super_d = pd.DataFrame({
    'scluster': cluster_df['supercluster'].tolist(),
    'sp': [sp.replace('-', '_') for sp in cluster_df.index],
})
super_d.to_csv(os.path.join(outdir, 'super_species.tsv'), sep='\t', index=False, header=False)

In [9]:
# For each supercluster, write two files into outdir:
#   <supercluster>.cluster_species.tsv            cluster label / species (no header)
#   <supercluster>.genome_module.completeness.tsv completeness rows of its species
# Superclusters labelled "NA" or starting with "C" are skipped.
completeness_d = pd.read_csv(os.path.join(outdir, 'genome_module.completeness.tsv'), sep='\t', header=0, index_col=0)
for scluster in sclusters:
    if scluster == "NA" or scluster.startswith("C"):
        continue

    # Cluster label of every species that belongs to this supercluster
    # (index = species name, value = cluster label).
    members = cluster_df.loc[cluster_df['supercluster'] == scluster, 'cluster']

    # '-' is replaced by '_' in both cluster and species names; the
    # completeness table is indexed with the converted species names.
    c_df = pd.DataFrame({
        'cluster': [clu.replace('-', '_') for clu in members],
        'sp': [sp.replace('-', '_') for sp in members.index],
    })

    # Raises KeyError if a species is missing from the completeness table.
    part_completeness = completeness_d.loc[c_df['sp'].tolist(), :]
    part_completeness.to_csv(os.path.join(outdir, scluster + '.genome_module.completeness.tsv'), sep='\t', index=True, header=True)

    opath = os.path.join(outdir, scluster + '.cluster_species.tsv')
    c_df.to_csv(opath, sep='\t', index=False, header=False)
    

In [None]:
# Locate the Rscript executable and store its path in `rscript_path`.
# The directory of the running Python interpreter is tried first (where a
# conda environment places its binaries); if Rscript is not there, fall back
# to whatever Rscript is found on PATH.
import shutil

python_executable = sys.executable
python_dir = os.path.dirname(python_executable)
rscript_path = os.path.join(python_dir, 'Rscript')
if os.path.exists(rscript_path):
    print(f"Rscript path: {rscript_path}")
else:
    print(f"Could not find Rscript in Python directory: {rscript_path}")
    path_rscript = shutil.which('Rscript')
    if path_rscript:
        # Later cells only need a usable path, so reuse the same variable.
        rscript_path = path_rscript
        print(f"Using Rscript found on PATH: {rscript_path}")
    else:
        print("Please ensure R is installed in your conda environment")

In [None]:
# Supercluster completeness testing
# Runs cluster_completeness_testing.R once for the supercluster-level files.
# NOTE(review): the positional arguments are presumably
# (completeness table, cluster/species mapping, output table, directory) --
# confirm against cluster_completeness_testing.R.
cmd_args = [
    rscript_path,
    "cluster_completeness_testing.R",
    "genome_module.completeness.tsv",
    "super_species.tsv",
    "super_cluster.module_comp.wilcox.testing.tsv",
    outdir
]
if not os.path.exists(rscript_path):
    # Same guard as the per-supercluster run: do not try to launch a
    # missing executable.
    print(f"Could not find Rscript at {rscript_path}")
    print("Please ensure R is installed in your conda environment")
else:
    print(f"Executing: {' '.join(cmd_args)}")
    try:
        result = subprocess.run(cmd_args, capture_output=True, text=True)
    except (OSError, subprocess.SubprocessError) as e:
        print(f"Exception occurred while processing: {e}")
    else:
        # subprocess.run does not raise on a non-zero exit status, and the
        # captured stderr would otherwise be lost, so report it explicitly.
        if result.returncode != 0:
            print(f"Rscript failed with exit code {result.returncode}:")
            print(result.stderr)
        else:
            print("Done")

In [None]:
# FRC completeness testing in each supercluster
# Runs cluster_completeness_testing.R once per supercluster on the
# <supercluster>.* files written by the extraction cell.
if os.path.exists(rscript_path):
    for scluster in sclusters:
        # The extraction cell writes no input files for "NA" or for names
        # starting with "C", so running the R script on them can only fail.
        if scluster == "NA" or scluster.startswith("C"):
            continue

        print(f"Processing {scluster}...")
        cmd_args = [
            rscript_path,
            "cluster_completeness_testing.R",
            f"{scluster}.genome_module.completeness.tsv",
            f"{scluster}.cluster_species.tsv",
            f"{scluster}.cluster.module_comp.wilcox.testing.tsv",
            outdir
        ]
        print(f"Executing: {' '.join(cmd_args)}")
        try:
            result = subprocess.run(cmd_args, capture_output=True, text=True)
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Exception occurred while processing {scluster}: {e}")
            continue

        # subprocess.run does not raise on a non-zero exit status; surface
        # the captured stderr so R-side failures are visible.
        if result.returncode != 0:
            print(f"Rscript failed for {scluster} (exit code {result.returncode}):")
            print(result.stderr)
else:
    print(f"Could not find Rscript at {rscript_path}")
    print("Please ensure R is installed in your conda environment")

In [None]:
import glob
import os

# Read module name
# Builds `name`: first tab-separated column -> second column of
# ../data/module_name.tsv.
name = {}
with open("../data/module_name.tsv", "r") as LS:
    for line in LS:
        fields = line.strip().split("\t")
        # Blank lines or lines without a second column carry no name;
        # skip them instead of failing with IndexError.
        if len(fields) < 2:
            continue
        name[fields[0]] = fields[1]

# Get all files matching the pattern (sorted so the processing order is
# reproducible; glob itself returns files in arbitrary order)
files = sorted(glob.glob(f"{outdir}/*cluster.module_comp.wilcox.testing.tsv"))
check = {}
sig = {}      # cluster -> "m1;m2;...;" modules passing the filter, ordered by p-value
n = {}        # cluster -> number of modules passing the filter
p_order = {}  # cluster -> list of modules passing the filter, ordered by p-value
record = {}   # top module -> {cluster: p-value} for every cluster it is top-ranked in
count = {}    # top module -> number of clusters it is top-ranked in
select = {}   # cluster -> index into p_order[cluster] of the chosen module

# Process each file
for file in files:
    # Determine threshold based on filename. glob returns the path including
    # the directory, so the prefix has to be tested on the basename; testing
    # the full path never matches "super".
    if os.path.basename(file).startswith("super"):
        cutoff = 0.8
    else:
        cutoff = 0.5

    p = {}  # cluster (column 2) -> {module (column 1): p-value (last column)}
    with open(file, "r") as IN:
        # Skip header line (tolerating an empty file)
        next(IN, None)

        for line in IN:
            l = line.strip().split("\t")
            # The filter reads the third-from-last and last columns.
            if len(l) < 3:
                continue

            # NOTE(review): l[-3] is presumably the effect-size estimate --
            # confirm against the R script's output columns.
            try:
                score = float(l[-3])
                pvalue = float(l[-1])
            except ValueError:
                # Non-numeric entries (e.g. "NA") cannot satisfy the filter.
                continue

            # Filter conditions
            if not (score > cutoff and pvalue < 0.05):
                continue

            p.setdefault(l[1], {})[l[0]] = pvalue

    # Process each cluster
    for clu in sorted(p):
        tmp = p[clu]
        # Sort modules by p-value
        module = sorted(tmp, key=tmp.get)

        p_order[clu] = module
        sig[clu] = "".join(f"{m};" for m in module)
        n[clu] = len(module)
        select[clu] = 0

        # Track which clusters share the same top-ranked module so that
        # duplicates can be resolved afterwards.
        top = module[0]
        record.setdefault(top, {})[clu] = tmp[top]
        count[top] = count.get(top, 0) + 1

# Handle duplicate modules
# When one module is top-ranked in several clusters, the cluster with the
# smallest p-value keeps it. Every other cluster moves to its next-ranked
# module that is not yet a key of `count`, if it has one; otherwise its
# selection stays at index 0.
for m in sorted(count):
    if count[m] <= 1:
        continue

    pvalue_by_cluster = record[m]
    # Sort clusters by p-value
    ranked_clusters = sorted(pvalue_by_cluster, key=pvalue_by_cluster.get)

    for clu in ranked_clusters[1:]:
        candidates = p_order[clu]

        # First position after the top module whose module is still unused.
        j = 1
        while j < len(candidates) and candidates[j] in count:
            j += 1
        if j >= len(candidates):
            continue

        select[clu] = j
        count[candidates[j]] = count.get(candidates[j], 0) + 1

# Define cluster order
# Number of FRCs listed under each supercluster, in output order.
frc_counts = {"S1": 28, "S2": 8, "S3": 5, "S4": 4, "S5": 4, "S6": 3}

superclusters = [f"S{i}" for i in range(1, 7)]
regular_clusters = [f"C{i}" for i in range(7, 11)]
FRCs = [
    f"{prefix}_C{i}"
    for prefix, size in frc_counts.items()
    for i in range(1, size + 1)
]

# Combine all clusters in order
all_clusters = [*superclusters, *regular_clusters, *FRCs]

# Output results to a file
# One row per entry of all_clusters, in that order: the selected
# representative module, its name ("Unknown" if absent from `name`), the
# number of signature modules and the ';'-terminated module list.
output_file = f"{outdir}/cluster_module_signature.tsv"
with open(output_file, "w") as out_file:
    out_file.write("Cluster\tRepresentative module\tModule name\tCount\tSignature modules\n")

    # Write results in specified order
    for clu in all_clusters:
        if clu not in p_order or select[clu] >= len(p_order[clu]):
            # Handle case when no corresponding module is found
            out_file.write(f"{clu}\tNo_significant_module\tNone\t0\t\n")
            continue

        m = p_order[clu][select[clu]]
        label = name.get(m, "Unknown")
        out_file.write(f"{clu}\t{m}\t{label}\t{n[clu]}\t{sig[clu]}\n")

print(f"Results written to {output_file}")