Plot keystone figures

In [None]:
import os 
import sys
# Input/output locations and the phenotype list used by the keystone analysis.
indir = "../result/large_scale_cohort"
leave_path = "../result/GCN_fix_tree/leaves_cluster.tsv"
outdir = "../result/keystone"
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and a no-op when the directory is present.
os.makedirs(outdir, exist_ok=True)

# Phenotype names. Each one is joined onto `indir` as a sub-directory name by
# the processing loops, so the spellings are kept exactly as written
# (presumably matching the on-disk directory names -- verify before renaming).
plist = [
    'ACVD',
    'BD',
    'CRC',
    'IBD',
    'IGT',
    'CFS',
    'STH',
    'T2D',
    'adenoma',
    'asthma',
    'carcinoma_surgery_history',
    'hypertension',
    'migraine',
    'schizofrenia',
]


In [None]:
import os 
import sys
import pandas as pd

# Parameters: input/output locations and the phenotype list.
indir = "../result/large_scale_cohort"
leave_path = "../result/GCN_fix_tree/leaves_cluster.tsv"
outdir = "../result/keystone"
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and a no-op when the directory is present.
os.makedirs(outdir, exist_ok=True)

# Phenotype names. Each one is joined onto `indir` as a sub-directory name by
# the processing loops, so the spellings are kept exactly as written
# (presumably matching the on-disk directory names -- verify before renaming).
plist = ['ACVD', 'BD', 'CRC', 'IBD', 'IGT', 'CFS', 'STH', 'T2D', 'adenoma',
         'asthma', 'carcinoma_surgery_history', 'hypertension', 'migraine', 'schizofrenia']

def _read_keystone_rows(path):
    """Return the data rows of a keystone_node.tsv file.

    The first line is treated as a header and skipped. Lines with fewer than
    six tab-separated fields are ignored. Only the first six fields of every
    other line are kept, so extra trailing columns do not break parsing.

    Returns:
        A list of 6-tuples of strings, in file order. The fields are used by
        the caller as (spe, level, pr, eigen, c, p).
    """
    rows = []
    with open(path, 'r') as f:
        f.readline()  # Skip header
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 6:
                rows.append(tuple(parts[:6]))
    return rows


def process_keystone_analysis(cohort_id, disease, indir, leave_path, outdir, n=30):
    """Summarise the keystone-node results of one cohort into three files.

    For the two groups 'Health' and `disease`, the file
    ``{indir}/{disease}/{cohort_id}/sp/cluster_{group}/keystone_node.tsv`` is
    read (a missing file only produces a warning). The following files are
    then written to `outdir`:

    * ``{cohort_id}.keystone.species.list`` -- one name per row flagged
      ``eigen == 'True'`` with level ``'0'``, group by group.
    * ``{cohort_id}.group_PR.tsv`` -- per-taxon PageRank and keystone flag for
      the control and case group (flag 2: the flagged row itself, 1: another
      level-'0' row sharing its last-column value, 0: neither).
    * ``{cohort_id}.diff.tsv`` -- Phylum and Cluster annotations of the
      selected taxa.

    A taxon is selected when it carries a keystone flag or is among the
    ``2 * n`` taxa (names starting with ``s__``) with the highest PageRank
    summed over both groups. Names are shortened through
    ``species.name_short.txt`` when that file exists in the working directory.

    If the differential-abundance table
    ``{cohort_id}.abundance.wilcox_testing.tsv`` is missing, a warning is
    printed and the function returns after writing only the species list.

    Args:
        cohort_id: Cohort directory name; also the prefix of the output files.
        disease: Phenotype directory name; also the name of the case group.
        indir: Root directory that contains ``{disease}/{cohort_id}``.
        leave_path: Tab-separated table; its first column is read as the taxon
            (with '-' replaced by '_') and its second column as the cluster.
        outdir: Existing directory that receives the output files.
        n: Half the number of top-PageRank species to keep (default 30).

    Returns:
        None.
    """
    # File paths
    cohort_prefix = f"{indir}/{disease}/{cohort_id}"
    sp_dir = f"{cohort_prefix}/sp"
    diff_file = f"{cohort_prefix}/{cohort_id}.abundance.wilcox_testing.tsv"

    groups = ['Health', disease]

    # Output files
    keystone_file = f"{outdir}/{cohort_id}.keystone.species.list"
    group_pr_file = f"{outdir}/{cohort_id}.group_PR.tsv"
    diff_out_file = f"{outdir}/{cohort_id}.diff.tsv"

    print(f"Processing {cohort_id}")

    # Optional mapping from "s__<name>" to a short display name
    newtax = {}
    try:
        with open("species.name_short.txt", 'r') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    newtax[f"s__{parts[0]}"] = parts[1]
    except FileNotFoundError:
        print("Warning:species.name_short.txt file not found")

    species_info = {}    # taxon -> {group: keystone flag 1 or 2}
    pr_info = {}         # taxon -> {group: PageRank}
    total_pr = {}        # "s__" taxon -> PageRank summed over groups
    keystone_names = []  # lines of the keystone species list, in group order

    for g in groups:
        keystone_file_path = f"{sp_dir}/cluster_{g}/keystone_node.tsv"

        if not os.path.exists(keystone_file_path):
            print(f"Warning: {keystone_file_path} not found")
            continue

        # Parse the file once; every step below works on the cached rows.
        rows = _read_keystone_rows(keystone_file_path)

        # Last-column value of the first flagged level-'0' row. Presumably the
        # parent/cluster id of the keystone node -- TODO confirm against the
        # producer of keystone_node.tsv. Stays None when no row is flagged.
        eigen_c = next(
            (p for _, level, _, eigen, _, p in rows
             if eigen == 'True' and level == '0'),
            None,
        )

        # The species list does not depend on the PageRank being numeric.
        keystone_names.extend(
            newtax.get(spe, spe)
            for spe, level, _, eigen, _, _ in rows
            if eigen == 'True' and level == '0'
        )

        for spe, level, pr, eigen, _, p in rows:
            # Skip rows whose PageRank is not numeric (malformed data)
            try:
                pr_value = float(pr)
            except ValueError:
                continue

            if p == eigen_c and level == '0':
                if eigen == 'True':
                    species_info.setdefault(spe, {})[g] = 2
                    print(f"{spe}\t{g}")
                else:
                    species_info.setdefault(spe, {})[g] = 1

            pr_info.setdefault(spe, {})[g] = pr_value

            if spe.startswith('s__'):
                total_pr[spe] = total_pr.get(spe, 0) + pr_value

    # Write keystone species list (written even when it ends up empty)
    with open(keystone_file, 'w') as f:
        f.writelines(f"{name}\n" for name in keystone_names)

    # Top 2*n species by summed PageRank; the sort is stable, so ties keep
    # the order in which the species were first seen.
    rank_in = set(sorted(total_pr, key=total_pr.get, reverse=True)[:n * 2])
    # Taxa reported in the output tables: top-ranked or keystone-flagged
    wanted = rank_in.union(species_info)

    # Process differential abundance results
    if not os.path.exists(diff_file):
        print(f"Warning: {diff_file} not found")
        return

    diff_df = pd.read_csv(diff_file, sep='\t')

    # Write group PR results; only the first column (taxa) of the table is
    # used, so tables with few columns are accepted as well.
    with open(group_pr_file, 'w') as f:
        f.write("Taxa\tPR_control\tPR_case\teigen_control\teigen_case\n")

        for tax in diff_df.iloc[:, 0]:
            if tax not in wanted:
                continue

            # Groups without an entry default to flag 0 / PageRank 0
            flags = species_info.get(tax, {})
            ranks = pr_info.get(tax, {})
            eigen_control = flags.get(groups[0], 0)
            eigen_case = flags.get(groups[1], 0)
            pr_control = ranks.get(groups[0], 0)
            pr_case = ranks.get(groups[1], 0)

            mapped_name = newtax.get(tax, tax)
            f.write(f"{mapped_name}\t{pr_control:.4f}\t{pr_case:.4f}\t{eigen_control}\t{eigen_case}\n")

    # Phyla listed in phylum.color are reported by name, all others as "Other"
    in_phy = set()
    try:
        with open("phylum.color", 'r') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    in_phy.add(parts[0])
    except FileNotFoundError:
        print("Warning: phylum.color file not found")

    # Write diff results
    with open(diff_out_file, 'w') as f:
        f.write("Taxa\tFeature\tRes\n")

        # Process phylum information
        try:
            with open("../data/cMD.select_2008.species_phylum.tsv", 'r') as pf:
                for line in pf:
                    parts = line.strip().split('\t')
                    if len(parts) < 2:
                        continue
                    tax, phy = parts[0], parts[1]

                    if tax not in wanted:
                        continue

                    out_phy = phy.replace('p__', '') if phy in in_phy else 'Other'
                    f.write(f"{newtax.get(tax, tax)}\tPhylum\t{out_phy}\n")
        except FileNotFoundError:
            print("Warning: species_phylum file not found")

        # Process cluster information
        try:
            cluster_df = pd.read_csv(leave_path, sep='\t')
        except FileNotFoundError:
            print(f"Warning: {leave_path} not found")
        else:
            for _, row in cluster_df.iterrows():
                tax = str(row.iloc[0]).replace('-', '_')
                clu = row.iloc[1] if pd.notna(row.iloc[1]) else 'Other'

                if tax not in wanted:
                    continue

                f.write(f"{newtax.get(tax, tax)}\tCluster\t{clu}\n")

# Run the keystone analysis for every cohort directory of every phenotype.
# Phenotypes without an input directory are skipped silently; a failure in
# one cohort is reported and does not stop the remaining cohorts.
for disease in plist:
    dir_path = os.path.join(indir, disease)
    if os.path.exists(dir_path):
        for cohort_id in os.listdir(dir_path):
            cohort_dir = os.path.join(dir_path, cohort_id)
            # Only sub-directories are cohorts; plain files are ignored
            if os.path.isdir(cohort_dir):
                try:
                    process_keystone_analysis(cohort_id, disease, indir, leave_path, outdir)
                except Exception as e:
                    print(f"Error processing {cohort_id} in {disease}: {e}")

print("Keystone analysis completed!")

In [None]:
# Look for an Rscript executable in the directory that holds the running
# Python interpreter and report whether it was found.
python_executable = sys.executable
python_dir = os.path.split(python_executable)[0]
rscript_path = os.path.join(python_dir, 'Rscript')
if not os.path.exists(rscript_path):
    print(f"Could not find Rscript in Python directory: {rscript_path}")
    print("Please ensure R is installed in your conda environment")
else:
    print(f"Rscript path: {rscript_path}")

In [None]:
# Run page_rank.R once per cohort directory, passing the output prefix
# "<outdir>/<cohort>" and the cohort name as arguments.
import subprocess

for disease in plist:
    dir_path = os.path.join(indir, disease)
    # Phenotypes without an input directory are skipped instead of making
    # os.listdir raise.
    if not os.path.isdir(dir_path):
        continue
    for cohort_id in os.listdir(dir_path):
        cohort_dir = os.path.join(dir_path, cohort_id)
        if not os.path.isdir(cohort_dir):
            continue
        prefix = "{}/{}".format(outdir, cohort_id)
        # An argument list (no shell) keeps directory names containing spaces
        # or shell metacharacters from being re-interpreted.
        try:
            completed = subprocess.run(["Rscript", "page_rank.R", prefix, cohort_id])
        except FileNotFoundError:
            print("Warning: Rscript executable not found on PATH")
            continue
        # Report failures instead of discarding the exit status
        if completed.returncode != 0:
            print(f"Warning: page_rank.R exited with status {completed.returncode} for {cohort_id}")
        