In [None]:
***
# Prophage work

### (I) Predict prophages and call FastANI
### (II) Inspecting the output
***

> Running PhageBoost and FastANI

In [None]:
import os
import random
import re
import subprocess

import pandas as pd
from tqdm import tqdm

def run_phageboost(path_klebsiella, path_phageboost, good_strains_file, threads=4):
    """Run PhageBoost on every listed strain whose output folder is still empty.

    Walks ``{path_klebsiella}/<specie>/refseq/bacteria`` for each sub-directory
    of ``path_klebsiella`` whose name starts with ``"k"`` and, for every strain
    named in ``good_strains_file``, runs PhageBoost on
    ``<strain>/prokka_annotation_all/<strain>.fna`` with its output written to
    ``{path_phageboost}/<strain>``.

    Args:
        path_klebsiella: Root directory that contains the species folders.
        path_phageboost: Existing directory that receives one output folder
            per strain.
        good_strains_file: Text file with one strain name per line.
        threads: Value passed to PhageBoost's ``--threads`` option.

    A ``process_done`` marker file is written only after PhageBoost exits
    successfully. A failed or missing PhageBoost is reported and the loop moves
    on to the next strain.
    """
    # A set makes the per-strain membership test O(1), and the file is closed
    # as soon as it has been read.
    with open(good_strains_file) as handle:
        good_strains = set(handle.read().split("\n"))

    for specie in os.listdir(path_klebsiella):
        if not (specie.startswith("k") and os.path.isdir(f"{path_klebsiella}/{specie}")):
            continue

        strains = os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria")
        # Strains are visited in random order.
        # NOTE(review): presumably so that concurrent runs rarely start on the
        # same strain -- confirm.
        for strain in random.sample(strains, len(strains)):
            if strain not in good_strains:
                continue

            path_fna = f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.fna"
            path_prophage = f"{path_phageboost}/{strain}"

            try:
                os.mkdir(path_prophage)
            except FileExistsError:
                print(f"Output for {strain} already exists. Continuing...")

            # A non-empty output folder is treated as "already handled".
            if os.listdir(path_prophage):
                print(f"PhageBoost already completed for {strain}")
                continue

            # Argument list instead of a shell string: paths containing spaces
            # or shell metacharacters are passed through untouched.
            cmd = [
                "PhageBoost",
                "-f", path_fna,
                "-o", path_prophage,
                "--threads", str(threads),
            ]
            try:
                subprocess.run(cmd, check=True)
            except (subprocess.CalledProcessError, OSError) as err:
                # Do not mark the strain as done when the tool failed or could
                # not be started; an empty folder will be retried on the next run.
                print(f"PhageBoost failed for {strain}: {err}")
                continue

            with open(f"{path_prophage}/process_done", "w") as outfile:
                outfile.write("This strain has been studied")
                        

def extract_phageboost_scores(path_phageboost, output_score_file):
    """Collect every prophage score found under ``phageboost_prediction``.

    Scans ``{path_phageboost}/phageboost_prediction/<strain>`` and, for each
    strain folder holding more than two entries, reads every file whose name
    starts with ``"phages"``. The first two lines of such a file are skipped;
    from each remaining non-empty line the sixth tab-separated field is taken
    as the score.

    Args:
        path_phageboost: Directory that contains the ``phageboost_prediction``
            folder (NOT the prediction folder itself).
        output_score_file: Path of the file to (over)write with one
            ``<strain>,<score>`` line per prophage record.
    """
    prediction_dir = f"{path_phageboost}/phageboost_prediction"

    with open(output_score_file, "w") as outfile:
        for strain in os.listdir(prediction_dir):
            strain_dir = f"{prediction_dir}/{strain}"
            # Stray files next to the strain folders (summary tables, logs)
            # are ignored instead of crashing os.listdir below.
            if not os.path.isdir(strain_dir):
                continue

            entries = os.listdir(strain_dir)
            # Only folders with more than two entries are inspected.
            # NOTE(review): presumably marker + report + at least one
            # sequence file -- confirm.
            if len(entries) <= 2:
                continue

            for file in entries:
                if not file.startswith("phages"):
                    continue
                with open(f"{strain_dir}/{file}") as handle:
                    records = handle.read().split("\n")[2:]
                for record in records:
                    if record:
                        score = record.split("\t")[5]
                        outfile.write(f"{strain},{score}\n")
                                
                                
def prepare_fastani_input(path_phageboost_pred, path_fastANI, path_phageboot_info, strain_ktype_file, min_score=0.70):
    """Export prophages scoring at least ``min_score`` as FastANI inputs.

    For every strain folder in ``path_phageboost_pred`` the ``phages*`` report
    is parsed with pandas (first line skipped, tab separated). Each record
    whose ``score`` is >= ``min_score`` is

    * written as ``{path_fastANI}/<strain>__<prophage_id>.fasta`` (only if that
      file does not exist yet), using the sequence file in the strain folder
      whose name contains the prophage id, and
    * summarised as one row of
      ``{path_phageboot_info}/results_phageboost.<min_score>.tsv``.

    Args:
        path_phageboost_pred: Directory holding one PhageBoost output folder
            per strain.
        path_fastANI: Existing directory that receives the per-prophage FASTA
            files.
        path_phageboot_info: Existing directory that receives the summary TSV.
        strain_ktype_file: Tab-separated file; column 1 is read as the strain
            name and column 3 as its K-type.
        min_score: Minimum PhageBoost score for a prophage to be exported.

    Raises:
        KeyError: If an exported strain is absent from ``strain_ktype_file``.
    """
    # Strain -> K-type lookup table.
    strain_ktype = {}
    with open(strain_ktype_file) as handle:
        for line in handle.read().split("\n"):
            if line:
                fields = line.split("\t")
                strain_ktype[fields[0].strip()] = fields[2].strip()

    with open(f"{path_phageboot_info}/results_phageboost.{min_score}.tsv", "w") as outfile1:
        outfile1.write("Prophage_name\tProphage_length\tN_genes\tScore\tK_type\n")

        for strain in tqdm(os.listdir(path_phageboost_pred)):
            strain_dir = f"{path_phageboost_pred}/{strain}"
            # Skip stray files that sit next to the strain folders.
            if not os.path.isdir(strain_dir):
                continue

            strain_files = os.listdir(strain_dir)
            for file in strain_files:
                if not file.startswith("phages"):
                    continue

                # Only "nothing to parse" errors mean "no prophage"; anything
                # else (e.g. a programming error) must stay visible instead of
                # being reported as missing data.
                try:
                    resume = pd.read_csv(f"{strain_dir}/{file}", skiprows=1, sep="\t")
                except (pd.errors.EmptyDataError, pd.errors.ParserError):
                    print(f"No prophage data for {strain}")
                    continue

                for _, info in resume.iterrows():
                    if float(info["score"]) >= min_score:
                        prophage_id = info["attributes"].split("phage_id=")[1]
                        prophage_len = int(info["end"]) - int(info["start"])
                        n_genes = info["attributes"].split("n_genes=")[1].split(";")[0]

                        # The id must not be followed by another digit, so that
                        # e.g. "phage1" does not pick up the file of "phage10".
                        id_pattern = re.compile(re.escape(prophage_id) + r"(?!\d)")
                        fasta_out = f"{path_fastANI}/{strain}__{prophage_id}.fasta"

                        for file2 in strain_files:
                            if id_pattern.search(file2):
                                # Only the second line is used as the sequence.
                                # NOTE(review): assumes single-line FASTA records
                                # -- confirm against PhageBoost's output.
                                with open(f"{strain_dir}/{file2}") as handle:
                                    seq = handle.read().split("\n")[1]
                                if not os.path.isfile(fasta_out):
                                    with open(fasta_out, "w") as outfile:
                                        outfile.write(f">{strain}__{prophage_id}\n{seq}")

                        outfile1.write(f"{strain}__{prophage_id}\t{prophage_len}\t{n_genes}\t{info['score']}\t{strain_ktype[strain]}\n")

                            
def write_fastani_list(path_fastANI, output_list_file):
    """Write one ``{path_fastANI}/<entry>`` line per entry of ``path_fastANI``.

    The resulting file is (over)written at ``output_list_file`` and lists the
    directory entries in the order returned by ``os.listdir``.
    """
    with open(output_list_file, "w") as listing:
        listing.writelines(
            f"{path_fastANI}/{entry}\n" for entry in tqdm(os.listdir(path_fastANI))
        )
            
            
def run_fastani(ql_list, rl_list, output_file, threads=40):
    """Run ``fastANI`` on a query list against a reference list.

    Args:
        ql_list: Path of the file listing the query sequences (``--ql``).
        rl_list: Path of the file listing the reference sequences (``--rl``).
        output_file: Path passed to ``-o``.
        threads: Value passed to ``-t``.

    Failures (non-zero exit status, or ``fastANI`` not being startable) are
    reported on stdout and not re-raised.
    """
    # Argument list rather than a shell string, so paths with spaces or shell
    # metacharacters reach fastANI unchanged.
    fastani_cmd = [
        "fastANI",
        "--ql", str(ql_list),
        "--rl", str(rl_list),
        "-o", str(output_file),
        "--matrix",
        "-t", str(threads),
    ]

    try:
        subprocess.run(fastani_cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable fastANI binary, which no
        # longer surfaces as a shell exit status once the shell is bypassed.
        print(f"Error while running FastANI: {e}")
    else:
        print("FastANI completed successfully!")
        
        
        
def main():
    """Run the whole pipeline: PhageBoost, score table, FastANI inputs, FastANI.

    All locations are hard-coded below. The output directories are created if
    they are missing, so a long PhageBoost run cannot fail afterwards on a
    missing folder.
    """
    # Define paths and files
    path_klebsiella = "/home/conchae/prediction_depolymerase_tropism"
    # Parent folder of the per-strain outputs. extract_phageboost_scores()
    # appends "phageboost_prediction" itself, so it must receive this folder,
    # not the prediction folder.
    path_phageboost_root = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/phageboost"
    path_phageboost = f"{path_phageboost_root}/phageboost_prediction"
    good_strains_file = f"{path_klebsiella}/panacota_pangenome/panacota_pangenome_list.txt"
    # Kept outside the per-strain folder: prepare_fastani_input() treats every
    # entry of that folder as a strain directory.
    output_score_file = f"{path_phageboost_root}/score_distribution.phageboost.csv"
    path_fastANI = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022"
    path_phageboot_info = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_info"
    strain_ktype_file = f"{path_klebsiella}/results_kleborate_count.tsv"
    fastani_list_file = f"{path_phageboot_info}/fastANI_list.20102022.tsv"
    # NOTE(review): the inspection cell reads
    # ".../phageboost/fastANI_20102022_out/fastANI_out_20102022" -- confirm
    # whether the result is moved by hand or this path should point there.
    fastani_output_file = f"{path_klebsiella}/prophage_prediction/prophage_similarity/phageboost/fastANI_out_20102022"

    for directory in (path_phageboost, path_fastANI, path_phageboot_info):
        os.makedirs(directory, exist_ok=True)

    # Step 1: Prophage Annotation
    print("Starting prophage annotation with PhageBoost...")
    run_phageboost(path_klebsiella, path_phageboost, good_strains_file)

    # Step 2: Extract Prediction Scores
    print("Extracting PhageBoost prediction scores...")
    extract_phageboost_scores(path_phageboost_root, output_score_file)

    # Step 3: Prepare FastANI input files
    print("Preparing FastANI input files...")
    prepare_fastani_input(path_phageboost, path_fastANI, path_phageboot_info, strain_ktype_file)

    # Step 4: Write FastANI list
    print("Writing FastANI list...")
    write_fastani_list(path_fastANI, fastani_list_file)

    # Step 5: Run FastANI (all-vs-all: the same list is query and reference)
    print("Running FastANI...")
    run_fastani(fastani_list_file, fastani_list_file, fastani_output_file)

    print("Pipeline complete.")

# Execute main function
if __name__ == "__main__":
    main()


> Inspecting the FastANI output

In [None]:
import os
import pandas as pd
from tqdm import tqdm

# Paths to data
path_fastani = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022_out"
path_phages = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022"
path_ktype = "/home/conchae/prediction_depolymerase_tropism"

# Load the fastANI results and pre-filter for ANI ≥ 90
fastani_columns = ["Query", "Reference_genome", "ANI", "fragments", "total_fragments"]
fastani_df = pd.read_csv(f"{path_fastani}/fastANI_out_20102022", sep="\t", names=fastani_columns)
fastani_df = fastani_df[fastani_df["ANI"] >= 90]

# Initialize families
families = []
fastani_dict = fastani_df.to_dict('records')

# Every prophage shows up in many rows, so each FASTA file is read once and
# its length is remembered here.
_sequence_lengths = {}


def _sequence_length(fasta_path):
    """Return the length of the second line of ``fasta_path`` (cached)."""
    if fasta_path not in _sequence_lengths:
        with open(fasta_path) as handle:
            _sequence_lengths[fasta_path] = len(handle.read().split("\n")[1])
    return _sequence_lengths[fasta_path]


# Build families of related prophages based on ANI ≥ 99 and coverage ≥ 80%
for row in tqdm(fastani_dict):
    ani = float(row["ANI"])
    coverage = float(row["fragments"]) / float(row["total_fragments"])

    # Check the ANI and coverage thresholds
    if not (ani >= 99 and coverage >= 0.80):
        continue

    # Lengths of the query and reference sequences
    l_query = _sequence_length(row['Query'])
    l_refer = _sequence_length(row['Reference_genome'])

    # The shorter sequence must be at least 80% as long as the longer one
    length_check = min(l_query, l_refer) / max(l_query, l_refer) >= 0.80
    if not length_check:
        continue

    # Prophage names are the file names at the end of the paths
    prophage_1 = row["Query"].split("/")[-1]
    prophage_2 = row["Reference_genome"].split("/")[-1]
    pair = {prophage_1, prophage_2}

    # Add the pair to the first family it overlaps with; otherwise start a
    # new family. A pair bridging two existing families only joins the first
    # one here, so families may still overlap after this loop.
    for cluster in families:
        if not cluster.isdisjoint(pair):
            cluster.update(pair)
            break
    else:
        families.append(pair)

# Write the families in two layouts: one row per member ("info" file) and one
# row per family with comma-joined members.
output_clusters = f"{path_fastani}/clusters_99_80.info.2004.v2.tsv"
output_members = f"{path_fastani}/clusters_99_80.2004.v2.tsv"

with open(output_clusters, 'w') as outfile_cluster, open(output_members, 'w') as outfile_member:
    outfile_cluster.write("Family_index\tMember\n")
    outfile_member.write("Family_index\tMembers\n")

    for family_number, family_members in enumerate(families):
        outfile_member.write(f"{family_number}\t{','.join(family_members)}\n")
        outfile_cluster.writelines(
            f"Family {family_number}\t{name}\n" for name in family_members
        )

# Reload the families written above as a list of member sets
families_df = pd.read_csv(output_members, sep="\t")
families_set = [set(fam.split(",")) for fam in families_df["Members"]]

# Merge overlapping families until a full pass performs no merge. A fixed
# number of passes is not guaranteed to be enough: a family absorbed late in
# one pass can connect its new owner to a family that was already compared
# earlier in that same pass. n_iterations records how many passes were run.
n_iterations = 0
while True:
    n_iterations += 1
    updated_families = []
    merged = [False] * len(families_set)

    for i, cluster in tqdm(enumerate(families_set)):
        # Families already absorbed by an earlier one are dropped
        if merged[i]:
            continue
        for j in range(i + 1, len(families_set)):
            if not cluster.isdisjoint(families_set[j]):
                cluster.update(families_set[j])
                merged[j] = True

        updated_families.append(cluster)

    families_set = updated_families

    # No merge in this pass means all remaining families are pairwise disjoint
    if not any(merged):
        break

# Final cleaned clusters, followed by the "loner" prophages
cleaned_output = f"{path_fastani}/clusters_99_80.clean.2004.v2.tsv"

# One set lookup per file instead of scanning every family for every file
clustered_phages = set().union(*families_set)

# "Loners" are files of path_phages that belong to no family
loners = [phage for phage in tqdm(os.listdir(path_phages)) if phage not in clustered_phages]

with open(cleaned_output, 'w') as outfile:
    outfile.write("Family_index\tMembers\n")
    for index_f, family in tqdm(enumerate(families_set)):
        cluster_list = ",".join(family)
        outfile.write(f"Family_{index_f}\t{cluster_list}\n")

    # Loners follow the families, one per row, all labelled "Loner"
    for loner in loners:
        outfile.write(f"Loner\t{loner}\n")

# Reload the cleaned table and flatten every member name into one list
cluster_df = pd.read_csv(cleaned_output, sep="\t")
all_phages = [name for members in cluster_df["Members"] for name in members.split(",")]

# Split the table into loner rows and family rows
is_loner = cluster_df["Family_index"] == "Loner"
loners_df = cluster_df[is_loner]
family_df = cluster_df[~is_loner]

# Flat table: one line per prophage, labelled with the index of its row in
# the cleaned table (families and loners alike)
final_output = f"{path_fastani}/clusters_99_80.extra_clean.2004.v2.tsv"
with open(final_output, 'w') as outfile:
    outfile.write("prophage_id\tprophage\n")
    for idx, row in tqdm(cluster_df.iterrows()):
        outfile.writelines(
            f"prophage_{idx}\t{member}\n" for member in row["Members"].split(",")
        )