# **II. Prophages**<br>
### The goal is to predict the prophages and to identify the strains in which they are present
## 1. Prophage prediction 
## 2. FastANI computation
## 3. Inspecting FastANI output


***
### 1. Prophage prediction 

> The prediction command

In [None]:
# Prophage annotation : 
#***********************************************************  
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random                                   
# Root of the project tree; species folders (names starting with "k") are looked up directly below it.
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"
# One sub-directory per strain is created here to receive the PhageBoost output.
path_phageboost="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/phageboost/phageboost_prediction"

# Strain identifiers listed in panacota_pangenome_list.txt, one per line.
# A set keeps the membership test below O(1); the context manager closes the handle.
with open(f"{path_klebsiella}/panacota_pangenome/panacota_pangenome_list.txt") as strain_list:
    good_strains = set(strain_list.read().split("\n"))

for specie in os.listdir(path_klebsiella):
    if not (specie[0] == "k" and os.path.isdir(f"{path_klebsiella}/{specie}")):
        continue
    strains = os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria")
    # The strains are visited in a random order
    # (presumably so that concurrent jobs do not all start on the same strain - TODO confirm).
    for strain in random.sample(strains, len(strains)):
        if strain not in good_strains:
            continue
        path_fna = f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.fna"
        path_prophage = f"{path_phageboost}/{strain}"
        try:
            mkdir(path_prophage)
        except FileExistsError:
            print("The output for phageboost already exists for some reason. We shall continue")
        # An empty output directory means the strain has not been processed yet.
        if len(os.listdir(path_prophage)) == 0:
            exit_status = system(f"PhageBoost -f {path_fna} -o {path_prophage}  --threads 4")
            if exit_status != 0:
                # Report the failure instead of swallowing it; the marker below is still
                # written so that the skip-on-rerun behaviour is unchanged.
                print(f"PhageBoost returned a non-zero exit status ({exit_status}) for {strain}")
            with open(f"{path_prophage}/process_done", "w") as outfile:
                outfile.write("This strain has been studied")
                    
#!/bin/bash
#SBATCH --job-name=PhageBoost_cmd
#SBATCH --partition=medium 
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=40
#SBATCH --mem=125gb 
#SBATCH --time=7-00:00:00 
#SBATCH --output=PhageBoost_cmd%j.log 

# SLURM submission script: activates the PhageBoost conda environment and runs the prediction driver.
# NOTE(review): 40 CPUs are requested; if the driver is the Python script shown above,
# it calls PhageBoost with --threads 4 one strain at a time - confirm the request is intended.

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate PhageBoost-env

python3 /home/conchae/prediction_depolymerase_tropism/script_files/prophage_prediction/phageboost_script.py


> Getting the prediction score for each prophage

In [None]:
# Writing some info files ...
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os

path_phageboost="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/phageboost"

# Writes one "strain,score" line per entry found in the "phages*" summary file of each strain.
with open(f"{path_phageboost}/score_distribution.phageboost.csv", "w") as outfile:
    for strain in os.listdir(f"{path_phageboost}/phageboost_prediction"):
        strain_dir = f"{path_phageboost}/phageboost_prediction/{strain}"
        strain_files = os.listdir(strain_dir)
        # Strain directories holding two files or fewer are skipped.
        if len(strain_files) <= 2:
            continue
        for file in strain_files:
            if not file.startswith("phages"):
                continue
            # Read inside a context manager so the handle is closed for every file;
            # the first two lines of the summary are not data rows and are dropped.
            with open(f"{strain_dir}/{file}") as summary:
                info_file = summary.read().split("\n")[2:]
            for info in info_file:
                if info:
                    # Sixth tab-separated column is taken as the prediction score.
                    score = info.split("\t")[5]
                    outfile.write(f"{strain},{score}\n")



***
### 2. FastANI computation

<div class="alert alert-block alert-success">

> The actual command 

In [None]:
# *******************************************************************************************************************************************
# The fastANI command :
# *******************************************************************************************************************************************

import os
import pandas as pd
from tqdm import tqdm

path_phageboot_info="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_info"
path_phageboost_pred="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_prediction"
path_fastANI_2="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022"
path_ktype="/home/conchae/prediction_depolymerase_tropism"


def _names_prophage(filename, prophage_id):
    """Return True when *filename* contains *prophage_id* not directly followed by a digit.

    A plain substring test would let the id "phage1" also match the files of
    "phage10", "phage11", ... and pick up the wrong sequence.
    """
    position = filename.find(prophage_id)
    while position != -1:
        after = position + len(prophage_id)
        if not filename[after:after + 1].isdigit():
            return True
        position = filename.find(prophage_id, position + 1)
    return False


# But first, the strain_ktype dictionary : first column -> third column of the kleborate table.
strain_ktype={}
with open(f"{path_ktype}/results_kleborate_count.tsv") as ktype_table:
    for info in ktype_table.read().split("\n"):
        if info:
            fields = info.split("\t")
            strain_ktype[fields[0].strip()] = fields[2].strip()

with open(f"{path_phageboot_info}/results_phageboost.70.20102022.tsv","w") as outfile1 :
    outfile1.write("Prophage_name\tProphage_length\tN_genes\tScore\tK_type\n")
    for strain in tqdm(os.listdir(path_phageboost_pred)):
        strain_dir = f"{path_phageboost_pred}/{strain}"
        strain_files = os.listdir(strain_dir)
        # Opening the resume file of phageboost prediction :
        for file in strain_files:
            if not file.startswith("phages"):
                continue
            try :
                resume = pd.read_csv(f"{strain_dir}/{file}", skiprows=1, sep="\t")
            except Exception as error:
                # Skip this file: falling through would reuse the table of a previous
                # strain (or hit an undefined name on the very first one).
                print(f"Seems like there is no prophage for {strain} ({file}): {error}")
                continue
            # Scanning the file for phage with a score >= 0.70
            for _, info in resume.iterrows():
                if float(info["score"]) >= 0.70 :
                    # Getting the prophage info :
                    prophage_id = info["attributes"].split("phage_id=")[1].split(";")[0]
                    # The table uses GFF3 column names, whose coordinates are 1-based and
                    # inclusive, hence end - start + 1
                    # (NOTE(review): confirm PhageBoost follows that convention).
                    prophage_len = int(info["end"]) - int(info["start"]) + 1
                    n_genes = info["attributes"].split("n_genes=")[1].split(";")[0]
                    for file2 in strain_files:
                        if _names_prophage(file2, prophage_id):
                            # NOTE(review): only the second line of the file is taken as the
                            # sequence; a line-wrapped FASTA would be truncated - confirm.
                            with open(f"{strain_dir}/{file2}") as fasta_in:
                                seq = fasta_in.read().split("\n")[1]
                            fasta_out = f"{path_fastANI_2}/{strain}__{prophage_id}.fasta"
                            if not os.path.isfile(fasta_out):
                                with open(fasta_out, "w") as outfile :
                                    outfile.write(f">{strain}__{prophage_id}\n{seq}")
                    outfile1.write(f"{strain}__{prophage_id}\t{str(prophage_len)}\t{n_genes}\t{info['score']}\t{strain_ktype[strain]}\n")





#*****************************************************************************************************************************************
#!/bin/bash
#SBATCH --job-name=writting_phb
#SBATCH --partition=small 
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=5
#SBATCH --mem=15gb 
#SBATCH --time=0-05:00:00 
#SBATCH --output=writting_phb%j.log 

# SLURM submission script: activates the ScaleAP conda environment and runs the FastANI preparation script.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate ScaleAP

python3 /home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/script_files/py_files/prepare_fastani.20102022.py

#*****************************************************************************************************************************************        
# Step 2 :
# Writing the path file :

# Lists every entry of the prophage FASTA directory, one full path per line.
with open(f"{path_phageboot_info}/fastANI_list.20102022.tsv","w") as outfile :
    listed_paths = (f"{path_fastANI_2}/{entry}\n" for entry in tqdm(os.listdir(path_fastANI_2)))
    outfile.writelines(listed_paths)
        
        
#*****************************************************************************************************************************************    
# Step 3 :        
# FastANI commands : 

#!/bin/bash
#SBATCH --job-name=fatANI_phb
#SBATCH --partition=medium 
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=40
#SBATCH --mem=200gb 
#SBATCH --time=4-00:00:00 
#SBATCH --output=fatANI_phb%j.log 

module restore la_base
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate fastani

# All-vs-all comparison: the same list file is given as query list (--ql) and reference list (--rl).
# NOTE(review): the output is written to .../phageboost/fastANI_out_20102022, whereas the inspection
# cell reads .../phageboost/fastANI_20102022_out/fastANI_out_20102022 - confirm which path is intended.
fastANI  --ql /home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_info/fastANI_list.20102022.tsv --rl /home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_info/fastANI_list.20102022.tsv -o /home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_out_20102022  --matrix  -t 40

***
### 3. Inspecting FastANI output 

> First round inspection : get the pairs of prophages with an ANI >= 99% and a coverage (matched fragments / total fragments) >= 80%


In [None]:
# *******************************************************************************************************************************************
# Inspecting the fastANI outputs :
# *******************************************************************************************************************************************

import os
import pandas as pd
from tqdm import tqdm


path_fastani="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022_out"
path_ktype="/home/conchae/prediction_depolymerase_tropism"


def _find_root(parent, member):
    """Return the representative of *member* in the union-find forest *parent*.

    Unknown members are registered as their own root; the visited path is
    compressed so that later look-ups stay short.
    """
    parent.setdefault(member, member)
    root = member
    while parent[root] != root:
        root = parent[root]
    while parent[member] != root:
        next_member = parent[member]
        parent[member] = root
        member = next_member
    return root


# Generating the dico with the k_type info for each strain (first column -> third column)
strain_ktype={}
with open(f"{path_ktype}/results_kleborate_count.tsv") as ktype_table:
    for info in ktype_table.read().split("\n"):
        if info:
            fields = info.split("\t")
            strain_ktype[fields[0].strip()] = fields[2].strip()

fastani_names = ["Query","Reference_genome","ANI","fragments","total_fragments"]
# NOTE(review): nrows=10000 restricts the analysis to the first 10,000 rows of the FastANI
# table; if that was only meant for a trial run, remove it before the full run.
fastani_df = pd.read_csv(f"{path_fastani}/fastANI_out_20102022",sep="\t", names = fastani_names, nrows=10000)

# Families are the connected components of the graph whose edges are the pairs passing both
# thresholds. A union-find guarantees that a pair linking two existing families merges them,
# instead of only extending the first family found (which left overlapping families).
parent = {}
fastani_dict = fastani_df.to_dict('records')
for row in tqdm(fastani_dict) :
    if float(row["ANI"]) >=99 and float(row["fragments"])/float(row["total_fragments"])>=0.80:
        prophage_1 = row["Query"].split("/")[-1]
        prophage_2 = row["Reference_genome"].split("/")[-1]
        root_1 = _find_root(parent, prophage_1)
        root_2 = _find_root(parent, prophage_2)
        if root_1 != root_2:
            parent[root_2] = root_1

# Group the members by representative, keeping the order of first appearance.
grouped = {}
for member in list(parent):
    grouped.setdefault(_find_root(parent, member), set()).add(member)
families = list(grouped.values())

# Two views of the same result: one row per member (.info.tsv) and one row per family (.tsv).
with open(f"{path_fastani}/clusters_99_80.info.tsv",'w') as outfile, \
        open(f"{path_fastani}/clusters_99_80.tsv",'w') as outfile_cluster :
    outfile.write("Family_index\tMember\n")
    outfile_cluster.write("Family_index\tMembers\n")
    for index_c, cluster in enumerate(families) :
        # Sorted so that the files are identical from one run to the next.
        cluster_c_l = sorted(cluster)
        for member in cluster_c_l :
            outfile.write(f"family {index_c}\t{member}\n")
        outfile_cluster.write(f"{index_c}\t")
        outfile_cluster.write(",".join(cluster_c_l))
        outfile_cluster.write("\n")

# *******************************************************************************************************************************************                    
#!/bin/bash
#SBATCH --job-name=post_ANI2_
#SBATCH --partition=short 
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=10
#SBATCH --mem=10gb 
#SBATCH --time=1-00:00:00 
#SBATCH --output=post_ANI2_%j.log 

# SLURM submission script: activates the ScaleAP conda environment and runs the FastANI post-processing script.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate ScaleAP

python3 /home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/script_files/py_files/post_fastani.20102022.py
# *******************************************************************************************************************************************                    

> Fix the families 

In [None]:
# Merge the overlapping families, then check the integrity of the resulting table :

import os
import pandas as pd
from tqdm import tqdm


path_fastani="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022_out"
path_phages = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022"
path_ktype="/home/conchae/prediction_depolymerase_tropism"


def _merge_overlapping(clusters):
    """Merge every group of clusters sharing at least one member.

    Returns a list of pairwise-disjoint sets, in order of first appearance.
    Unlike a fixed number of pairwise passes, this always reaches the
    transitive closure, however long the chain of overlaps is.
    """
    merged = []   # family slots; a slot absorbed by another one becomes None
    owner = {}    # member -> index of the slot currently holding it
    for cluster in clusters:
        touched = sorted({owner[member] for member in cluster if member in owner})
        if touched:
            target = touched[0]
            # The incoming cluster bridges several families: fold them into the first.
            for other in touched[1:]:
                for member in merged[other]:
                    owner[member] = target
                merged[target].update(merged[other])
                merged[other] = None
        else:
            target = len(merged)
            merged.append(set())
        merged[target].update(cluster)
        for member in cluster:
            owner[member] = target
    return [family for family in merged if family is not None]


families = pd.read_csv(f"{path_fastani}/clusters_99_80.tsv", header = 0, sep='\t')
families_set = [set(fam.split(",")) for fam in families["Members"]]

# A single exact merge replaces the repeated cleaning passes.
clean_families_2 = _merge_overlapping(families_set)
# Kept so that the name stays defined for any code relying on it.
clean_families = clean_families_2

# *******************************************************************************************************************************************************************

with open(f"{path_fastani}/clusters_99_80.clean.tsv","w") as outfile :
    outfile.write("Family_index\tMembers\n")
    clustered = set()
    for index_f, family in tqdm(enumerate(clean_families_2)) :
        # Sorted so that the file is identical from one run to the next.
        cluster_list = ",".join(sorted(family))
        outfile.write(f"Family_{index_f}\t{cluster_list}\n")
        clustered.update(family)
    # Every file of the prophage directory that belongs to no family is reported as a loner;
    # one set lookup per phage replaces a scan over all the families.
    for phage in tqdm(os.listdir(path_phages)):
        if phage not in clustered:
            outfile.write(f"Loner\t{phage}\n")


# Check the integrity of the files :
cluster = pd.read_csv(f"{path_fastani}/clusters_99_80.clean.tsv", header = 0, sep="\t")
cluster_dict = cluster.to_dict("records")

# Flat list of every member written to the clean table (families and loners alike).
phages = []
for row in tqdm(cluster_dict) :
    phages.extend(row["Members"].split(","))

loners_df = cluster[cluster["Family_index"]=="Loner"]
fammmm_df = cluster[cluster["Family_index"]!="Loner"]

> Create directory phageboost with the new prophage name 

In [None]:
# Pre step : 
# Create a tmp with all the candidates :

import os
import pandas as pd
import random
import sys
import subprocess
from tqdm import tqdm


path_phageboost_pred="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_prediction"
path_fasta="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022"
path_fastani="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_out_20102022"
path_decipher = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
path_ktype="/home/conchae/prediction_depolymerase_tropism"


# Generating the dico with the k_type info for each strain
# NOTE(review): this cell reads the K-type from the second column (index 1) whereas the other
# cells of the notebook read the third one (index 2) - confirm which column is correct.
strain_ktype={}
with open(f"{path_ktype}/results_kleborate_count.tsv") as ktype_table:
    for info in ktype_table.read().split("\n"):
        if info:
            fields = info.split("\t")
            strain_ktype[fields[0].strip()] = fields[1].strip()


# For every prophage FASTA named "<strain>__<prophage_id>.fasta": copy it into the strain folder,
# then write the genes located inside the prophage whose protein is longer than 200 residues.
for phage in tqdm(os.listdir(path_fasta)):
    fasta_name = phage.split("\t")[0]
    strain = fasta_name.split("__")[0]
    prophage_id = fasta_name.split("__")[1].split(".fasta")[0]
    prophage = fasta_name.split(".fasta")[0]
    print(strain,prophage_id,prophage)
    try :
        os.mkdir(f"{path_decipher}/{strain}")
    except FileExistsError:
        pass
    # The existence test must target the copied file itself (with its .fasta extension),
    # otherwise it never succeeds and the copy is redone on every run.
    fasta_copy = f"{path_decipher}/{strain}/{prophage}.fasta"
    if not os.path.isfile(fasta_copy):
        # Argument list instead of a shell string: no quoting issue with unusual file names.
        copy_fasta_process = subprocess.Popen(["cp", f"{path_fasta}/{prophage}.fasta", fasta_copy], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        press_out, press_err = copy_fasta_process.communicate()
        if copy_fasta_process.returncode != 0:
            print(f"Copy of {prophage}.fasta failed: {press_out}")
    genecall = pd.read_csv(f"{path_phageboost_pred}/{strain}/genecalls_{strain.split('.')[0]}.gff3", sep="\t")
    # Reset for each prophage so that a missing summary cannot silently reuse the previous one.
    resume = None
    for file in os.listdir(f"{path_phageboost_pred}/{strain}"):
        if file[0:6]=="phages":
            resume = pd.read_csv(f"{path_phageboost_pred}/{strain}/{file}", skiprows=1, sep="\t")
            break
    if resume is None:
        print(f"No phages summary file found for {strain}; skipping {prophage}")
        continue
    print(resume)
    # Same precaution for the attribute line describing this prophage.
    attribute_line = None
    for attri in resume["attributes"].to_list():
        if attri.split("phage_id=")[1] == prophage_id :
            attribute_line = attri
            break
    if attribute_line is None:
        print(f"{prophage_id} is not described in the summary of {strain}; skipping {prophage}")
        continue
    prophage_rows = resume[resume["attributes"] == attribute_line]
    frag_id = prophage_rows["#seqid"].values[0]
    start_genome, stop_genome = prophage_rows["start"].values[0], prophage_rows["end"].values[0]
    print(frag_id ,start_genome, stop_genome)
    genecall_frag = genecall[genecall["contig"] == frag_id]
    genecall_frag_dict = genecall_frag.to_dict('records')
    with open(f"{path_decipher}/{strain}/{prophage_id}.multi.candidates.faa", "w") as outfile_faa, \
            open(f"{path_decipher}/{strain}/{prophage_id}.multi.candidates.ffn", "w") as outfile_ffn :
        for line in genecall_frag_dict :
            # Same half-open interval as range(start, stop), as a direct comparison:
            # membership in a range is a linear scan when the value is not a built-in int.
            if start_genome <= line["start"] < stop_genome :
                nt_seq, aa_seq, prot_id = line["DNAseq"] , line["AAseq"],  line["id"]
                print(len(aa_seq), "Protein_id : ", prot_id)
                if len(aa_seq) > 200 :
                    outfile_faa.write(f">{strain}__{prophage_id}__{prot_id}\n{aa_seq}\n")
                    outfile_ffn.write(f">{strain}__{prophage_id}__{prot_id}\n{nt_seq}\n")
                    print(f">{strain}__{prophage_id}__{prot_id}\n{aa_seq}\n", f">{strain}__{prophage_id}__{prot_id}\n{nt_seq}\n")
                
    
# *******************************************************************************************************************************************************************
#!/bin/bash
#SBATCH --job-name=candidates
#SBATCH --partition=short 
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=10
#SBATCH --mem=10gb 
#SBATCH --time=1-00:00:00 
#SBATCH --output=candidates%j.log 

# SLURM submission script: activates the ScaleAP conda environment and runs the candidate extraction script.
module restore la_base
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate ScaleAP

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/script_files/py_files/post_fastani_pt3.py
# *******************************************************************************************************************************************************************
import os
import pandas as pd
import random
import sys
import subprocess
from tqdm import tqdm


path_phageboost_pred="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_prediction"
path_fasta="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_20102022"
path_fastani="/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/fastANI_out_20102022"
path_decipher = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
path_ktype="/home/conchae/prediction_depolymerase_tropism"


# Splits every "*candidates.faa" multi-FASTA of each strain into one file per sequence under
# <strain>/tmp/<prophage>/, and prepares the matching <strain>/hmmer_out/<prophage>/ folder.
for strain in os.listdir(path_decipher):
    strain_dir = f"{path_decipher}/{strain}"
    if not os.path.isdir(strain_dir):
        # Stray files at the top level cannot be listed as strain folders.
        continue
    hmmer_out = f"{strain_dir}/hmmer_out"
    tmp = f"{strain_dir}/tmp"
    # Each folder is created independently: with both mkdir calls in a single try block,
    # an already existing first folder prevented the second one from ever being created.
    os.makedirs(hmmer_out, exist_ok=True)
    os.makedirs(tmp, exist_ok=True)
    for file in os.listdir(strain_dir):
        if not file.endswith("candidates.faa"):
            continue
        with open(f"{strain_dir}/{file}") as multi_fasta:
            candidates = multi_fasta.read().split(">")
        # The file name starts with the prophage identifier, up to the first dot.
        prophage_name = file.split(".")[0]
        os.makedirs(f"{tmp}/{prophage_name}", exist_ok=True)
        os.makedirs(f"{hmmer_out}/{prophage_name}", exist_ok=True)
        path_out = f"{hmmer_out}/{prophage_name}"
        for seq_faa in candidates :
            # Splitting on ">" yields an empty first chunk, which is ignored.
            if seq_faa :
                seq_name = seq_faa.split("\n")[0]
                seq_path = f"{tmp}/{prophage_name}/{seq_name}.fasta"
                # Existing files are left untouched so the script can be rerun.
                if not os.path.isfile(seq_path):
                    with open(seq_path, "w") as outfile :
                        outfile.write(f">{seq_faa}")

# *******************************************************************************************************************************************
#!/bin/bash
#SBATCH --job-name=tmp_file
#SBATCH --partition=short 
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=5
#SBATCH --mem=10gb 
#SBATCH --time=1-00:00:00 
#SBATCH --output=tmp_file%j.log 

# SLURM submission script: activates the ScaleAP conda environment and runs the per-sequence splitting script.
module restore la_base
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate ScaleAP

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_similarity/phageboost/script_files/py_files/create_tmp.py