# **I. Klebsiella strains** <br>
### The point of this part is to go from the set of Klebsiella genomes to a phylogenetic tree. The work was mainly done through the PanACoTA pipeline (https://github.com/gem-pasteur/PanACoTA). Here are the steps:
## 1. Download the Klebsiella genomes 
## 2. Capsule prediction by Kleborate.
####  Only the strains with at least a 'good' level of prediction will be kept for the rest of the process 
#### Screen the resistance genes in order to obtain a virulence score 
## 3. Annotate the genomes
## 4. Generate a pan-genome
## 5. Generate a core-genome
## 6. Align the core genome
## 7. Infer the phylogenetic tree from the previous alignment
    
https://github.com/gem-pasteur/PanACoTA
***

### 1. Download the Klebsiella genomes

In [None]:
# The prepare command in order to download all the Kpn sequences from refseq
# Changed the 573 to 570 to get all the genus of klebsiella

#!/bin/bash
# Slurm job: run the PanACoTA "prepare" step (-T 570) inside the Singularity image.
# Every directive must start with "#SBATCH"; a "#BATCH" line is an ordinary comment
# and its option is silently ignored.
#SBATCH --job-name=k_indica
#SBATCH --partition=short
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=15
#SBATCH --mem=75gb
#SBATCH --time=1-00:00:00
#SBATCH --output=k_indica%j.log

module restore la_base
module load singularity

singularity run /home/conchae/prediction_depolymerase_tropism/panacota.img prepare -T 570 -o /home/conchae/prediction_depolymerase_tropism/

***
### 2. Capsule prediction with Kleborate 

In [None]:
# Kleborate step : 
# Annotation with Kleborate, we only keep the bacteria for which the K-type is clearly identified 


#Python version : 
import os
from os import system, listdir, mkdir
import random
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"


def run_kaptive_for_species(specie):
    """Run Kleborate K-locus typing on every ``.fna`` genome of one species folder.

    Genomes are read from ``<specie>/Database_init`` and the Kaptive table is
    written to ``<specie>/refseq/bacteria/<accession>/<accession>_Kaptive_out.txt``.
    Genomes whose table already exists are skipped, so the step can be resumed.
    """
    path_in=f"{path_klebsiella}/{specie}/Database_init"
    fasta_files=os.listdir(path_in)
    # Visit the genomes in shuffled order (presumably so that concurrent jobs
    # rarely work on the same genome at the same time -- TODO confirm).
    for fasta in random.sample(fasta_files, len(fasta_files)):
        if not fasta.endswith(".fna"):
            continue
        # Accession = first two "_"-separated fields of the file name.
        rep="_".join(fasta.split("_")[0:2])
        path_out=f"{path_klebsiella}/{specie}/refseq/bacteria/{rep}"
        if not os.path.isfile(f"{path_out}/{rep}_Kaptive_out.txt"):
            system(f"kleborate --kaptive_k --kaptive_k_outfile {path_out}/{rep}_Kaptive_out.txt -a {path_in}/{fasta}")


# Only real species folders: plain files whose name starts with "k" (for instance
# result tables written into this directory) must not be listed as directories.
species_dirs=[specie for specie in os.listdir(path_klebsiella)
              if specie.startswith("k") and os.path.isdir(f"{path_klebsiella}/{specie}/Database_init")]

# Kleborate small :
for specie in species_dirs:
    if specie != "k_pneumoniae":
        run_kaptive_for_species(specie)

# Kleborate big :
for specie in species_dirs:
    if specie == "k_pneumoniae":
        run_kaptive_for_species(specie)

# SH version : 
#!/bin/bash
# Slurm job: run the Kleborate K-typing Python script inside the kaptive_env conda environment.
# The job-name line must read "#SBATCH"; "#BATCH" is ignored by sbatch.
#SBATCH --job-name=kleborate_big
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=40
#SBATCH --mem=75gb
#SBATCH --time=7-00:00:00
#SBATCH --output=big_kleborate%j.log

module restore la_base
module load singularity
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate kaptive_env

python3 /home/conchae/prediction_depolymerase_tropism/script_files/py_file/kleborate_big.py

> Track the progress of the command 

In [None]:
# Follow-up script :
import os
from os import system, listdir, mkdir
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"

# Progress report for the Kleborate K-typing step: share of the genome folders
# under <specie>/refseq/bacteria that already contain a Kaptive output table.
species_dirs=[specie for specie in os.listdir(path_klebsiella)
              if specie[0]=="k" and os.path.isdir(f"{path_klebsiella}/{specie}")]

n_total=0
n_done=0
for specie in species_dirs:
    path_bacteria=f"{path_klebsiella}/{specie}/refseq/bacteria"
    for rep in os.listdir(path_bacteria):
        n_total=n_total+1
        if os.path.isfile(f"{path_bacteria}/{rep}/{rep}_Kaptive_out.txt"):
            n_done=n_done+1

# Percentage (0-100); guarded so an empty listing does not divide by zero.
ratio=(n_done/n_total)*100 if n_total else 0.0
message=f"The operation is completed at : {ratio} %"

# The report file used to be truncated and left empty; it now holds the same
# line that is printed.
with open(f"{path_klebsiella}/completion_kleborate_step.txt","w") as outfile:
    outfile.write(message+"\n")
print(message)

> Writing the file with the Kleborate results

In [None]:
# Kleborate resume file : 
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random                                   
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"     

k_specie={"k_aerogenes" : "Klebsiella aerogenes",
          "k_africana" : "Klebsiella africana",
          "k_grimatii" : "Klebsiella grimontii",
          "k_huaxiensis" : "Klebsiella huaxiensis",
          "k_indica" : "Klebsiella indica",
          "k_michiganesis" : "Klebsiella michiganensis",
          "k_oxytoca" : "Klebsiella oxytoca",
          "k_pasteurii" : "Klebsiella pasteurii",
          "k_pneumoniae" : "Klebsiella pneumoniae",
          "k_quasipneumoniae": "Klebsiella quasipneumoniae",
          "k_quasivariicola" : "Klebsiella quasivariicola",
          "k_spallanzanii" : "Klebsiella spallanzanii",
          "k_variicola" : "Klebsiella variicola"}

# Summarise the Kaptive result of every genome into one table, keeping only the
# genomes whose "Match confidence" is neither "None" nor "Low".
with open(f"{path_klebsiella}/kleborate_results_all.tsv", "w") as outfile :
    # NOTE(review): the header names 7 columns but each row below carries only
    # the first 5 (genome size and L90 are not filled in).
    outfile.write("Accession \t Specie \t K-Serotype \t Confidence \t n Missing genes in K-Locus \t Genome size \t L90 \n")
    for specie in os.listdir(path_klebsiella):
        if not (specie[0]=="k" and os.path.isdir(f"{path_klebsiella}/{specie}")):
            continue
        for rep in os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"):
            kaptive_path=f"{path_klebsiella}/{specie}/refseq/bacteria/{rep}/{rep}_Kaptive_out.txt"
            # A genome that has not been typed yet must not abort the whole summary.
            if not os.path.isfile(kaptive_path):
                print("No Kaptive output yet for :", specie, rep)
                continue
            with open(kaptive_path) as kaptive_file:
                kaptive_lines=kaptive_file.read().split("\n")
            if len(kaptive_lines)<2 or not kaptive_lines[1]:
                print("Empty Kaptive output for :", specie, rep)
                continue
            # First line = column names, second line = values for this genome.
            first_line=kaptive_lines[0].split("\t")
            secon_line=kaptive_lines[1].split("\t")
            kaptive_dic=dict(zip(first_line,secon_line))
            # Count of ";"-separated entries minus one, as in the original script.
            n_miss=len(kaptive_dic["Missing expected genes"].split(";"))-1
            if kaptive_dic["Match confidence"] not in ("None", "Low"):
                outfile.write(f"{rep} \t {k_specie[specie]} \t {kaptive_dic['Best match locus']} \t {kaptive_dic['Match confidence']} \t {str(n_miss)} \n")
                        

> Compute the resistance for each strain 

In [None]:
# Kleborate step : 
# Annotation with Kleborate, we only keep the bacteria for which the K-type is clearly identified 


#Python version : 
import os
from os import system, listdir, mkdir
import random
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"
# NOTE(review): this list is loaded but never used to filter the genomes below.
with open(f"{path_klebsiella}/results_kleborate_count.tsv") as good_strain_file:
    good_strain=good_strain_file.read().split("\n")


def run_resistance_for_species(specie):
    """Run ``kleborate --resistance`` on every ``.fna`` genome of one species folder.

    The result is written to
    ``<specie>/refseq/bacteria/<accession>/<accession>_resistance.txt``; genomes
    whose resistance file already exists are skipped so the step can be resumed.
    """
    path_in=f"{path_klebsiella}/{specie}/Database_init"
    fasta_files=os.listdir(path_in)
    for fasta in random.sample(fasta_files, len(fasta_files)):
        if not fasta.endswith(".fna"):
            continue
        rep="_".join(fasta.split("_")[0:2])
        path_out=f"{path_klebsiella}/{specie}/refseq/bacteria/{rep}"
        # Test the file this step produces. Testing the Kaptive table (as before)
        # skipped every genome that had already been K-typed.
        if not os.path.isfile(f"{path_out}/{rep}_resistance.txt"):
            os.system(f"kleborate --resistance -a {path_in}/{fasta} -o  {path_out}/{rep}_resistance.txt")


# Only real species folders; plain files starting with "k" are ignored.
species_dirs=[specie for specie in os.listdir(path_klebsiella)
              if specie.startswith("k") and os.path.isdir(f"{path_klebsiella}/{specie}/Database_init")]

# Kleborate small :
for specie in species_dirs:
    if specie != "k_pneumoniae":
        run_resistance_for_species(specie)

# Kleborate big :
for specie in species_dirs:
    if specie == "k_pneumoniae":
        run_resistance_for_species(specie)

# SH version : 
#!/bin/bash
# Slurm job: run the Kleborate Python script inside the kaptive_env conda environment.
# The job-name line must read "#SBATCH"; "#BATCH" is ignored by sbatch.
#SBATCH --job-name=kleborate_big
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=40
#SBATCH --mem=75gb
#SBATCH --time=7-00:00:00
#SBATCH --output=big_kleborate%j.log

module restore la_base
module load singularity
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate kaptive_env

python3 /home/conchae/prediction_depolymerase_tropism/script_files/py_file/kleborate_big.py

In [None]:
# Rebuild the Kaptive summary table (same logic as the earlier summary cell),
# keeping only genomes whose "Match confidence" is neither "None" nor "Low".
# Relies on `k_specie` having been defined by a previously executed cell.
with open(f"{path_klebsiella}/kleborate_results_all.tsv", "w") as outfile :
    # NOTE(review): the header names 7 columns but each row below carries only
    # the first 5 (genome size and L90 are not filled in).
    outfile.write("Accession \t Specie \t K-Serotype \t Confidence \t n Missing genes in K-Locus \t Genome size \t L90 \n")
    for specie in os.listdir(path_klebsiella):
        if not (specie[0]=="k" and os.path.isdir(f"{path_klebsiella}/{specie}")):
            continue
        for rep in os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"):
            kaptive_path=f"{path_klebsiella}/{specie}/refseq/bacteria/{rep}/{rep}_Kaptive_out.txt"
            # A genome that has not been typed yet must not abort the whole summary.
            if not os.path.isfile(kaptive_path):
                print("No Kaptive output yet for :", specie, rep)
                continue
            with open(kaptive_path) as kaptive_file:
                kaptive_lines=kaptive_file.read().split("\n")
            if len(kaptive_lines)<2 or not kaptive_lines[1]:
                print("Empty Kaptive output for :", specie, rep)
                continue
            # First line = column names, second line = values for this genome.
            first_line=kaptive_lines[0].split("\t")
            secon_line=kaptive_lines[1].split("\t")
            kaptive_dic=dict(zip(first_line,secon_line))
            # Count of ";"-separated entries minus one, as in the original script.
            n_miss=len(kaptive_dic["Missing expected genes"].split(";"))-1
            if kaptive_dic["Match confidence"] not in ("None", "Low"):
                outfile.write(f"{rep} \t {k_specie[specie]} \t {kaptive_dic['Best match locus']} \t {kaptive_dic['Match confidence']} \t {str(n_miss)} \n")
                

***
### 3. Annotate the genomes
Annotation with Prokka 

In [None]:
# Prokka annotation : 
# Python version 
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random  # used for the shuffle below; not part of this cell's import list

path_klebsiella="/home/conchae/prediction_depolymerase_tropism"

# Annotate every k_pneumoniae ".fna" genome with Prokka, skipping genomes whose
# output folder already exists so the run can be resumed.
# NOTE(review): this writes to "prokka_annotation", while the progress check and
# the later pan-genome cells look for "prokka_annotation_all" -- confirm which
# folder name is the intended one.
for specie in os.listdir(path_klebsiella):
    if specie == "k_pneumoniae":
        path_in=f"{path_klebsiella}/{specie}/Database_init"
        fasta_files=os.listdir(path_in)
        for fasta in random.sample(fasta_files, len(fasta_files)):
            if fasta[-4:]==".fna":
                # Accession = first two "_"-separated fields of the file name.
                rep="_".join(fasta.split("_")[0:2])
                path_out=f"{path_klebsiella}/{specie}/refseq/bacteria/{rep}"
                if not os.path.isdir(f"{path_out}/prokka_annotation"):
                    # prokka
                    system(f"prokka {path_in}/{fasta} --norrna --notrna --outdir {path_out}/prokka_annotation --prefix {rep} --compliant --force --cpus 0 ")
                                   
# ****************************************************************************************************************************************************************                     
#!/bin/bash
# Slurm job: run the Prokka annotation Python script inside the annotation_prokka conda environment.
# The job-name line must read "#SBATCH"; "#BATCH" is ignored by sbatch.
#SBATCH --job-name=prokka_small
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=40
#SBATCH --mem=75gb
#SBATCH --time=7-00:00:00
#SBATCH --output=small_prokka%j.log

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate annotation_prokka
# NOTE(review): PERL5LIB is set to a path ending in bin/perl -- confirm this is a library directory.
export PERL5LIB=/home/conchae/.conda/envs/annotation_prokka/bin/perl

python3 /home/conchae/prediction_depolymerase_tropism/script_files/py_file/prokka_annotation_small.py

# NOTE(review): this install runs after the annotation script; if Prokka needs
# XML::Simple it has to be installed before the line above -- confirm.
cpan install XML::Simple

> Check how well the command is going 

In [None]:
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random                                   
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"

# Progress report for the Prokka step: number of ".fna" genomes that already have
# a "prokka_annotation_all" folder, over the number of genome folders listed
# under <specie>/refseq/bacteria.
species_dirs=[specie for specie in os.listdir(path_klebsiella)
              if specie[0]=="k" and os.path.isdir(f"{path_klebsiella}/{specie}")]

n_total=0
n_done=0
for specie in species_dirs:
    n_total=n_total+len(os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"))
    # Order does not matter when counting, so no shuffle is needed here.
    for fasta in os.listdir(f"{path_klebsiella}/{specie}/Database_init"):
        if fasta[-4:]==".fna":
            rep="_".join(fasta.split("_")[0:2])
            path_out=f"{path_klebsiella}/{specie}/refseq/bacteria/{rep}"
            if os.path.isdir(f"{path_out}/prokka_annotation_all"):
                n_done=n_done+1

# Percentage (0-100); guarded so an empty listing does not divide by zero.
ratio=(n_done/n_total)*100 if n_total else 0.0
message=f"The operation is completed at : {ratio} %"

# The report file used to be truncated and left empty; it now holds the same
# line that is printed.
with open(f"{path_klebsiella}/completion_prokka.txt","w") as outfile:
    outfile.write(message+"\n")
print(message)

***
### 4. Generate a pan-genome

In [None]:
# The panacota pangenome step :

# The list info file (-l) option :
# -l /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/panacota_pangenome_list.txt
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random                                   
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"     


# Third version 
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random                                   
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"     


# Species folders whose genomes go into this version of the pan-genome list.
target_species={"k_pneumoniae", "k_quasipneumoniae", "k_variicola", "k_oxytoca"}

# Write one accession per line for every genome whose Kaptive "Match confidence"
# is neither "None" nor "Low".
with open(f"{path_klebsiella}/panacota_pangenome_list_v2.1.txt", "w") as outfile :
    for specie in os.listdir(path_klebsiella):
        # The directory test now applies to all four species; with the former
        # "a or b or c or d and isdir(...)" it only guarded k_oxytoca.
        if specie in target_species and os.path.isdir(f"{path_klebsiella}/{specie}"):
            for rep in os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"):
                with open(f"{path_klebsiella}/{specie}/refseq/bacteria/{rep}/{rep}_Kaptive_out.txt") as kaptive_file:
                    kaptive_lines=kaptive_file.read().split("\n")
                # First line = column names, second line = values for this genome.
                first_line=kaptive_lines[0].split("\t")
                secon_line=kaptive_lines[1].split("\t")
                kaptive_dic=dict(zip(first_line,secon_line))
                if kaptive_dic["Match confidence"] not in ("None", "Low"):
                    outfile.write(f"{rep}\n")

# NOTE(review): the list written above is "..._v2.1.txt" but the one read back
# here is "..._v2.txt" -- confirm which file is meant.
with open(f"{path_klebsiella}/panacota_pangenome_list_v2.txt") as list_file:
    # Drop empty lines so the trailing newline is not counted as a strain.
    good_strains=[line for line in list_file.read().split("\n") if line]
len(good_strains)
# **************************************************************************************************************************************************                    
# The dataset_name (-n) option :
# -n Klebsiella_genomes 

# # Getting all the protein files in the directory 
# -d /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random                                   
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"   
path_prot_files="/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
with open(f"{path_klebsiella}/panacota_pangenome/panacota_pangenome_list.txt") as list_file:
    good_strains=list_file.read().split("\n")
# Set copy for fast membership tests; `good_strains` itself stays a list.
good_strain_set=set(good_strains)

# For every selected strain, renumber the Prokka protein records as
# ">{strain}_{index} {original record}" and write them both to the per-strain
# ".prt" file and to the combined file. Each ".faa" is read once and reused
# for both outputs.
with open(f"{path_klebsiella}/pangenome_klebsiella_all.prt","w") as outfile_all :
    for specie in os.listdir(path_klebsiella):
        if specie[0]=="k" and os.path.isdir(f"{path_klebsiella}/{specie}"):
            for strain in os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"):
                if strain in good_strain_set :
                    with open(f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.faa") as faa_handle:
                        # Split on ">" and drop the empty chunk before the first record.
                        faa_file=faa_handle.read().split(">")[1:]
                    records="".join(f">{strain}_{index} {seq}\n" for index, seq in enumerate(faa_file))
                    with open(f"{path_prot_files}/{strain}.prt","w") as outfile :
                        outfile.write(records)
                    outfile_all.write(records)


# **************************************************************************************************************************************************                    
# The pangenome command :

#!/bin/bash
# Slurm job: run the PanACoTA "pangenome" step inside the Singularity image.
# The job-name line must read "#SBATCH"; "#BATCH" is ignored by sbatch.
#SBATCH --job-name=panacota_pangenome_2
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=35
#SBATCH --mem=75gb
#SBATCH --time=7-00:00:00
#SBATCH --output=panacota_v2%j.log

module restore la_base
module load singularity

singularity run /home/conchae/prediction_depolymerase_tropism/panacota.img pangenome  -l /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/panacota_pangenome_list.txt -n Klebsiella_genomes -d /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files -o /home/conchae/prediction_depolymerase_tropism/panacota_pangenome --threads 0 -v


# OR (alternative run on the v2 list; keep only one of the two commands in a submitted script)

singularity run /home/conchae/prediction_depolymerase_tropism/panacota.img pangenome  -l /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/panacota_pangenome_list_v2.txt -n Klebsiella_genomes_v2 -d /home/conchae/prediction_depolymerase_tropism/panacota_pangenome_kp_kqp/protein_files -o /home/conchae/prediction_depolymerase_tropism/panacota_pangenome_kp_kqp --threads 0 -v


***
### 5. Generate a core-genome

In [None]:
# The panacota core/pers step :

# The core/pers command :
#!/bin/bash
# Slurm job: run the PanACoTA "corepers" step on a pan-genome file.
# The job-name line must read "#SBATCH"; "#BATCH" is ignored by sbatch.
#SBATCH --job-name=core_pers
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=35
#SBATCH --mem=75gb
#SBATCH --time=7-00:00:00
#SBATCH --output=core_pers%j.log

module restore la_base
module load singularity

singularity run /home/conchae/prediction_depolymerase_tropism/panacota.img corepers -p /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/PanGenome-Klebsiella_genomes.All.prt-clust-0.8-mode1-th80.lst -t 0.99 -o /home/conchae/prediction_depolymerase_tropism/panacota_core

# Try to use as criteria 95%
# NOTE(review): the command below still passes -t 0.99, not 0.95 -- confirm the intended threshold.
singularity run /home/conchae/prediction_depolymerase_tropism/panacota.img corepers -p /home/conchae/prediction_depolymerase_tropism/panacota_pangenome_kp_kqp/PanGenome-Klebsiella_genomes_v2.1.All.prt-clust-0.8-mode1-th80.lst -t 0.99 -o /home/conchae/prediction_depolymerase_tropism/panacota_core_kp_kqp

*** 
### 6. Align the core genome

> The pre-align step 

In [None]:
# The panacota align step :

# The core genome file generated in the previous step 
# -c /home/conchae/prediction_depolymerase_tropism/panacota_core/PersGenome_PanGenome-Klebsiella_genomes.All.prt-clust-0.8-mode1-th80.lst-all_0.99.lst

# The list of genomes in the dataset :
# -l /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/panacota_pangenome_list.txt

# Name of the datset : 
# -n Klebsiella_genomes 

# path to the folder containing the directories : "Proteins" and "Genes"
# -d /home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes


from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random   
from Bio.Seq import Seq

path_klebsiella="/home/conchae/prediction_depolymerase_tropism"   
path_prot_files="/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
path_data="/home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes"
path_prot="/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
good_strains=open(f"{path_klebsiella}/panacota_pangenome/panacota_pangenome_list.txt").read().split("\n")

def seq_fasta_format(seq, width=61):
    """Wrap a sequence into fixed-width lines.

    Parameters
    ----------
    seq : str
        Either a bare sequence, or a multi-line record whose first line is a
        header; in the latter case the first line is dropped and the remaining
        lines are joined before wrapping.
    width : int, optional
        Number of characters per output line (61, the value the script has
        always used).

    Returns
    -------
    str
        The sequence with a newline inserted every ``width`` characters and no
        trailing newline; an empty string for empty input.
    """
    seq_1=seq
    if "\n" in seq_1:
        # Multi-line input: discard the first (header) line, join the rest.
        seq_1="".join(seq.split("\n")[1:])
    # Wrap on the cleaned sequence length, not on the raw input length.
    return "\n".join(seq_1[i:i+width] for i in range(0,len(seq_1),width))

# Build one nucleotide ".gen" file per selected strain: for each protein record of
# the strain's ".prt" file, look for the Prokka ".ffn" gene whose translation equals
# the protein, and write that gene under the protein's header line.
for specie in os.listdir(path_klebsiella):
    if specie[0]=="k" and os.path.isdir(f"{path_klebsiella}/{specie}")== True:
        # Strains are visited in shuffled order.
        for strain in random.sample(os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"), len(os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"))):
            if strain in good_strains :
                # Skip strains whose gene file already exists.
                if os.path.isfile(f"{path_data}/Genes/{strain}.gen")==False:
                    with open(f"{path_data}/Genes/{strain}.gen","w") as outfile_ffn :
                        # NOTE(review): nothing is ever appended to used_ffn, so the
                        # "not in used_ffn" test below is always true.
                        used_ffn=[]
                        # Records are split on ">"; the empty chunk before the first record is dropped.
                        faa_file=open(f"{path_prot}/{strain}.prt").read().split(">")[1:]
                        ffn_file=open(f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.ffn").read().split(">")[1:]
                        for index_prot, seq_prot in enumerate(faa_file):
                            header=seq_prot.split("\n")[0]
                            fasta_faa="".join(seq_prot.split("\n")[1:])
                            # NOTE(review): func_faa is computed but not used afterwards.
                            func_faa="".join(seq_prot.split("\n")[0].split(" ")[1:])
                            # Search window in the gene list: from the protein's own index up to
                            # (but excluding) that index plus the surplus of gene records.
                            # NOTE(review): when both files hold the same number of records the
                            # window is empty, so fasta_ffn keeps its value from the previous
                            # protein (or is undefined for the first one) -- confirm the bound.
                            index_max=len(ffn_file)-len(faa_file)+index_prot
                            for index_gen in range(index_prot,index_max) :
                                fasta_ffn="".join(ffn_file[index_gen].split("\n")[1:])
                                # Translate and drop the last character (presumably the stop-codon symbol -- TODO confirm).
                                translation=str(Seq(fasta_ffn).translate())[0:-1].strip()
                                header_ffn=ffn_file[index_gen].split("\n")[0]
                                if len(fasta_ffn)%3 ==0 and len(fasta_faa)==len(translation):
                                    if translation==fasta_faa and header_ffn not in used_ffn:
                                        break
                            # NOTE(review): the write below happens whether or not a match was found;
                            # without a match the last gene examined in the window is written.
                            print("We might have it now :\n",strain,"\n",index_prot,"\n")
                            outfile_ffn.write(f">{header}\n{seq_fasta_format(fasta_ffn)}\n")

# Second pass: report every selected strain that still has no gene file.
for specie in os.listdir(path_klebsiella):
    if specie[0]=="k" and os.path.isdir(f"{path_klebsiella}/{specie}")== True:
        for strain in random.sample(os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"), len(os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"))):
            if strain in good_strains :
                if os.path.isfile(f"{path_data}/Genes/{strain}.gen")==False:
                    print("Something wrong with that one :",strain)
                            
# The pre_align commd :

#!/bin/bash
# Slurm job: run the pre-align Python script inside the bio_module virtual environment.
# The job-name line must read "#SBATCH"; "#BATCH" is ignored by sbatch.
#SBATCH --job-name=pre_align
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=40
#SBATCH --mem=75gb
#SBATCH --time=3-00:00:00
#SBATCH --output=pre_align%j.log

module restore la_base
source  /home/conchae/bio_module/bin/activate

python3 /home/conchae/prediction_depolymerase_tropism/script_files/py_file/panacota_pre_align.py

# Follow the command : 
from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random   

path_klebsiella="/home/conchae/prediction_depolymerase_tropism"   
path_prot_files="/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
path_data="/home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes"
path_prot="/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
with open(f"{path_klebsiella}/panacota_pangenome/panacota_pangenome_list.txt") as list_file:
    # Drop empty lines so a trailing newline is not counted as a strain
    # (otherwise the progress can never reach 100 %).
    good_strains=[line for line in list_file.read().split("\n") if line]

# Progress of the pre-align step: files present in Genes/ over selected strains.
n_done= len(os.listdir(f"{path_data}/Genes"))
n_total= len(good_strains)

# Guarded so an empty strain list does not divide by zero.
percent_done=(n_done/n_total)*100 if n_total else 0.0
print("The process has been done at : ",percent_done,"%")
    

# Output directory :
# -o /home/conchae/prediction_depolymerase_tropism/panacota_align

# Generating the right files for the align step ...

from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random
path_klebsiella="/home/conchae/prediction_depolymerase_tropism"   
path_prot_files="/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
path_data="/home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes"
path_prot="/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
good_strains=open(f"{path_klebsiella}/panacota_pangenome/panacota_pangenome_list.txt").read().split("\n")

path_pana_core="/home/conchae/prediction_depolymerase_tropism/panacota_core"
            
# For every strain listed in core_genes.txt (first tab-separated field = strain,
# remaining fields = target protein identifiers), write Genes_5/<strain>.gen with
# the gene record of each target protein, located in the Prokka ".ffn" file
# through the protein's second header token.
with open(f"{path_data}/core_genes.txt") as core_file:
    info_core=core_file.read().split("\n")
for index_core,line in enumerate(info_core):
    # A blank (typically trailing) line has no strain: skipping it avoids
    # creating a stray "Genes_5/.gen" file and then failing on a missing ".prt".
    if not line.strip():
        continue
    strain=line.split("\t")[0]
    strains_target=[target for target in line.split("\t")[1:] if target]
    target_set=set(strains_target)
    test_assay=[]
    gene_path=f"{path_data}/Genes_5/{strain}.gen"
    if os.path.isfile(gene_path):
        continue
    # Find the species folder that holds this strain.
    specie=""
    for species in os.listdir(path_klebsiella):
        if species[0]=="k" and os.path.isdir(f"{path_klebsiella}/{species}/refseq/bacteria/{strain}")== True :
            specie=species
            break
    if not specie:
        print("No species folder found for :", strain)
        continue
    # Read both inputs before creating the output, so a failure here does not
    # leave an empty ".gen" file that would be skipped on the next run.
    with open(f"{path_data}/Proteins/{strain}.prt") as prt_handle:
        faa_file=[faa for faa in prt_handle.read().split(">") if faa]
    with open(f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.ffn") as ffn_handle:
        ffn_file=ffn_handle.read().split(">")[1:]
    with open(gene_path, "w") as outfile :
        for line_faa in faa_file:
            fields=line_faa.split(" ")
            strain_id=fields[0]
            prot_tag=fields[1]
            if strain_id in target_set :
                for line_ffn in ffn_file :
                    # Substring match of the tag anywhere in the gene record.
                    if line_ffn.count(prot_tag)>0 :
                        seq_ffn="\n".join(line_ffn.split("\n")[1:])
                        header=f">{strain_id} {prot_tag}\n{seq_ffn}"
                        test_assay.append(prot_tag)
                        outfile.write(header)
    if len(test_assay)==len(strains_target) :
        print("This strain is ok")
    else :
        print("This strain has gotten :", len(test_assay), "instead of : ",len(strains_target))

# delete the wrong ones :
info_core=open(f"{path_data}/core_genes.txt").read().split("\n")
for index_core,line in enumerate(info_core):
    strain=line.split("\t")[0]
    strains_target=[target for target in line.split("\t")[1:] if target]
    test_assay=[]
    if os.path.isfile(f"{path_data}/Genes_/{strain}.gen")==True:
        file=open(f"{path_data}/Genes_5/{strain}.gen").read().split(">")[1:]
        if len(file) != len(strains_target)/2 :
            print(strain, len(file), len(strains_target))
            system(f"rm {path_data}/Genes_5/{strain}.gen")
        else :
            print("Ok let's go", strain)


> The align step 

In [None]:
# The align command :

#!/bin/bash
# Slurm job: run the PanACoTA "align" step inside the Singularity image.
# The job-name line must read "#SBATCH"; "#BATCH" is ignored by sbatch.
#SBATCH --job-name=align
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=40
#SBATCH --mem=100gb
#SBATCH --time=7-00:00:00
#SBATCH --output=align%j.log

module restore la_base
module load singularity


singularity run /home/conchae/prediction_depolymerase_tropism/panacota.img align  -c /home/conchae/prediction_depolymerase_tropism/panacota_core/PersGenome_PanGenome-Klebsiella_genomes.All.prt-clust-0.8-mode1-th80.lst-all_0.99.lst -l /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/panacota_pangenome_list.txt -n Klebsiella_genomes -d /home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes -o /home/conchae/prediction_depolymerase_tropism/panacota_align --threads 0 -v


# Same command written one option per line, with -P added. The trailing
# backslashes are required: without them each option line runs as its own command.
# Keep only one of the two commands in a submitted script.
singularity run /home/conchae/prediction_depolymerase_tropism/panacota.img align \
-c /home/conchae/prediction_depolymerase_tropism/panacota_core/PersGenome_PanGenome-Klebsiella_genomes.All.prt-clust-0.8-mode1-th80.lst-all_0.99.lst \
-l /home/conchae/prediction_depolymerase_tropism/panacota_pangenome/panacota_pangenome_list.txt \
-n Klebsiella_genomes \
-d /home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes \
-o /home/conchae/prediction_depolymerase_tropism/panacota_align \
--threads 0 \
-v \
-P

# something wrong : 

#log file :

from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random   
from Bio.Seq import Seq

# Locations of the PanACoTA inputs and outputs inspected while debugging the align step.
path_klebsiella = "/home/conchae/prediction_depolymerase_tropism"
path_data = "/home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes"
path_prot = "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
path_align_log = "/home/conchae/prediction_depolymerase_tropism/panacota_align/PanACoTA-align_Klebsiella_genomes.log.err"
path_align = "/home/conchae/prediction_depolymerase_tropism/panacota_align"
path_list_fam = "/home/conchae/prediction_depolymerase_tropism/panacota_core/PersGenome_PanGenome-Klebsiella_genomes.All.prt-clust-0.8-mode1-th80.lst-all_0.99.lst"

# Names of the strains listed in the pangenome list file, one per line.
good_strains = open(f"{path_klebsiella}/panacota_pangenome/panacota_pangenome_list.txt").read().split("\n")

# The error log without its last two lines.
log_file = open(path_align_log).read().split("\n")[0:-2]
print(len(log_file))
for line in log_file:
    if not line:
        continue
    # Tokens 12 and 16 of a log line hold two counts wrapped in one leading and one
    # trailing character; print their sum followed by both raw values.
    tokens = line.split(" ")
    start = tokens[12][1:-1]
    stop = tokens[16][1:-1]
    n = int(start) + int(stop)
    print(n, start, stop)

list_families = open(path_list_fam).read()

# Is the len of the .prt the same as the .gen files ?
# No : 
# For every good strain, check that both its .gen and .prt files exist and that they
# hold the same number of ">"-separated records.
good_strain_set = set(good_strains)
for specie in os.listdir(path_klebsiella):
    if specie[0] != "k" or not os.path.isdir(f"{path_klebsiella}/{specie}"):
        continue
    strains = os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria")
    # random.sample over the full length visits every strain once, in shuffled order.
    for strain in random.sample(strains, len(strains)):
        if strain not in good_strain_set:
            continue
        gen_path = f"{path_data}/Genes/{strain}.gen"
        prt_path = f"{path_data}/Proteins/{strain}.prt"
        has_gen = os.path.isfile(gen_path)
        has_prt = os.path.isfile(prt_path)
        if not has_gen and not has_prt:
            print("Something is DEEPLY wrong with this kid : ",specie, strain)
        elif not has_gen:
            print("Gene file is wrong for him : ",specie, strain)
        elif not has_prt:
            print("Prot file is wrong for him : ",specie, strain)
        if not (has_gen and has_prt):
            # The original fell through and opened the missing file, aborting the whole
            # scan with FileNotFoundError at the first incomplete strain.
            continue
        with open(prt_path) as prt_handle:
            faa_file = prt_handle.read().split(">")
        with open(gen_path) as gen_handle:
            ffn_file = gen_handle.read().split(">")
        if len(ffn_file) != len(faa_file):
            print("Wrong with this one : ",len(ffn_file),len(faa_file) ,specie, strain)

# Map each strain (last space-separated token of a line) to its species (the token before
# it), read from the "wrong_files" report; empty lines are ignored.
wrong_txt = open(f"{path_align}/wrong_files").read().split("\n")
wrong_files = {
    tokens[-1]: tokens[-2]
    for tokens in (line.split(" ") for line in wrong_txt if line)
}
    
# Rebuild the .gen file of every strain whose gene and protein record counts disagree.
# Each protein of the pangenome .prt file is paired with the prokka gene whose translation
# is identical to it; only matched genes are written.
# NOTE: seq_fasta_format is defined in a later cell of this notebook and must already be
# in the kernel when this cell runs.
for strain, specie in wrong_files.items():
    gen_path = f"{path_data}/Genes/{strain}.gen"
    with open(f"{path_data}/Proteins/{strain}.prt") as handle:
        n_prot = len(handle.read().split(">"))
    with open(gen_path) as handle:
        n_gen = len(handle.read().split(">"))
    if n_gen == n_prot:
        print("Seems like ",specie, strain, "is fixed")
        continue
    print("Following strain is wrong :",specie, strain, "\n",n_prot, n_gen)

    # Read both inputs before the output is opened in "w" mode, so a missing input
    # no longer leaves a truncated .gen file behind.
    with open(f"{path_prot}/{strain}.prt") as handle:
        faa_file = handle.read().split(">")[1:]
    with open(f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.ffn") as handle:
        ffn_file = handle.read().split(">")[1:]

    used_ffn = set()
    with open(gen_path, "w") as outfile_ffn:
        for index_prot, seq_prot in enumerate(faa_file):
            prot_lines = seq_prot.split("\n")
            header = prot_lines[0]
            fasta_faa = "".join(prot_lines[1:])
            # The .ffn file holds len(ffn_file)-len(faa_file) extra records, so the gene of
            # protein i sits at index i .. i+extra inclusive (the original range stopped
            # one short and was empty when both files had the same length).
            index_max = min(len(ffn_file), len(ffn_file) - len(faa_file) + index_prot + 1)
            for index_gen in range(index_prot, index_max):
                gene_lines = ffn_file[index_gen].split("\n")
                header_ffn = gene_lines[0]
                fasta_ffn = "".join(gene_lines[1:])
                if len(fasta_ffn) % 3 != 0 or header_ffn in used_ffn:
                    continue
                # Drop the last translated character before comparing with the protein.
                translation = str(Seq(fasta_ffn).translate())[0:-1].strip()
                if translation == fasta_faa:
                    used_ffn.add(header_ffn)
                    outfile_ffn.write(f">{header}\n{seq_fasta_format(fasta_ffn)}\n")
                    print("We might have it now :\n",specie,strain,"\n",index_prot,"\n")
                    break
            else:
                # The original wrote the last gene it had looked at even without a match.
                print("No matching gene found for :", specie, strain, header)

# Rebuild the .gen file of every strain listed in wrong_files, without checking the record
# counts first. Each protein of the pangenome .prt file is paired with the prokka gene
# whose translation is identical to it; only matched genes are written.
# NOTE: seq_fasta_format is defined in a later cell of this notebook and must already be
# in the kernel when this cell runs.
for strain, specie in wrong_files.items():
    # Inputs are read before the output is opened in "w" mode, so a missing input
    # no longer leaves a truncated .gen file behind.
    with open(f"{path_prot}/{strain}.prt") as handle:
        faa_file = handle.read().split(">")[1:]
    with open(f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.ffn") as handle:
        ffn_file = handle.read().split(">")[1:]

    used_ffn = set()
    extra_genes = len(ffn_file) - len(faa_file)
    with open(f"{path_data}/Genes/{strain}.gen", "w") as outfile_ffn:
        for index_prot, seq_prot in enumerate(faa_file):
            prot_lines = seq_prot.split("\n")
            header = prot_lines[0]
            fasta_faa = "".join(prot_lines[1:])
            # Candidate genes are index_prot .. index_prot+extra_genes inclusive; the
            # original range excluded the upper bound.
            index_max = min(len(ffn_file), index_prot + extra_genes + 1)
            matched = False
            for index_gen in range(index_prot, index_max):
                gene_lines = ffn_file[index_gen].split("\n")
                header_ffn = gene_lines[0]
                fasta_ffn = "".join(gene_lines[1:])
                if len(fasta_ffn) % 3 != 0 or header_ffn in used_ffn:
                    continue
                # Drop the last translated character before comparing with the protein.
                translation = str(Seq(fasta_ffn).translate())[0:-1].strip()
                if translation == fasta_faa:
                    matched = True
                    used_ffn.add(header_ffn)
                    outfile_ffn.write(f">{header}\n{seq_fasta_format(fasta_ffn)}\n")
                    break
            if matched:
                print("We might have it now :\n",specie,strain,"\n",index_prot,"\n")
            else:
                # The original wrote the last gene it had looked at even without a match.
                print("No matching gene found for :", specie, strain, header)
# Are all the genomes in the good_strains have their file in the data folder ?
# Yes
'''for genome in good_strains:
    if os.path.isfile(f"{path_data}/Genes/{genome}.gen")== False or os.path.isfile(f"{path_data}/Proteins/{genome}.prt")== False:
        print(genome)'''


# The proteins from the same family do not have the same lengths

from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random   
from Bio.Seq import Seq

path_klebsiella = "/home/conchae/prediction_depolymerase_tropism"
path_data = "/home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes"
path_prot = "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
path_align_log = "/home/conchae/prediction_depolymerase_tropism/panacota_align/PanACoTA-align_Klebsiella_genomes.log.err"
path_align = "/home/conchae/prediction_depolymerase_tropism/panacota_align"
path_list_fam = "/home/conchae/prediction_depolymerase_tropism/panacota_core/PersGenome_PanGenome-Klebsiella_genomes.All.prt-clust-0.8-mode1-th80.lst-all_0.99.lst"

# For each strain, translate every gene of its .gen file and compare the result with the
# protein carrying the same header in its .prt file; mismatching headers are collected
# per strain in wrong_genes.
wrong_genes = {}
for file in os.listdir(f"{path_data}/Genes"):
    strain = file.split(".gen")[0]
    wrong_genes[strain] = []
    faa_file = open(f"{path_data}/Proteins/{strain}.prt").read().split(">")[1:]
    ffn_file = open(f"{path_data}/Genes/{strain}.gen").read().split(">")[1:]
    for seq_gen in ffn_file:
        gene_lines = seq_gen.split("\n")
        header_seq = gene_lines[0]
        for seq_prot in faa_file:
            prot_lines = seq_prot.split("\n")
            if prot_lines[0] != header_seq:
                continue
            fasta_ffn = "".join(gene_lines[1:])
            fasta_faa = "".join(prot_lines[1:])
            # The last translated character is dropped before the comparison.
            translation = str(Seq(fasta_ffn).translate())[0:-1].strip()
            if translation != fasta_faa:
                wrong_genes[strain].append(header_seq)
                print("That one is wrong :", strain , header_seq ,"\n", fasta_faa , "\n",translation)
                        
                    
# *********************************************************************************************
# It's all wrong, let's revise that  :

from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random   
from Bio.Seq import Seq

# Paths used by the revised gene-file reconstruction below.
path_klebsiella = "/home/conchae/prediction_depolymerase_tropism"
# path_prot_files and path_prot hold the same directory.
path_prot_files = "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
path_data = "/home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes"
path_prot = "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/protein_files"
# Strain names listed in the pangenome list file, one per line.
pangenome_list = f"{path_klebsiella}/panacota_pangenome/panacota_pangenome_list.txt"
good_strains = open(pangenome_list).read().split("\n")

def seq_fasta_format(seq, width=61):
    """Wrap a sequence into fixed-width lines for writing in a FASTA record.

    Parameters
    ----------
    seq : str
        Either a bare sequence, or a multi-line record whose first line is a
        header. When `seq` contains a newline, the first line is dropped and the
        remaining lines are joined before wrapping.
    width : int, optional
        Number of characters per output line. Defaults to 61, the width this
        notebook has always used.

    Returns
    -------
    str
        The sequence split into lines of at most `width` characters joined by
        newlines, without a trailing newline. An empty input gives an empty string.
    """
    if "\n" in seq:
        # The original read an undefined name (`seq_file`) here and raised NameError.
        seq = "".join(seq.split("\n")[1:])
    # Chunk over the cleaned sequence; the original sized the range from the raw input.
    return "\n".join(seq[start:start + width] for start in range(0, len(seq), width))

# Revised attempt at rebuilding the gene files of the strains in wrong_files: proteins are
# matched to prokka genes by translation and the matches are written to Genes_2/.
# wrong_files (strain -> specie) is not defined in this cell; it has to exist in the kernel.
for strain in wrong_files:
    specie=wrong_files[strain]
    faa_file=open(f"{path_data}/Proteins/{strain}.prt").read().split(">")
    ffn_file=open(f"{path_data}/Genes/{strain}.gen").read().split(">")
    if len(ffn_file) != len(faa_file) :
        print("Following strain is wrong :",specie, strain, "\n",len(faa_file), len(ffn_file))
        # NOTE(review): opening Genes/{strain}.gen in "w" mode empties it, and nothing
        # below writes to this handle because the inner `with` rebinds outfile_ffn.
        with open(f"{path_data}/Genes/{strain}.gen","w") as outfile_ffn :
            used_ffn=[]
            faa_file=open(f"{path_prot}/{strain}.prt").read().split(">")[1:]
            ffn_file=open(f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.ffn").read().split(">")[1:]
            # NOTE(review): this outer loop runs once per protein, and each pass reopens
            # Genes_2/{strain}.gen in "w" mode and redoes the full matching below, so the
            # same output file is rewritten len(faa_file) times.
            for index_prot, seq_prot in enumerate(faa_file):                   # Creating the .gen file :
                with open(f"{path_data}/Genes_2/{strain}.gen","w") as outfile_ffn :
                    used_ffn=[]
                    # Rebinds faa_file/ffn_file; the outer loop keeps iterating the list
                    # it started with.
                    faa_file=open(f"{path_data}/Proteins/{strain}.prt").read().split(">")[1:]
                    ffn_file=open(f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.ffn").read().split(">")[1:]
                    # Going through the protein file
                    for index_prot, seq_prot in enumerate(faa_file):
                        header=seq_prot.split("\n")[0]
                        fasta_faa="".join(seq_prot.split("\n")[1:])
                        func_faa="".join(seq_prot.split("\n")[0].split(" ")[1:])
                        index_max=len(ffn_file)-len(faa_file)+index_prot
                        # NOTE(review): negative for the first ten proteins, in which case
                        # ffn_file[index_gen] indexes from the end of the list.
                        index_min=index_prot-10
                        for index_gen in range(index_min,index_max) :
                            fasta_ffn="".join(ffn_file[index_gen].split("\n")[1:])
                            # The last translated character is dropped before comparing.
                            translation=str(Seq(fasta_ffn).translate())[0:-1].strip()
                            header_ffn=ffn_file[index_gen].split("\n")[0]
                            if len(fasta_ffn)%3 ==0 and len(fasta_faa)==len(translation):
                                if translation==fasta_faa and header_ffn not in used_ffn:
                                    print(fasta_faa, "\n", translation , "\n\n", fasta_ffn, "\n\n\n\n")
                                    # Remember the gene so it cannot be paired twice.
                                    used_ffn.append(header_ffn)
                                    outfile_ffn.write(f">{header}\n{seq_fasta_format(fasta_ffn)}\n")
                                    break
                                else :
                                    print("Nothing to see here.")
                        print("We might have it now :\n",strain,"\n",index_prot,"\n")

# Create the missing Genes/{strain}.gen file of every good strain: each protein of the
# strain's .prt file is paired with the prokka gene whose translation is identical to it,
# and the outcome for the strain is appended to gen_files_control.tsv.
with open(f"{path_data}/gen_files_control.tsv","a+") as outfile_c :
    for specie in os.listdir(path_klebsiella):
        if specie[0] != "k" or not os.path.isdir(f"{path_klebsiella}/{specie}"):
            continue
        strains = os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria")
        # random.sample over the full length visits every strain once, in shuffled order.
        for strain in random.sample(strains, len(strains)):
            gen_path = f"{path_data}/Genes/{strain}.gen"
            if strain not in good_strains or os.path.isfile(gen_path):
                continue

            # Read the inputs before creating the output: an empty .gen left behind by
            # a failed read would make this cell skip the strain on the next run.
            with open(f"{path_data}/Proteins/{strain}.prt") as handle:
                faa_file = handle.read().split(">")[1:]
            with open(f"{path_klebsiella}/{specie}/refseq/bacteria/{strain}/prokka_annotation_all/{strain}.ffn") as handle:
                ffn_file = handle.read().split(">")[1:]

            # Translate every gene once instead of once per candidate protein.
            gene_records = []
            for gene in ffn_file:
                gene_lines = gene.split("\n")
                fasta_ffn = "".join(gene_lines[1:])
                translation = None
                if len(fasta_ffn) % 3 == 0:
                    # The last translated character is dropped before comparing.
                    translation = str(Seq(fasta_ffn).translate())[0:-1].strip()
                gene_records.append((gene_lines[0], fasta_ffn, translation))

            used_ffn = set()
            all_headers_faa = []
            used_headers_faa = set()
            with open(gen_path, "w") as outfile_ffn:
                # Going through the protein file
                for index_prot, seq_prot in enumerate(faa_file):
                    prot_lines = seq_prot.split("\n")
                    header = prot_lines[0]
                    fasta_faa = "".join(prot_lines[1:]).strip()
                    all_headers_faa.append(header)
                    # Start up to 250 records before the protein's own rank, clamped at 0
                    # (a negative start used to wrap to the end of the list or raise
                    # IndexError), and search up to the end of the gene list, which is the
                    # list actually being indexed.
                    index_min = max(0, index_prot - 250)
                    for index_gen in range(index_min, len(gene_records)):
                        header_ffn, fasta_ffn, translation = gene_records[index_gen]
                        if translation == fasta_faa and header_ffn not in used_ffn:
                            used_ffn.add(header_ffn)
                            used_headers_faa.add(header)
                            outfile_ffn.write(f">{header}\n{seq_fasta_format(fasta_ffn)}\n")
                            break

            if len(used_headers_faa) == len(all_headers_faa):
                # file.write takes a single string; the original passed three arguments.
                outfile_c.write(f"Seems like this one is good to go : {specie} {strain}\n")
            else:
                outfile_c.write(f"Possible trouble with the following : {specie} {strain},. Len faa file : {len(faa_file)}. And len of the .gen file :,{len(used_headers_faa)}\n")
                print("Possible trouble with the following :",specie, strain,". Len faa file :",len(faa_file), ", And len of the .gen file :",len(used_headers_faa))
                for test_header in all_headers_faa:
                    if test_header not in used_headers_faa:
                        print(test_header)

                            
# Fixing the last ones :

# Second pass of the translation check: for each strain, report every gene whose
# translation differs from the protein with the same header.
wrong_genes = {}
for file in os.listdir(f"{path_data}/Genes"):
    strain = file.split(".gen")[0]
    wrong_genes[strain] = []
    faa_file = open(f"{path_data}/Proteins/{strain}.prt").read().split(">")[1:]
    ffn_file = open(f"{path_data}/Genes/{strain}.gen").read().split(">")[1:]
    for seq_gen in ffn_file:
        header_seq, *gene_body = seq_gen.split("\n")
        # Proteins sharing the gene's header, in file order.
        same_header = [seq_prot for seq_prot in faa_file if seq_prot.split("\n")[0] == header_seq]
        for seq_prot in same_header:
            fasta_ffn = "".join(gene_body)
            fasta_faa = "".join(seq_prot.split("\n")[1:])
            # The last translated character is dropped before the comparison.
            translation = str(Seq(fasta_ffn).translate())[0:-1].strip()
            if translation != fasta_faa:
                wrong_genes[strain].append(header_seq)
                print("That one is wrong :", strain , header_seq ,"\n", fasta_faa , "\n",translation)
    print("Strain :",strain, "finshed")
                        
# Looser variant of the check: only the lengths of the translated gene and of the protein
# with the same header are compared.
wrong_genes = {}
for file in os.listdir(f"{path_data}/Genes"):
    strain = file.split(".gen")[0]
    flagged = []
    wrong_genes[strain] = flagged
    faa_file = open(f"{path_data}/Proteins/{strain}.prt").read().split(">")[1:]
    ffn_file = open(f"{path_data}/Genes/{strain}.gen").read().split(">")[1:]
    for seq_gen in ffn_file:
        gene_lines = seq_gen.split("\n")
        header_seq = gene_lines[0]
        for seq_prot in faa_file:
            prot_lines = seq_prot.split("\n")
            if prot_lines[0] != header_seq:
                continue
            fasta_ffn = "".join(gene_lines[1:])
            fasta_faa = "".join(prot_lines[1:])
            # The last translated character is dropped before the comparison.
            translation = str(Seq(fasta_ffn).translate())[0:-1].strip()
            same_length = len(translation) == len(fasta_faa)
            if same_length:
                print("Might work")
            else:
                flagged.append(header_seq)
                print("That one is wrong :", strain , header_seq ,"\n",fasta_faa ,"\n",translation)
    print("Strain :",strain, "finshed")

***
### 7. Infer the phylogenetic tree from the previous alignment
The tree has been computed directly with iqtree

In [None]:
# Local tree : 
# Scratch notes: two hand-written iqtree invocations on the Klebsiella_genomes.nucl.grp.aln
# alignment kept in the iqtree_local directory.
path_local="/home/conchae/prediction_depolymerase_tropism/iqtree_local"


# NOTE(review): "iqtree" and its options sit on separate lines with no trailing backslash,
# and -B is given without a replicate count here (the other invocations in this notebook
# pass "-B 1000").
iqtree 
-s /home/conchae/prediction_depolymerase_tropism/iqtree_local/Klebsiella_genomes.nucl.grp.aln -m MFP  --prefix Klensiella_genomes_MFP -alrt 1000  -T AUTO  -B -v

# Same alignment with the model fixed to GTR+F+I, written one option per line.
# NOTE(review): the prefix is still "Klensiella_genomes_MFP" although the model is no longer
# MFP, and the lines are again not joined by backslashes.
iqtree 
-s /home/conchae/prediction_depolymerase_tropism/iqtree_local/Klebsiella_genomes.nucl.grp.aln
-m GTR+F+I
--prefix Klensiella_genomes_MFP
-T AUTO 
-B 1000 
-alrt 1000
#!/bin/bash
#SBATCH --job-name=MFP_local_tree
#SBATCH --partition=long 
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=60
#SBATCH --mem=200gb 
#SBATCH --time=10-00:00:00 
#SBATCH --output=MFP_local_tree%j.log 

# SLURM job building the tree with iqtree, using "-m MFP" and "-B 1000".
# The job-name directive used to read "#BATCH", which SLURM treats as a plain comment.
module restore la_base
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate Treebuilding

iqtree -s /home/conchae/prediction_depolymerase_tropism/iqtree_local/Klebsiella_genomes.nucl.grp.aln -m MFP  --prefix Klensiella_genomes_MFP -T AUTO  -B 1000 

#!/bin/bash
#SBATCH --job-name=Fixed_tree
#SBATCH --partition=long 
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=60
#SBATCH --mem=200gb 
#SBATCH --time=10-00:00:00 
#SBATCH --output=Fixed_Tree%j.log 

# SLURM job building the tree with iqtree under the fixed model GTR+F+I.
# The job-name directive used to read "#BATCH", which SLURM treats as a plain comment.
module restore la_base
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate Treebuilding

# The options used to sit on a separate line with no continuation, so the shell ran a
# bare "iqtree" and then tried to execute "-s" as a command.
# NOTE(review): "-t" is followed by both the keyword BIONJ and a .bionj file path, and the
# thread option is spelled "-nt" here but "-T" in the other iqtree calls of this notebook.
# Confirm which starting tree and which option spelling are intended.
iqtree \
    -s /home/conchae/prediction_depolymerase_tropism/iqtree_local/Klebsiella_genomes.nucl.grp.aln  -m GTR+F+I  --prefix Klensiella_genomes_fixed  -B 1000  -alrt 1000 -t BIONJ /home/conchae/prediction_depolymerase_tropism/iqtree_local/script_files/Klensiella_genomes_fixed.bionj -nt AUTO

> Writing the tips file for the tree

In [None]:
# Writing the tips file :

from os import system, listdir, chdir, mkdir
from os.path import isdir
import os
import random

path_klebsiella="/home/conchae/prediction_depolymerase_tropism"   

# Write one "<genome>,<best match locus>" line per genome whose Kaptive match confidence
# is neither "None" nor "Low". The Kaptive output is read as a two-line table:
# tab-separated column names on the first line, the genome's values on the second.
with open(f"{path_klebsiella}/pastML/pastML_KL_tips.comma.txt", "w") as outfile :
    for specie in os.listdir(path_klebsiella):
        if specie[0] != "k" or not os.path.isdir(f"{path_klebsiella}/{specie}"):
            continue
        for rep in os.listdir(f"{path_klebsiella}/{specie}/refseq/bacteria"):
            kaptive_path = f"{path_klebsiella}/{specie}/refseq/bacteria/{rep}/{rep}_Kaptive_out.txt"
            # A genome without a usable Kaptive output is reported and skipped, so one bad
            # entry no longer aborts the run after the tips file has been truncated.
            if not os.path.isfile(kaptive_path):
                print("No Kaptive output for :", specie, rep)
                continue
            with open(kaptive_path) as kaptive_handle:
                kaptive_lines = kaptive_handle.read().split("\n")
            if len(kaptive_lines) < 2:
                print("Incomplete Kaptive output for :", specie, rep)
                continue
            kaptive_dic = dict(zip(kaptive_lines[0].split("\t"), kaptive_lines[1].split("\t")))
            confidence = kaptive_dic.get("Match confidence")
            locus = kaptive_dic.get("Best match locus")
            if confidence is None or locus is None:
                print("Incomplete Kaptive output for :", specie, rep)
                continue
            if confidence not in ("None", "Low"):
                outfile.write(f"{rep},{locus}\n")

> Check how well supported the tree is

In [None]:
# Check how well the tree is supported :


from os import system, listdir, chdir, mkdir
from os.path import isdir
import os

path_tree = "/home/conchae/prediction_depolymerase_tropism/iqtree_local/tree_files"

# Text found right after each ")" of the tree file, up to the next ":".
newick = open(f"{path_tree}/Klensiella_genomes_fixed.2.1.treefile").read()
tree = []
for value in newick.split(")"):
    tree.append(value.split(":")[0])

# Labels containing "/" are read as "<first value>/<second value>"; labels whose first
# value equals 0.0 are left out of both lists, so the two lists stay aligned.
sh_alrt = []
bootstrap = []
for num in tree:
    if num.count("/") == 0:
        continue
    supports = num.split("/")
    first_value = float(supports[0])
    if first_value == 0.0:
        continue
    sh_alrt.append(first_value)
    bootstrap.append(float(supports[1]))

def Average(lst):
    """Return the arithmetic mean of the numbers in `lst`.

    Raises ZeroDivisionError when `lst` is empty.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

# Put the bootstrap supports collected above into a one-column numeric DataFrame.
# Fixes: pandas is imported here because this cell never imported it, the class is
# spelled DataFrame, the list built above is named `bootstrap` (not `bootstrap_values`),
# and the private `_convert(numeric=True)` call is replaced by an explicit float dtype.
import pandas as pd

df = pd.DataFrame(bootstrap, dtype=float)