# Build the phylogenetic tree
***
## Run the first steps of the PanACoTA pipeline:
### I pangenome
### II corepers
### Prepare the files for the alignment
### III align

## Build the phylogenetic tree with IQ-TREE

***

In [None]:
import subprocess
import os

def run_pangenome():
    """Launch the ``pangenome`` step of the PanACoTA image through Singularity.

    The captured standard output is printed when the command succeeds.
    A non-zero exit status prints the captured standard error instead, and
    any other exception is reported as an unexpected error. Nothing is
    returned and no exception is propagated to the caller.
    """
    # The argument vector is fixed, so it is assembled before entering the
    # guarded section; only the process launch itself can fail.
    pangenome_cmd = [
        "singularity",
        "run",
        "/home/conchae/prediction_depolymerase_tropism/panacota.img",
        "pangenome",
        "-l",
        "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/panacota_pangenome_list_v2.txt",
        "-n",
        "Klebsiella_genomes_v2",
        "-d",
        "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome_kp_kqp/protein_files",
        "-o",
        "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome_kp_kqp",
        "--threads",
        "0",
        "-v",
    ]

    try:
        completed = subprocess.run(
            pangenome_cmd,
            check=True,  # turn a non-zero exit status into CalledProcessError
            capture_output=True,
            text=True,
        )
        print(f"Pangenome output:\n{completed.stdout}")
    except subprocess.CalledProcessError as failure:
        print(f"Error running pangenome: {failure.stderr}")
    except Exception as unexpected:
        print(f"An unexpected error occurred: {unexpected}")
        

def run_coregenome():
    """Launch the ``corepers`` step of the PanACoTA image through Singularity.

    The captured standard output is printed when the command succeeds.
    A non-zero exit status prints the captured standard error instead, and
    any other exception is reported as an unexpected error. Nothing is
    returned and no exception is propagated to the caller.
    """
    # The argument vector is fixed, so it is assembled before entering the
    # guarded section; only the process launch itself can fail.
    corepers_cmd = [
        "singularity",
        "run",
        "/home/conchae/prediction_depolymerase_tropism/panacota.img",
        "corepers",
        "-p",
        "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/PanGenome-Klebsiella_genomes.All.prt-clust-0.8-mode1-th80.lst",
        "-t",
        "0.99",
        "-o",
        "/home/conchae/prediction_depolymerase_tropism/panacota_core",
    ]

    try:
        completed = subprocess.run(
            corepers_cmd,
            check=True,  # turn a non-zero exit status into CalledProcessError
            capture_output=True,
            text=True,
        )
        print(f"Coregenome output:\n{completed.stdout}")
    except subprocess.CalledProcessError as failure:
        print(f"Error running coregenome: {failure.stderr}")
    except Exception as unexpected:
        print(f"An unexpected error occurred: {unexpected}")

# Entry point: run the two steps in order, pangenome first, then coregenome.
# Each step prints its own errors instead of raising, so the second step is
# attempted even when the first one reports a failure.
if __name__ == "__main__":
    for pipeline_step in (run_pangenome, run_coregenome):
        pipeline_step()

In [None]:
import os
import random
from Bio.Seq import Seq

# Define paths
path_klebsiella = "/home/conchae/prediction_depolymerase_tropism"
path_prot_files = os.path.join(path_klebsiella, "panacota_pangenome/protein_files")
path_data = os.path.join(path_klebsiella, "panacota_align/data_prot_genes")

# Lines of the pangenome list file; each line is later compared against strain
# directory names. The file is read inside a context manager so the handle is
# closed right away instead of being left to the garbage collector.
with open(os.path.join(path_klebsiella, "panacota_pangenome/panacota_pangenome_list.txt")) as strain_list_file:
    good_strains = strain_list_file.read().split("\n")


# Utility function to format sequences in FASTA format
def seq_fasta_format(seq):
    """Wrap a sequence into newline-separated lines of at most 61 characters.

    If ``seq`` contains a newline, its first line is treated as a header and
    discarded, and the remaining lines are concatenated before wrapping.
    Returns the wrapped text without a trailing newline (an empty string for
    an empty sequence).
    """
    line_width = 61

    if "\n" in seq:
        # Drop the first line and glue the rest back into one string.
        _, *body_lines = seq.split("\n")
        seq = "".join(body_lines)

    wrapped_lines = []
    for start in range(0, len(seq), line_width):
        wrapped_lines.append(seq[start:start + line_width])
    return "\n".join(wrapped_lines)


# Processing function to match proteins and genes
def process_strain(strain, specie, faa_file, ffn_file):
    """Match protein sequences with gene sequences, and save the results in a .gen file.

    For every protein record, the first not-yet-used gene whose translation is
    identical to the protein sequence is written to
    ``<path_data>/Genes/<strain>.gen`` under the protein's header line.
    Proteins without a matching gene are skipped. If the output file already
    exists, nothing is done.

    Args:
        strain: Strain name, used for the output file name and the log line.
        specie: Species directory name (not used in the body; kept for callers).
        faa_file: List of protein FASTA records, each "header\\nsequence lines"
            without the leading ">".
        ffn_file: List of gene FASTA records in the same layout.
    """
    gen_filepath = os.path.join(path_data, "Genes", f"{strain}.gen")

    # Check if the output file already exists
    if os.path.isfile(gen_filepath):
        return

    # The output file is opened (and thereby created) before the matching
    # work starts, exactly as before.
    with open(gen_filepath, "w") as outfile_ffn:
        # Translate every gene once and index the genes by their translation.
        # Previously each gene was re-translated for every protein.
        # Only genes whose length is a multiple of three can ever match.
        # NOTE(review): Seq.translate() keeps a trailing "*" for a terminal
        # stop codon; confirm the protein records also end with "*",
        # otherwise genes ending in a stop codon can never match.
        genes_by_translation = {}
        for seq_gen in ffn_file:
            gene_header, *gene_lines = seq_gen.split("\n")
            fasta_ffn = "".join(gene_lines)
            if len(fasta_ffn) % 3 != 0:
                continue
            translation = str(Seq(fasta_ffn).translate()).strip()
            # Lists keep the original file order, so the first unused match
            # is the same gene the nested scan would have picked.
            genes_by_translation.setdefault(translation, []).append((gene_header, fasta_ffn))

        used_ffn = set()  # Headers of genes already written to the output
        for seq_prot in faa_file:
            header, *prot_lines = seq_prot.split("\n")
            fasta_faa = "".join(prot_lines).strip()

            for gene_header, fasta_ffn in genes_by_translation.get(fasta_faa, ()):
                if gene_header not in used_ffn:
                    used_ffn.add(gene_header)
                    outfile_ffn.write(f">{header}\n{seq_fasta_format(fasta_ffn)}\n")
                    break  # One gene per protein
        print(f"Finished processing strain: {strain}")
            

# Function to process all species and strains
def process_species_and_strains():
    """Iterate through species and strains, processing each strain.

    Every directory under ``path_klebsiella`` whose name starts with "k" is
    treated as a species. Its ``refseq/bacteria`` sub-directory is listed and
    the entries are visited in random order. For each entry that appears in
    ``good_strains``, the protein (.prt) and gene (.ffn) files are read and
    passed to ``process_strain``; if either file is missing, a message is
    printed and the strain is skipped.
    """
    # Build the lookup set once: membership is tested for every strain.
    wanted_strains = set(good_strains)

    for specie in os.listdir(path_klebsiella):
        specie_dir = os.path.join(path_klebsiella, specie)
        if not (specie.startswith("k") and os.path.isdir(specie_dir)):
            continue

        strains_dir = os.path.join(specie_dir, "refseq/bacteria")
        strain_list = os.listdir(strains_dir)
        # random.sample over the full length yields a shuffled copy.
        # NOTE(review): presumably so that several concurrent runs start on
        # different strains; confirm with the author.
        for strain in random.sample(strain_list, len(strain_list)):
            if strain not in wanted_strains:
                continue

            faa_file_path = os.path.join(path_prot_files, f"{strain}.prt")
            ffn_file_path = os.path.join(strains_dir, strain, "prokka_annotation_all", f"{strain}.ffn")

            # Check if both protein and gene files exist
            if not (os.path.isfile(faa_file_path) and os.path.isfile(ffn_file_path)):
                print(f"Missing files for strain: {strain}")
                continue

            # Context managers close both handles immediately; the previous
            # open(...).read() leaked two file descriptors per strain.
            # Splitting on ">" and dropping the first piece yields one
            # "header\nsequence" record per FASTA entry.
            with open(faa_file_path) as faa_handle:
                faa_file = faa_handle.read().split(">")[1:]
            with open(ffn_file_path) as ffn_handle:
                ffn_file = ffn_handle.read().split(">")[1:]

            process_strain(strain, specie, faa_file, ffn_file)


# Entry point: executed only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    process_species_and_strains()  # Walk every species directory and write the .gen files



In [None]:
import subprocess
import os

def run_alignment():
    """Launch the ``align`` step of the PanACoTA image through Singularity.

    The captured standard output is printed when the command succeeds.
    A non-zero exit status prints the captured standard error instead, and
    any other exception is reported as an unexpected error. Nothing is
    returned and no exception is propagated to the caller.
    """
    # The argument vector is fixed, so it is assembled before entering the
    # guarded section; only the process launch itself can fail.
    align_cmd = [
        "singularity",
        "run",
        "/home/conchae/prediction_depolymerase_tropism/panacota.img",
        "align",
        "-c",
        "/home/conchae/prediction_depolymerase_tropism/panacota_core/PersGenome_PanGenome-Klebsiella_genomes.All.prt-clust-0.8-mode1-th80.lst-all_0.99.lst",
        "-l",
        "/home/conchae/prediction_depolymerase_tropism/panacota_pangenome/panacota_pangenome_list.txt",
        "-n",
        "Klebsiella_genomes",
        "-d",
        "/home/conchae/prediction_depolymerase_tropism/panacota_align/data_prot_genes",
        "-o",
        "/home/conchae/prediction_depolymerase_tropism/panacota_align",
        "--threads",
        "0",
        "-v",
        "-P",
    ]

    try:
        completed = subprocess.run(
            align_cmd,
            check=True,  # turn a non-zero exit status into CalledProcessError
            capture_output=True,
            text=True,
        )
        print(f"Alignment output:\n{completed.stdout}")
    except subprocess.CalledProcessError as failure:
        print(f"Error running alignment: {failure.stderr}")
    except Exception as unexpected:
        print(f"An unexpected error occurred: {unexpected}")

# Entry point: executed only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    run_alignment()  # Launch the alignment step; errors are printed, not raised

In [None]:
def run_iqtree():
    """Launch ``iqtree`` on the nucleotide alignment to build the tree.

    The captured standard output is printed when the command succeeds.
    A non-zero exit status prints the captured standard error instead, and
    any other exception is reported as an unexpected error. Nothing is
    returned and no exception is propagated to the caller.
    """
    # The argument vector is fixed, so it is assembled before entering the
    # guarded section; only the process launch itself can fail.
    iqtree_cmd = [
        "iqtree",
        "-s",
        "/home/conchae/prediction_depolymerase_tropism/iqtree_local/Klebsiella_genomes.nucl.grp.aln",
        "-m",
        "GTR+F+I",
        "--prefix",
        "Klensiella_genomes_fixed",
        "-B",
        "1000",
        "-alrt",
        "1000",
        # NOTE(review): "-t" is followed by both the keyword "BIONJ" and a
        # .bionj file path, so the path reaches iqtree as a separate
        # positional argument. Confirm which starting tree was intended.
        "-t",
        "BIONJ",
        "/home/conchae/prediction_depolymerase_tropism/iqtree_local/script_files/Klensiella_genomes_fixed.bionj",
        "-nt",
        "AUTO",
    ]

    try:
        completed = subprocess.run(
            iqtree_cmd,
            check=True,  # turn a non-zero exit status into CalledProcessError
            capture_output=True,
            text=True,
        )
        print(f"IQ-TREE output:\n{completed.stdout}")
    except subprocess.CalledProcessError as failure:
        print(f"Error running IQ-TREE: {failure.stderr}")
    except Exception as unexpected:
        print(f"An unexpected error occurred: {unexpected}")

# Entry point: executed only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    run_iqtree()  # Launch IQ-TREE; errors are printed, not raised