# Collecting and processing the Klebsiella genomic data
***
### (I) Collecting the Klebsiella genome assemblies (run_singularity)
### (II) Running Kleborate on fasta (run_kleborate_on_fasta)
### (III) Gene annotation of the correctly annotated genomes (run_prokka_annotation)
***

In [None]:
import subprocess
import os 
import random

def run_singularity(img_path: str, t_ncbi: int, output_path: str):
    """
    Launch the ``prepare`` step of a singularity image for one NCBI taxid.

    The command executed is
    ``singularity run <img_path> prepare -T <t_ncbi> -o <output_path>``.

    Parameters:
    img_path : Path to the singularity image (e.g. /panacota.img).
    t_ncbi : NCBI taxid of the taxon to download; passed to ``-T``.
    output_path : Output directory, passed to ``-o``. According to the
    original notes it receives a refseq/bacteria folder holding one folder
    per assembly, each with the assembly sequence in fasta.gz format and
    the MD5SUMS of this file.

    Returns:
    None: the command's stdout is printed on success; if the command exits
    with a non-zero status its captured stderr is printed instead.
    """
    argv = ["singularity", "run", img_path]
    argv += ["prepare", "-T", str(t_ncbi)]
    argv += ["-o", output_path]

    try:
        completed = subprocess.run(argv, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as failure:
        # Non-zero exit status: report what the tool wrote to stderr.
        print(f"Error occurred while running the command: {failure.stderr}")
    else:
        print(completed.stdout)
        

def run_kleborate_on_fasta(path_klebsiella: str):
    """
    Runs Kleborate (with Kaptive K-locus typing) on the .fna files of every species folder.

    For each entry <specie> of path_klebsiella, every file ending in ".fna"
    inside <specie>/Database_init is processed. The assembly identifier
    ("rep") is the first two underscore-separated fields of the file name,
    and the Kaptive table is requested at
    <specie>/refseq/bacteria/<rep>/<rep>_Kaptive_out.txt.
    Assemblies whose Kaptive table already exists are skipped, so the
    function can be re-run to resume an interrupted batch.

    NOTE(review): no main output file option is passed to Kleborate, so its
    main results table goes wherever Kleborate writes it by default —
    confirm that this is intended.

    Parameters:
    path_klebsiella (str): The base path containing species directories with FASTA files.

    Returns:
    None: Outputs are saved in their respective output directories. Errors
    are printed, never raised.
    """
    try:
        for specie in os.listdir(path_klebsiella):
            specie_path = os.path.join(path_klebsiella, specie)
            init_path = os.path.join(specie_path, "Database_init")

            # isdir rather than exists: a stray *file* called Database_init
            # would otherwise make os.listdir fail and abort every species.
            if not os.path.isdir(init_path):
                print(f"Directory {init_path} does not exist. Skipping...")
                continue

            for fasta in os.listdir(init_path):
                if not fasta.endswith(".fna"):
                    continue

                rep = "_".join(fasta.split("_")[0:2])
                path_out = os.path.join(specie_path, "refseq", "bacteria", rep)
                kaptive_out_file = os.path.join(path_out, f"{rep}_Kaptive_out.txt")

                if os.path.isfile(kaptive_out_file):
                    print(f"Output file {kaptive_out_file} already exists. Skipping...")
                    continue

                # The per-assembly folder may not exist yet (e.g. an assembly
                # present only in Database_init); create it so the Kaptive
                # table has somewhere to be written.
                os.makedirs(path_out, exist_ok=True)

                command = [
                    "kleborate", "--kaptive_k",
                    "--kaptive_k_outfile", kaptive_out_file,
                    "-a", os.path.join(init_path, fasta),
                ]
                try:
                    subprocess.run(command, check=True, capture_output=True, text=True)
                except subprocess.CalledProcessError as e:
                    # One failing assembly must not stop the rest of the batch.
                    print(f"Error while running kleborate on {fasta}: {e.stderr}")
                else:
                    print(f"Processed: {fasta}")
    except Exception as e:
        # Deliberately broad: callers rely on this step reporting problems
        # (missing executable, unreadable folder, ...) instead of raising.
        print(f"An error occurred: {e}")

        
def _read_kaptive_record(kaptive_out_file: str):
    """Return the first data row of a Kaptive table as {column: value}, or None if there is no data row."""
    with open(kaptive_out_file, "r") as kaptive_file:
        rows = kaptive_file.read().splitlines()
    if len(rows) < 2:
        return None
    return dict(zip(rows[0].split("\t"), rows[1].split("\t")))


def process_kleborate_results(path_klebsiella: str, k_specie: dict):
    """
    Processes Kleborate results and compiles information from various species into a single TSV file.

    Every directory of path_klebsiella whose name starts with "k" is treated
    as a species folder. For each assembly folder <rep> found in
    <specie>/refseq/bacteria, the file <rep>/<rep>_Kaptive_out.txt is read
    (header line + first data line) and, unless its "Match confidence" is
    "None" or "Low", one row is appended to the output:
    accession, species name, best match locus, confidence and the number of
    missing expected genes.

    Species and assemblies are visited in sorted order so that the output
    file is reproducible from one run to the next.

    Parameters:
    path_klebsiella (str): The base path containing species directories.
    k_specie (dict): Dictionary mapping species codes (the directory names)
    to full species names; codes absent from it are reported as "Unknown".

    Returns:
    None: Outputs are written to a kleborate_results_all.tsv file inside
    path_klebsiella (overwritten on every call). Errors are printed, never
    raised.
    """
    output_file = os.path.join(path_klebsiella, "kleborate_results_all.tsv")
    required_columns = ("Best match locus", "Match confidence", "Missing expected genes")
    rejected_confidences = ("None", "Low")

    try:
        with open(output_file, "w") as outfile:
            # Write the header line for the output file
            outfile.write("Accession\tSpecie\tK-Serotype\tConfidence\tn Missing genes in K-Locus\n")

            for specie in sorted(os.listdir(path_klebsiella)):
                specie_path = os.path.join(path_klebsiella, specie)

                # Process only directories that start with 'k'
                if not (specie.startswith("k") and os.path.isdir(specie_path)):
                    continue

                refseq_path = os.path.join(specie_path, "refseq", "bacteria")
                if not os.path.isdir(refseq_path):
                    print(f"Refseq directory not found for {specie}. Skipping...")
                    continue

                for rep in sorted(os.listdir(refseq_path)):
                    kaptive_out_file = os.path.join(refseq_path, rep, f"{rep}_Kaptive_out.txt")

                    if not os.path.isfile(kaptive_out_file):
                        print(f"Kaptive output file not found for {rep}. Skipping...")
                        continue

                    kaptive_dic = _read_kaptive_record(kaptive_out_file)
                    if kaptive_dic is None:
                        print(f"Incomplete Kaptive output for {rep}. Skipping...")
                        continue

                    # A table lacking an expected column is skipped on its
                    # own instead of aborting the whole compilation.
                    absent = [name for name in required_columns if name not in kaptive_dic]
                    if absent:
                        print(f"Kaptive output for {rep} lacks column(s) {', '.join(absent)}. Skipping...")
                        continue

                    # Count the non-empty ';'-separated gene names; this is
                    # right whether or not the list ends with a ';' and
                    # yields 0 for an empty field.
                    n_miss = sum(
                        1
                        for gene in kaptive_dic["Missing expected genes"].split(";")
                        if gene.strip()
                    )

                    # Keep every typing except those flagged None or Low
                    if kaptive_dic["Match confidence"] in rejected_confidences:
                        continue

                    outfile.write(f"{rep}\t{k_specie.get(specie, 'Unknown')}\t"
                                  f"{kaptive_dic['Best match locus']}\t"
                                  f"{kaptive_dic['Match confidence']}\t"
                                  f"{n_miss}\n")
                    print(f"Processed: {rep}")
    except Exception as e:
        # Deliberately broad: callers rely on this step reporting problems
        # instead of raising.
        print(f"An error occurred: {e}")


def run_prokka_annotation(path_klebsiella: str, target_species: str = "k_pneumoniae"):
    """
    Runs Prokka annotation on .fna files for a specific species in the Klebsiella dataset.

    The .fna files of <target_species>/Database_init are annotated one by
    one, each into
    <target_species>/refseq/bacteria/<rep>/prokka_annotation, where <rep> is
    the first two underscore-separated fields of the file name. An assembly
    whose prokka_annotation folder already exists is skipped.

    Because the existence of that folder is the "already done" marker, the
    folder created for a run is removed again when Prokka does not complete
    (non-zero exit, missing executable, interruption). Otherwise a failed
    assembly would be skipped forever on later calls.

    Parameters:
    path_klebsiella : Base path containing species directories with FASTA files.
    target_species : The species to run Prokka annotation for (default is
    k_pneumoniae). Only names starting with "k" are processed.

    Returns:
    None: Outputs will be written to the corresponding directories. Errors
    are printed, never raised.
    """
    # Local import: only needed to clean up after a failed run.
    import shutil

    try:
        for specie in os.listdir(path_klebsiella):
            # Process only the target species
            if specie != target_species or not specie.startswith("k"):
                continue

            specie_path = os.path.join(path_klebsiella, specie)
            db_init_path = os.path.join(specie_path, "Database_init")

            # Ensure the Database_init directory exists
            if not os.path.isdir(db_init_path):
                print(f"Database_init directory not found for {specie}. Skipping...")
                continue

            fasta_files = [f for f in os.listdir(db_init_path) if f.endswith(".fna")]

            # random.sample over the full length visits *every* file, in a
            # shuffled order (presumably so that concurrent jobs do not all
            # start on the same assembly — confirm).
            for fasta in random.sample(fasta_files, len(fasta_files)):
                rep = "_".join(fasta.split("_")[0:2])
                path_out = os.path.join(specie_path, "refseq", "bacteria", rep)
                prokka_out_path = os.path.join(path_out, "prokka_annotation")

                if os.path.isdir(prokka_out_path):
                    print(f"Prokka annotation already exists for: {rep}. Skipping...")
                    continue

                # Created up front, so the folder doubles as a claim on this
                # assembly while the annotation is running.
                os.makedirs(prokka_out_path, exist_ok=True)

                prokka_command = [
                    "prokka", os.path.join(db_init_path, fasta),
                    "--norrna", "--notrna",
                    "--outdir", prokka_out_path,
                    "--prefix", rep,
                    "--compliant", "--force",
                    "--cpus", "0",
                ]

                try:
                    subprocess.run(prokka_command, check=True, capture_output=True, text=True)
                except subprocess.CalledProcessError as e:
                    # Drop the partial output so the assembly is retried later.
                    shutil.rmtree(prokka_out_path, ignore_errors=True)
                    print(f"Error running Prokka on {fasta}: {e.stderr}")
                except BaseException:
                    # Missing executable, Ctrl-C, ...: same cleanup, then let
                    # the error reach the outer handler (or the user).
                    shutil.rmtree(prokka_out_path, ignore_errors=True)
                    raise
                else:
                    print(f"Prokka annotation completed for: {fasta}")

    except Exception as e:
        # Deliberately broad: callers rely on this step reporting problems
        # instead of raising.
        print(f"An error occurred: {e}")
        
        

In [None]:
def main(path_klebsiella: str, singularity_img_path: str, t_ncbi: int):
    """
    Main function to run singularity, Kleborate, and Prokka annotation on Klebsiella species.

    Steps, in order:
    1. download the assemblies with the singularity image;
    2. run Kleborate on the FASTA files;
    3. compile the Kleborate/Kaptive results into one TSV (done once, for
       all species);
    4. run the Prokka annotation for each species listed in the species
       dictionary.

    The step functions print their own errors instead of raising, so the
    "completed successfully" messages below only mean that the step
    returned; check the preceding output for error lines.

    Parameters:
    path_klebsiella : Base path containing species directories.
    singularity_img_path : Path to the singularity image.
    t_ncbi : NCBI taxid for the species to download.

    Returns:
    None
    """
    # Dictionary mapping Klebsiella species codes to full species names.
    # The codes are used as directory names, so their spelling is kept
    # exactly as-is (NOTE(review): "k_grimatii" and "k_michiganesis" differ
    # from the species names they map to — verify against the folders on disk).
    k_specie = {
        "k_aerogenes": "Klebsiella aerogenes",
        "k_africana": "Klebsiella africana",
        "k_grimatii": "Klebsiella grimontii",
        "k_huaxiensis": "Klebsiella huaxiensis",
        "k_indica": "Klebsiella indica",
        "k_michiganesis": "Klebsiella michiganensis",
        "k_oxytoca": "Klebsiella oxytoca",
        "k_pasteurii": "Klebsiella pasteurii",
        "k_pneumoniae": "Klebsiella pneumoniae",
        "k_quasipneumoniae": "Klebsiella quasipneumoniae",
        "k_quasivariicola": "Klebsiella quasivariicola",
        "k_spallanzanii": "Klebsiella spallanzanii",
        "k_variicola": "Klebsiella variicola",
    }

    try:
        # Step 1: Run the singularity command to prepare the environment
        print("Running Singularity to prepare the dataset...")
        run_singularity(singularity_img_path, t_ncbi, path_klebsiella)
        print("Singularity command completed successfully.\n")

        # Step 2: Run Kleborate on the FASTA files in the Klebsiella dataset
        print("Running Kleborate on FASTA files...")
        run_kleborate_on_fasta(path_klebsiella)
        print("Kleborate analysis completed successfully.\n")

        # Step 3: Process the Kleborate results and compile them into a single TSV file.
        # This walks every species folder itself and needs the whole
        # code -> name dictionary, so it is called once, outside the loop.
        print("Processing Kleborate results...")
        process_kleborate_results(path_klebsiella, k_specie)
        print("Kleborate results processed and saved successfully.\n")

        # Step 4: Run Prokka annotation, one species at a time
        for specie in k_specie:
            print("Running Prokka annotation on FASTA files...")
            run_prokka_annotation(path_klebsiella, specie)
            print("Prokka annotation completed successfully.\n")

    except Exception as e:
        print(f"An error occurred during the main process: {e}")


if __name__ == "__main__":
    # Root folder of the Klebsiella dataset (one sub-folder per species)
    path_klebsiella = "/home/conchae/prediction_depolymerase_tropism"
    # The singularity image is stored directly inside the dataset root
    singularity_img_path = f"{path_klebsiella}/panacota.img"
    # NCBI taxid handed to the download step; change it to target another taxon
    t_ncbi = 570
    # Launch the whole pipeline
    main(
        path_klebsiella=path_klebsiella,
        singularity_img_path=singularity_img_path,
        t_ncbi=t_ncbi,
    )