# Collecting and processing the Klebsiella genomic data
***
### I. Collecting the Klebsiella genome data (run_singularity)
### II. Running Kleborate on the FASTA files (run_kleborate_on_fasta)
### III. Gene annotation of the correctly annotated genomes (run_prokka_annotation)
***

In [None]:
import subprocess
import os 
import random

def run_singularity(img_path: str, t_ncbi: int, output_path: str):
    """
    Launch the `prepare` step of the singularity image for one NCBI taxid.

    Executes ``singularity run <img_path> prepare -T <t_ncbi> -o <output_path>``
    and echoes the captured standard output when the command succeeds.  A
    non-zero exit status is not raised: the captured stderr is printed instead.

    Parameters:
    img_path : Path to the singularity image /panacota.img.
    t_ncbi : NCBI taxid for the species to download.
    output_path : Output directory path for the species.
    """
    argv = ["singularity", "run", img_path]
    argv += ["prepare", "-T", str(t_ncbi)]
    argv += ["-o", output_path]

    try:
        completed = subprocess.run(argv, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        print(f"Error occurred while running the command: {err.stderr}")
    else:
        print(completed.stdout)

def run_kleborate_on_fasta(path_klebsiella: str):
    """
    Runs Kleborate on .fna files in each species' Database_init directory.

    For every entry of `path_klebsiella` that contains a `Database_init`
    directory, each `*.fna` file is passed to
    ``kleborate --kaptive_k --kaptive_k_outfile <out> -a <fasta>``.
    The Kaptive table is written to
    ``<species>/refseq/bacteria/<rep>/<rep>_Kaptive_out.txt`` where `rep` is
    the first two underscore-separated fields of the FASTA file name.
    Genomes whose Kaptive table already exists are skipped.

    Parameters:
    path_klebsiella : Root directory holding one sub-directory per species.

    Errors are reported with `print` and never raised: a failing Kleborate
    run only skips that genome, any other error stops the scan.
    """
    try:
        for specie in os.listdir(path_klebsiella):
            specie_path = os.path.join(path_klebsiella, specie)
            init_path = os.path.join(specie_path, "Database_init")

            # isdir rather than exists: a stray *file* named Database_init
            # would make os.listdir raise and abort all remaining species.
            if not os.path.isdir(init_path):
                print(f"Directory {init_path} does not exist. Skipping...")
                continue

            for fasta in os.listdir(init_path):
                if not fasta.endswith(".fna"):
                    continue

                rep = "_".join(fasta.split("_")[0:2])
                path_out = os.path.join(specie_path, "refseq", "bacteria", rep)
                kaptive_out_file = os.path.join(path_out, f"{rep}_Kaptive_out.txt")

                if os.path.isfile(kaptive_out_file):
                    print(f"Output file {kaptive_out_file} already exists. Skipping...")
                    continue

                # Make sure the destination directory exists before the tool
                # is asked to write the Kaptive table into it.
                os.makedirs(path_out, exist_ok=True)

                command = [
                    "kleborate", "--kaptive_k",
                    "--kaptive_k_outfile", kaptive_out_file,
                    "-a", os.path.join(init_path, fasta),
                ]
                try:
                    subprocess.run(command, check=True, capture_output=True, text=True)
                except subprocess.CalledProcessError as e:
                    print(f"Error running kleborate on {fasta}: {e.stderr}")
                else:
                    print(f"Processed: {fasta}")
    except Exception as e:
        # Top-level boundary of this step: report and return (e.g. missing
        # root directory or missing kleborate executable).
        print(f"An error occurred: {e}")

def process_kleborate_results(path_klebsiella: str, code_to_name: dict):
    """
    Processes Kleborate results using the code_to_name mapping.

    Scans every species directory (name starting with "k") under
    `path_klebsiella`, reads each
    ``refseq/bacteria/<rep>/<rep>_Kaptive_out.txt`` (tab-separated header line
    followed by one value line) and writes one row per genome to
    ``<path_klebsiella>/kleborate_results_all.tsv``.  Genomes whose
    "Match confidence" is "None" or "Low" are left out.

    Parameters:
    path_klebsiella : Root directory holding one sub-directory per species.
    code_to_name : Maps a species directory name to the species name written
        in the "Specie" column; unmapped directories are written as 'Unknown'.

    Errors are reported with `print` and never raised.  A Kaptive file that
    is missing, too short or lacking an expected column only skips that
    genome.
    """
    output_file = os.path.join(path_klebsiella, "kleborate_results_all.tsv")
    required_columns = ("Best match locus", "Match confidence", "Missing expected genes")

    try:
        with open(output_file, "w") as outfile:
            outfile.write("Accession\tSpecie\tK-Serotype\tConfidence\tn Missing genes in K-Locus\n")

            # sorted(): os.listdir order is arbitrary; sorting makes the
            # row order of the TSV reproducible between runs.
            for specie in sorted(os.listdir(path_klebsiella)):
                specie_path = os.path.join(path_klebsiella, specie)
                if not (specie.startswith("k") and os.path.isdir(specie_path)):
                    continue

                refseq_path = os.path.join(specie_path, "refseq", "bacteria")
                if not os.path.isdir(refseq_path):
                    print(f"Refseq directory not found for {specie}. Skipping...")
                    continue

                for rep in sorted(os.listdir(refseq_path)):
                    kaptive_out_file = os.path.join(refseq_path, rep, f"{rep}_Kaptive_out.txt")
                    if not os.path.isfile(kaptive_out_file):
                        print(f"Kaptive output not found for {rep}. Skipping...")
                        continue

                    with open(kaptive_out_file, "r") as kf:
                        kaptive_out = kf.read().splitlines()

                    if len(kaptive_out) < 2:
                        print(f"Incomplete Kaptive output for {rep}. Skipping...")
                        continue

                    # Header line -> value line mapping.
                    kaptive_dic = dict(zip(kaptive_out[0].split("\t"), kaptive_out[1].split("\t")))

                    # A single malformed table must not abort the whole
                    # export (a KeyError here used to truncate the TSV).
                    absent = [col for col in required_columns if col not in kaptive_dic]
                    if absent:
                        print(f"Kaptive output for {rep} lacks column(s) {absent}. Skipping...")
                        continue

                    # Number of ';' separators in the field (0 when empty).
                    n_miss = len(kaptive_dic["Missing expected genes"].split(";")) - 1

                    if kaptive_dic["Match confidence"] not in ["None", "Low"]:
                        outfile.write(f"{rep}\t{code_to_name.get(specie, 'Unknown')}\t"
                                      f"{kaptive_dic['Best match locus']}\t"
                                      f"{kaptive_dic['Match confidence']}\t"
                                      f"{n_miss}\n")
                        print(f"Processed: {rep}")
    except Exception as e:
        # Top-level boundary of this step: report and return.
        print(f"An error occurred: {e}")

def run_prokka_annotation(path_klebsiella: str, target_species: str):
    """
    Runs Prokka annotation for the target species.

    Every `*.fna` file of ``<target_species>/Database_init`` is annotated, in
    random order, into
    ``<target_species>/refseq/bacteria/<rep>/prokka_annotation`` where `rep`
    is the first two underscore-separated fields of the FASTA file name.
    The existence of that output directory is what marks a genome as already
    handled, so it is removed again when Prokka does not complete; otherwise
    a failed run would be skipped forever on later invocations.

    Parameters:
    path_klebsiella : Root directory holding one sub-directory per species.
    target_species : Name of the species directory to annotate; it is only
        processed when it starts with "k" and is listed in `path_klebsiella`.

    Errors are reported with `print` and never raised.
    """
    import shutil  # local import: only needed to clean up failed runs

    try:
        for specie in os.listdir(path_klebsiella):
            if not (specie.startswith("k") and specie == target_species):
                continue

            specie_path = os.path.join(path_klebsiella, specie)
            db_init_path = os.path.join(specie_path, "Database_init")
            if not os.path.isdir(db_init_path):
                print(f"Database_init not found for {specie}. Skipping...")
                continue

            fasta_files = [f for f in os.listdir(db_init_path) if f.endswith(".fna")]
            # random.sample(..., len(...)) yields the files in shuffled order.
            for fasta in random.sample(fasta_files, len(fasta_files)):
                rep = "_".join(fasta.split("_")[0:2])
                path_out = os.path.join(specie_path, "refseq", "bacteria", rep)
                prokka_out_path = os.path.join(path_out, "prokka_annotation")

                if os.path.isdir(prokka_out_path):
                    print(f"Prokka exists for {rep}. Skipping...")
                    continue

                os.makedirs(prokka_out_path, exist_ok=True)
                prokka_command = [
                    "prokka", os.path.join(db_init_path, fasta),
                    "--norrna", "--notrna", "--outdir", prokka_out_path,
                    "--prefix", rep, "--compliant", "--force", "--cpus", "0",
                ]
                try:
                    subprocess.run(prokka_command, check=True, capture_output=True, text=True)
                except subprocess.CalledProcessError as e:
                    # Drop the marker directory so the genome is retried.
                    shutil.rmtree(prokka_out_path, ignore_errors=True)
                    print(f"Error running Prokka on {fasta}: {e.stderr}")
                except BaseException:
                    # Missing executable, interruption, ...: same clean-up,
                    # then let the original error propagate unchanged.
                    shutil.rmtree(prokka_out_path, ignore_errors=True)
                    raise
                else:
                    print(f"Prokka completed for {fasta}")
    except Exception as e:
        # Top-level boundary of this step: report and return.
        print(f"An error occurred: {e}")

def main(path_klebsiella: str, singularity_img_path: str):
    """
    Run the whole pipeline for every species listed in k_specie.

    Steps, in order: download each species with run_singularity into
    ``<path_klebsiella>/<species code>``, run Kleborate on the downloaded
    FASTA files, collect the Kaptive results into one table, then run Prokka
    species by species.  Any exception escaping a step is printed and stops
    the remaining steps.

    Parameters:
    path_klebsiella : Root directory under which the species directories live.
    singularity_img_path : Path of the singularity image given to run_singularity.
    """
    # Species name -> directory code ("name") and NCBI taxid ("id").
    # NOTE(review): "k_michiganesis" lacks the second "n" of the species name;
    # it is used as a directory name, so it is kept as is.
    k_specie = {
        "Klebsiella aerogenes": {"name": "k_aerogenes", "id": 548},
        "Klebsiella africana": {"name": "k_africana", "id": 2489010},
        "Klebsiella grimontii": {"name": "k_grimontii", "id": 2058152},
        "Klebsiella huaxiensis": {"name": "k_huaxiensis", "id": 2153354},
        "Klebsiella indica": {"name": "k_indica", "id": 2582917},
        "Klebsiella michiganensis": {"name": "k_michiganesis", "id": 1134687},
        "Klebsiella oxytoca": {"name": "k_oxytoca", "id": 571},
        "Klebsiella pasteurii": {"name": "k_pasteurii", "id": 2587529},
        "Klebsiella pneumoniae": {"name": "k_pneumoniae", "id": 573},
        "Klebsiella quasipneumoniae": {"name": "k_quasipneumoniae", "id": 1463165},
        "Klebsiella quasivariicola": {"name": "k_quasivariicola", "id": 2026240},
        "Klebsiella spallanzanii": {"name": "k_spallanzanii", "id": 2587528},
        "Klebsiella variicola": {"name": "k_variicola", "id": 244366},
    }

    # Reverse lookup: directory code -> species name.
    code_to_name = {}
    for full_name, entry in k_specie.items():
        code_to_name[entry['name']] = full_name

    try:
        print("Running Singularity for each species...")
        for entry in k_specie.values():
            target_dir = os.path.join(path_klebsiella, entry['name'])
            os.makedirs(target_dir, exist_ok=True)
            run_singularity(singularity_img_path, entry['id'], target_dir)
        print("Singularity completed.\n")

        print("Running Kleborate...")
        run_kleborate_on_fasta(path_klebsiella)
        print("Kleborate completed.\n")

        print("Processing results...")
        process_kleborate_results(path_klebsiella, code_to_name)
        print("Results processed.\n")

        print("Running Prokka...")
        for code in code_to_name:
            run_prokka_annotation(path_klebsiella, code)
        print("Prokka completed.\n")
    except Exception as e:
        print(f"Error in main: {e}")

if __name__ == "__main__":
    # Root working directory handed to main(); the species directories and
    # the combined results table are created underneath it.
    path_klebsiella = "/home/conchae/prediction_depolymerase_tropism"
    # Singularity image that main() passes on to the download step.
    singularity_img_path = "/home/conchae/prediction_depolymerase_tropism/panacota.img"
    main(path_klebsiella, singularity_img_path)