# Preparing the Klebsiella genome files for the phylogenetic tree
***

In [None]:
import os


def create_pangenome_list(path_klebsiella: str, species_of_interest: list, output_file: str):
    """
    Creates a list of good strains based on the Kaptive match confidence.

    A strain is kept only when its Kaptive report contains a header line and
    a result line, the result line has a "Match confidence" column, and that
    column holds a value other than "None" or "Low". Reports with no result
    line or without the column are skipped instead of being accepted.
    Species and strain folders are visited in sorted order so the output file
    is reproducible between runs.

    Parameters:
    path_klebsiella : The base directory containing Klebsiella species data.
    species_of_interest : List of species to be included in the analysis.
    output_file : Path to save the filtered strains.

    Returns:
    None: Writes the list of good strains to the output file.
    """
    rejected_confidences = ("None", "Low")

    # Best-effort step: any failure is reported on stdout and never raised.
    try:
        with open(output_file, "w") as outfile:
            for specie in sorted(os.listdir(path_klebsiella)):
                specie_path = os.path.join(path_klebsiella, specie)

                # Only process species in the species_of_interest list
                if specie not in species_of_interest or not os.path.isdir(specie_path):
                    continue

                refseq_path = os.path.join(specie_path, "refseq", "bacteria")
                if not os.path.isdir(refseq_path):
                    # One incomplete species folder must not abort the others.
                    print(f"RefSeq directory not found for {specie}")
                    continue

                for rep in sorted(os.listdir(refseq_path)):
                    kaptive_out_file = os.path.join(refseq_path, rep, f"{rep}_Kaptive_out.txt")

                    if not os.path.isfile(kaptive_out_file):
                        print(f"Kaptive output not found for {rep}")
                        continue

                    with open(kaptive_out_file, "r") as kaptive_out:
                        # splitlines() yields no phantom empty last line and
                        # copes with Windows line endings.
                        kaptive_lines = kaptive_out.read().splitlines()

                    if len(kaptive_lines) < 2:
                        # Header only (or empty file): nothing to evaluate.
                        continue

                    # Create a dictionary from Kaptive output
                    header = kaptive_lines[0].split("\t")
                    values = kaptive_lines[1].split("\t")
                    kaptive_dic = dict(zip(header, values))

                    confidence = kaptive_dic.get("Match confidence")
                    if confidence is None:
                        # A missing column is not evidence of a good match.
                        print(f"Match confidence not found for {rep}")
                        continue

                    # Keep every confidence level except "None" and "Low"
                    if confidence not in rejected_confidences:
                        outfile.write(f"{rep}\n")
                        print(f"Added {rep} to the good strains list")
        print(f"Strains written to {output_file}")
    except Exception as e:
        print(f"An error occurred while creating the pangenome list: {e}")


def create_protein_files(path_klebsiella: str, path_prot_files: str, good_strains_file: str):
    """
    Creates individual protein files for good strains.

    For every strain listed in good_strains_file, the Prokka protein FASTA
    (<strain>/prokka_annotation_all/<strain>.faa) is copied to
    <path_prot_files>/<strain>.prt with each header rewritten to
    ">{strain}_{index} {original header}", where index counts the records of
    that strain from 0. Records are detected by lines starting with ">", so a
    ">" inside a description does not split a record. Blank lines and any
    text before the first header are not copied.

    Parameters:
    path_klebsiella : The base directory containing Klebsiella species data.
    path_prot_files : The output directory for saving the protein files
        (created if it does not exist).
    good_strains_file : Path to the file containing the list of good strains.

    Returns:
    None: Writes protein sequences to individual files.
    """
    # Best-effort step: any failure is reported on stdout and never raised.
    try:
        with open(good_strains_file) as strains_handle:
            # A set gives constant-time membership tests in the loop below.
            good_strains = set(strains_handle.read().splitlines())

        os.makedirs(path_prot_files, exist_ok=True)

        for specie in sorted(os.listdir(path_klebsiella)):
            specie_path = os.path.join(path_klebsiella, specie)
            if not (specie.startswith("k") and os.path.isdir(specie_path)):
                continue

            refseq_path = os.path.join(specie_path, "refseq", "bacteria")
            if not os.path.isdir(refseq_path):
                # One incomplete species folder must not abort the others.
                continue

            for strain in sorted(os.listdir(refseq_path)):
                if strain not in good_strains:
                    continue

                faa_file_path = os.path.join(refseq_path, strain, "prokka_annotation_all", f"{strain}.faa")
                if not os.path.isfile(faa_file_path):
                    print(f"FAA file not found for {strain}")
                    continue

                prt_file_path = os.path.join(path_prot_files, f"{strain}.prt")
                with open(faa_file_path, "r") as faa_file, open(prt_file_path, "w") as outfile:
                    index = 0
                    for raw_line in faa_file:
                        line = raw_line.rstrip("\r\n")
                        if line.startswith(">"):
                            outfile.write(f">{strain}_{index} {line[1:]}\n")
                            print(f"Wrote sequence {index} for strain {strain}")
                            index += 1
                        elif line and index > 0:
                            # Sequence line belonging to the current record.
                            outfile.write(f"{line}\n")
        print(f"Protein files created in {path_prot_files}")
    except Exception as e:
        print(f"An error occurred while creating protein files: {e}")


def create_combined_pangenome_file(path_klebsiella: str, good_strains_file: str, combined_output_file: str):
    """
    Combines all protein sequences into a single pangenome file.

    For every strain listed in good_strains_file, the records of the Prokka
    protein FASTA (<strain>/prokka_annotation_all/<strain>.faa) are appended
    to combined_output_file with each header rewritten to
    ">{strain}_{index} {original header}", where index counts the records of
    that strain from 0. Records are detected by lines starting with ">", so a
    ">" inside a description does not split a record. Blank lines and any
    text before the first header are not copied. Species and strains are
    visited in sorted order so the combined file is reproducible.

    Parameters:
    path_klebsiella : The base directory containing Klebsiella species data.
    good_strains_file : Path to the file containing the list of good strains.
    combined_output_file : The output file to save combined protein sequences.

    Returns:
    None: Writes all protein sequences to the combined output file.
    """
    # Best-effort step: any failure is reported on stdout and never raised.
    try:
        with open(good_strains_file) as strains_handle:
            # A set gives constant-time membership tests in the loop below.
            good_strains = set(strains_handle.read().splitlines())

        with open(combined_output_file, "w") as outfile:
            for specie in sorted(os.listdir(path_klebsiella)):
                specie_path = os.path.join(path_klebsiella, specie)
                if not (specie.startswith("k") and os.path.isdir(specie_path)):
                    continue

                refseq_path = os.path.join(specie_path, "refseq", "bacteria")
                if not os.path.isdir(refseq_path):
                    # One incomplete species folder must not abort the others.
                    continue

                for strain in sorted(os.listdir(refseq_path)):
                    if strain not in good_strains:
                        continue

                    faa_file_path = os.path.join(refseq_path, strain, "prokka_annotation_all", f"{strain}.faa")
                    if not os.path.isfile(faa_file_path):
                        print(f"FAA file not found for {strain}")
                        continue

                    with open(faa_file_path, "r") as faa_file:
                        index = 0
                        for raw_line in faa_file:
                            line = raw_line.rstrip("\r\n")
                            if line.startswith(">"):
                                outfile.write(f">{strain}_{index} {line[1:]}\n")
                                print(f"Added sequence {index} for strain {strain}")
                                index += 1
                            elif line and index > 0:
                                # Sequence line belonging to the current record.
                                outfile.write(f"{line}\n")
        print(f"Combined pangenome file written to {combined_output_file}")
    except Exception as e:
        print(f"An error occurred while creating the combined pangenome file: {e}")


# Maps each Klebsiella species code to its full species name. The codes are
# the values compared against the folder names found under the base data
# directory when the species of interest are selected.
# NOTE(review): "k_grimatii" and "k_michiganesis" are spelled differently from
# their species names; presumably they mirror the on-disk folder names -
# confirm before renaming either key.
k_specie = {
    code: f"Klebsiella {epithet}"
    for code, epithet in (
        ("k_aerogenes", "aerogenes"),
        ("k_africana", "africana"),
        ("k_grimatii", "grimontii"),
        ("k_huaxiensis", "huaxiensis"),
        ("k_indica", "indica"),
        ("k_michiganesis", "michiganensis"),
        ("k_oxytoca", "oxytoca"),
        ("k_pasteurii", "pasteurii"),
        ("k_pneumoniae", "pneumoniae"),
        ("k_quasipneumoniae", "quasipneumoniae"),
        ("k_quasivariicola", "quasivariicola"),
        ("k_spallanzanii", "spallanzanii"),
        ("k_variicola", "variicola"),
    )
}


def main():
    """
    Runs the three preparation steps in order on the hard-coded data folder:
    build the list of good strains, write one protein file per good strain,
    then write the combined protein file for all good strains.
    """
    base_dir = "/home/conchae/prediction_depolymerase_tropism"

    # Both the strain list and the per-strain protein files live here.
    pangenome_dir = os.path.join(base_dir, "panacota_pangenome")
    strains_list_path = os.path.join(pangenome_dir, "panacota_pangenome_list_v2.1.txt")
    protein_dir = os.path.join(pangenome_dir, "protein_files")
    combined_path = os.path.join(base_dir, "pangenome_klebsiella_all.prt")

    # Every species code known to the module is considered for the pangenome.
    selected_species = list(k_specie)

    # Step 1: select the good strains.
    print("Creating the pangenome list...")
    create_pangenome_list(base_dir, selected_species, strains_list_path)

    # Step 2: one protein file per good strain.
    print("Creating individual protein files...")
    create_protein_files(base_dir, protein_dir, strains_list_path)

    # Step 3: all protein sequences of the good strains in a single file.
    print("Creating the combined pangenome file...")
    create_combined_pangenome_file(base_dir, strains_list_path, combined_path)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()