# Preparing the Klebsiella genome files for the phylogenetic tree
***

In [None]:
import os

def create_pangenome_list(path_klebsiella: str, species_of_interest: list, output_file: str) -> None:
    """
    Write the identifiers of confidently typed strains to ``output_file``.

    Every sub-directory of ``path_klebsiella`` whose name appears in
    ``species_of_interest`` is scanned. For each entry ``<rep>`` found in its
    ``refseq/bacteria`` folder, ``<rep>/<rep>_Kaptive_out.txt`` is read as a
    tab-separated table (header line + first data line). ``<rep>`` is kept
    unless its "Match confidence" value is absent, "None" or "Low".

    Args:
        path_klebsiella: Root directory holding one sub-directory per species.
        species_of_interest: Names of the species sub-directories to scan.
        output_file: Text file receiving one strain identifier per line. It is
            overwritten, and its parent directory is created when missing.

    Returns:
        None. Progress is printed; I/O and decoding errors are printed and
        not re-raised.
    """
    rejected_confidences = {"None", "Low"}
    wanted_species = set(species_of_interest)

    try:
        # The list usually lives in a dedicated sub-folder that may not exist
        # yet on a first run; opening the file would fail without it.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_file, "w") as outfile:
            # Sorted listings make the resulting file reproducible across runs
            # and file systems (os.listdir order is arbitrary).
            for specie_dir in sorted(os.listdir(path_klebsiella)):
                specie_path = os.path.join(path_klebsiella, specie_dir)
                if specie_dir not in wanted_species or not os.path.isdir(specie_path):
                    continue

                refseq_path = os.path.join(specie_path, "refseq", "bacteria")
                if not os.path.isdir(refseq_path):
                    print(f"Refseq directory missing for {specie_dir}. Skipping...")
                    continue

                for rep in sorted(os.listdir(refseq_path)):
                    kaptive_out_file = os.path.join(refseq_path, rep, f"{rep}_Kaptive_out.txt")
                    if not os.path.isfile(kaptive_out_file):
                        print(f"Kaptive file missing for {rep}")
                        continue

                    # Only the header and the first result row are needed.
                    with open(kaptive_out_file, "r") as kaptive_out:
                        header_line = kaptive_out.readline()
                        value_line = kaptive_out.readline()
                    if not value_line.strip():
                        continue  # no result row

                    headers = header_line.rstrip("\r\n").split("\t")
                    values = value_line.rstrip("\r\n").split("\t")
                    kaptive_data = dict(zip(headers, values))

                    confidence = kaptive_data.get("Match confidence", "None").strip()
                    if confidence not in rejected_confidences:
                        outfile.write(f"{rep}\n")
                        print(f"Added {rep}")
        print(f"Saved good strains to {output_file}")
    except (OSError, UnicodeError) as e:
        print(f"Error creating pangenome list: {e}")


def _write_strain_protein_file(faa_path: str, prt_path: str, strain: str) -> int:
    """
    Copy the FASTA file ``faa_path`` to ``prt_path`` with relabelled headers.

    Each header ``>text`` becomes ``><strain>_<index> text`` where ``index``
    counts records from 0. Blank lines and anything preceding the first header
    are dropped. Returns the number of records written.
    """
    record_count = 0
    with open(faa_path, "r") as faa_file, open(prt_path, "w") as outfile:
        for line in faa_file:
            # Only a leading ">" starts a record; a ">" inside a description
            # (e.g. "A->B") must not split it.
            if line.startswith(">"):
                outfile.write(f">{strain}_{record_count} {line[1:].rstrip()}\n")
                record_count += 1
            elif record_count and line.strip():
                outfile.write(line.rstrip() + "\n")
    return record_count


def create_protein_files(path_klebsiella: str, path_prot_files: str, good_strains_file: str) -> None:
    """
    Create one relabelled protein file per strain listed in ``good_strains_file``.

    For every strain, the species sub-directories of ``path_klebsiella`` are
    searched for
    ``<species>/refseq/bacteria/<strain>/prokka_annotation/<strain>.faa``.
    The first match is rewritten to ``<path_prot_files>/<strain>.prt`` with
    headers of the form ``><strain>_<index> <original header>``.

    Args:
        path_klebsiella: Root directory holding one sub-directory per species.
        path_prot_files: Output directory; created if it does not exist.
        good_strains_file: Text file with one strain identifier per line
            (e.g. GCA_000123456.1). Blank lines are ignored.

    Returns:
        None. Progress is printed; I/O and decoding errors are printed and
        not re-raised.
    """
    try:
        os.makedirs(path_prot_files, exist_ok=True)

        with open(good_strains_file, "r") as strains_handle:
            good_strains = [line.strip() for line in strains_handle if line.strip()]

        # List the species directories once instead of once per strain.
        species_dirs = sorted(
            entry
            for entry in os.listdir(path_klebsiella)
            if os.path.isdir(os.path.join(path_klebsiella, entry))
        )

        for strain in good_strains:
            for specie_dir in species_dirs:
                faa_path = os.path.join(
                    path_klebsiella, specie_dir, "refseq", "bacteria",
                    strain, "prokka_annotation", f"{strain}.faa",
                )
                if not os.path.isfile(faa_path):
                    continue

                prt_path = os.path.join(path_prot_files, f"{strain}.prt")
                _write_strain_protein_file(faa_path, prt_path, strain)
                print(f"Processed {strain}")
                break
            else:
                # The loop ended without finding an annotation file.
                print(f"Strain {strain} not found in any species directory")
    except (OSError, UnicodeError) as e:
        print(f"Error creating protein files: {e}")


def _append_relabelled_proteins(faa_path: str, strain: str, outfile) -> int:
    """
    Append the records of the FASTA file ``faa_path`` to the open ``outfile``.

    Each header ``>text`` becomes ``><strain>_<index> text`` where ``index``
    counts the records of this file from 0. Blank lines and anything preceding
    the first header are dropped. Returns the number of records appended.
    """
    record_count = 0
    with open(faa_path, "r") as faa:
        for line in faa:
            # Only a leading ">" starts a record; a ">" inside a description
            # (e.g. "A->B") must not split it.
            if line.startswith(">"):
                outfile.write(f">{strain}_{record_count} {line[1:].rstrip()}\n")
                record_count += 1
            elif record_count and line.strip():
                outfile.write(line.rstrip() + "\n")
    return record_count


def create_combined_pangenome_file(path_klebsiella: str, good_strains_file: str, combined_output_file: str) -> None:
    """
    Concatenate the proteins of every listed strain into one FASTA file.

    For every strain in ``good_strains_file``, the species sub-directories of
    ``path_klebsiella`` are searched for
    ``<species>/refseq/bacteria/<strain>/prokka_annotation/<strain>.faa``.
    The records of the first match are appended to ``combined_output_file``
    with headers of the form ``><strain>_<index> <original header>``.

    Args:
        path_klebsiella: Root directory holding one sub-directory per species.
        good_strains_file: Text file with one strain identifier per line.
            Blank lines are ignored.
        combined_output_file: Destination FASTA file; overwritten.

    Returns:
        None. Progress is printed; I/O and decoding errors are printed and
        not re-raised.
    """
    try:
        with open(good_strains_file, "r") as strains_handle:
            good_strains = [line.strip() for line in strains_handle if line.strip()]

        # List the species directories once instead of once per strain.
        species_dirs = sorted(
            entry
            for entry in os.listdir(path_klebsiella)
            if os.path.isdir(os.path.join(path_klebsiella, entry))
        )

        with open(combined_output_file, "w") as outfile:
            for strain in good_strains:
                for specie_dir in species_dirs:
                    faa_file = os.path.join(
                        path_klebsiella, specie_dir, "refseq", "bacteria",
                        strain, "prokka_annotation", f"{strain}.faa",
                    )
                    if not os.path.isfile(faa_file):
                        continue

                    _append_relabelled_proteins(faa_file, strain, outfile)
                    print(f"Added {strain}")
                    break
                else:
                    # Report strains that contribute nothing to the output
                    # instead of dropping them silently.
                    print(f"Strain {strain} not found in any species directory")
        print(f"Combined pangenome saved to {combined_output_file}")
    except (OSError, UnicodeError) as e:
        print(f"Error creating combined file: {e}")


# Species table: full species name -> {"name": short code, "id": numeric id}.
# main() uses the short codes as the species directory names to scan.
# The ids are presumably NCBI Taxonomy ids — TODO confirm.
# NOTE(review): "k_michiganesis" lacks the second "n" of "michiganensis";
# left unchanged because it may match an existing directory name — verify.
k_specie = {
    species: {"name": code, "id": numeric_id}
    for species, code, numeric_id in (
        ("Klebsiella aerogenes", "k_aerogenes", 548),
        ("Klebsiella africana", "k_africana", 2489010),
        ("Klebsiella grimontii", "k_grimontii", 2058152),
        ("Klebsiella huaxiensis", "k_huaxiensis", 2153354),
        ("Klebsiella indica", "k_indica", 2582917),
        ("Klebsiella michiganensis", "k_michiganesis", 1134687),
        ("Klebsiella oxytoca", "k_oxytoca", 571),
        ("Klebsiella pasteurii", "k_pasteurii", 2587529),
        ("Klebsiella pneumoniae", "k_pneumoniae", 573),
        ("Klebsiella quasipneumoniae", "k_quasipneumoniae", 1463165),
        ("Klebsiella quasivariicola", "k_quasivariicola", 2026240),
        ("Klebsiella spallanzanii", "k_spallanzanii", 2587528),
        ("Klebsiella variicola", "k_variicola", 244366),
    )
}


def main():
    """
    Run the three preparation steps in order.

    Builds the list of good strains, writes one protein file per strain, then
    writes the combined protein file. All paths derive from a hard-coded root
    directory.
    """
    root_dir = "/home/conchae/prediction_depolymerase_tropism"
    pangenome_dir = os.path.join(root_dir, "panacota_pangenome")
    strain_list = os.path.join(pangenome_dir, "panacota_pangenome_list_v2.1.txt")
    protein_dir = os.path.join(pangenome_dir, "protein_files")
    merged_proteins = os.path.join(root_dir, "pangenome_klebsiella_all.prt")

    # Short species codes (e.g., "k_aerogenes") select the directories to scan
    species_codes = []
    for entry in k_specie.values():
        species_codes.append(entry["name"])

    print("=== Creating pangenome list ===")
    create_pangenome_list(root_dir, species_codes, strain_list)

    print("\n=== Creating protein files ===")
    create_protein_files(root_dir, protein_dir, strain_list)

    print("\n=== Creating combined pangenome ===")
    create_combined_pangenome_file(root_dir, strain_list, merged_proteins)


# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()