In [None]:
import os

import pandas as pd
from tqdm.notebook import tqdm

from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px

# Notebook-wide plotting defaults: 200-dpi figures rendered with seaborn's
# "whitegrid" style, "paper" context and "deep" colour palette.
plt.rcParams.update({"figure.dpi": 200})
sns.set_style("whitegrid")
sns.set_context("paper")
sns.set_palette("deep")

from pyphylon.pangenome import build_cds_pangenome
from pyphylon.util import load_config

In [None]:
# Load the project configuration once; the names and paths below are read
# from it (presumably a dict-like mapping — confirm against load_config).
CONFIG = load_config("config.yml")

# Entries read by plain indexing, i.e. expected to be present in the config.
WORKDIR, SPECIES = CONFIG["WORKDIR"], CONFIG["PG_NAME"]

# Entries read with .get(), falling back to the given default paths.
temp_folder = CONFIG.get("REUSE_TEMP_DIR", "../temp/")
data_dir = CONFIG.get("SNAKEMAKE_DATA_DIR", "data/")

In [None]:
# Read the genome metadata table from the temp folder. The first CSV column
# becomes the index and every column is loaded with dtype 'object', so no
# numeric type inference is applied to the values.
mash_scrubbed_metadata = pd.read_csv(
    os.path.join(temp_folder, '2b_genome_metadata.csv'),
    dtype='object',
    index_col=0,
)

# Show the table dimensions first, then a preview of the leading rows.
display(mash_scrubbed_metadata.shape)
display(mash_scrubbed_metadata.head())

In [None]:
# Root folder of the Bakta results; each sub-directory is expected to hold a
# protein FASTA named after the directory itself: <BAKTA>/<name>/<name>.faa
BAKTA = os.path.join(data_dir, 'processed/bakta/')

# os.listdir() returns entries in arbitrary order, so sort them: this keeps
# the path list (and anything built from it downstream) identical between
# runs and machines. Non-directory entries are skipped.
bakta_faa_paths = [
    os.path.join(BAKTA, bakta_folder, bakta_folder + '.faa')
    for bakta_folder in sorted(os.listdir(BAKTA))
    if os.path.isdir(os.path.join(BAKTA, bakta_folder))
]

# Preview the first few constructed paths.
bakta_faa_paths[:5]

In [None]:
# Sanity check: every constructed .faa path must point at an existing file.
# All missing paths are collected before asserting, so a failure reports how
# many files are absent (and a sample of them) instead of stopping silently
# at the first one with no message.
missing_faa_paths = []
for path in tqdm(bakta_faa_paths):
    if not os.path.isfile(path):
        missing_faa_paths.append(path)

assert not missing_faa_paths, (
    f"{len(missing_faa_paths)} expected .faa file(s) not found, "
    f"e.g. {missing_faa_paths[:5]}"
)

In [None]:
# Genome IDs from the metadata table's 'genome_id' column, as strings.
genome_id_set = set(mash_scrubbed_metadata['genome_id'].astype(str).tolist())

# Keep only the .faa paths whose parent folder name is one of those IDs.
real_paths = [
    faa_path
    for faa_path in bakta_faa_paths
    if os.path.split(os.path.dirname(faa_path))[1] in genome_id_set
]

print(f"Matched {len(real_paths)} BAKTA paths to metadata genomes.")
real_paths[:5]

In [None]:
# Number of .faa paths kept after filtering against the metadata genome IDs.
len(real_paths)

In [None]:
# Folder that receives the CD-HIT results; created on first run.
cdhit_output_dir = os.path.join(data_dir, 'processed/cd-hit-results/')
os.makedirs(cdhit_output_dir, exist_ok=True)

# CD-HIT command-line options passed through build_cds_pangenome.
# Flag meanings are taken from the CD-HIT user guide.
CDHIT_ARGS = {
    '-n': 5,     # word length
    '-c': 0.8,   # sequence identity threshold
    '-aL': 0.8,  # alignment coverage required for the longer sequence
    '-T': 0,     # number of threads; 0 uses all available CPUs
    '-M': 0,     # memory limit in MB; 0 means unlimited
}

# Build the CDS pangenome from the metadata-matched protein FASTA files.
# Returns three objects; see build_cds_pangenome for their exact structure.
df_alleles, df_genes, header_to_allele = build_cds_pangenome(
    genome_faa_paths=real_paths,
    output_dir=cdhit_output_dir,
    name=SPECIES,
    cdhit_args=CDHIT_ARGS,
    fastasort_path=None,
    save_csv=False
)

In [None]:
# Column-wise sums of df_genes (presumably the number of genes present per
# genome — TODO confirm the orientation returned by build_cds_pangenome).
df_genes.sum()

In [None]:
# Hierarchically clustered heatmap of the transposed gene table, with missing
# entries filled with 0. The trailing semicolon keeps the cell output to the
# figure only, without the ClusterGrid object repr.
sns.clustermap(df_genes.fillna(0).transpose());