In [None]:
import os
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from pyphylon.util import load_config

In [None]:
# Read the notebook settings from config.yml via the pyphylon helper.
# NOTE(review): CONFIG is used like a dict below ([] and .get) — confirm the
# exact return type of load_config.
CONFIG = load_config("config.yml")
# Mandatory keys: plain indexing, so a missing key fails right here.
# Neither WORKDIR nor SPECIES is referenced again in the cells visible here.
WORKDIR = CONFIG["WORKDIR"]
SPECIES = CONFIG["PG_NAME"]
# Optional keys: fall back to the given relative directories when absent.
# temp_folder holds the intermediate files read and written by this notebook.
temp_folder = CONFIG.get("REUSE_TEMP_DIR", "../temp/")
# data_dir is the root under which the MLST report is looked up.
data_dir = CONFIG.get("SNAKEMAKE_DATA_DIR", "data/")

In [None]:
# Input files for this notebook.
# MLST: tab-separated MLST report located under the data directory.
MLST = os.path.join(data_dir, 'processed/mlst_report.txt')
# METADATA: genome metadata CSV located in the temp folder
# (presumably written by an earlier "2b" step — confirm against that notebook).
METADATA = os.path.join(temp_folder, '2b_genome_metadata.csv')

In [None]:
# Load the header-less, tab-separated MLST report. Every field is read as a
# string (dtype='object') so sequence types and allele calls are not coerced
# to numbers.
mlst = pd.read_csv(MLST, sep='\t', header=None, dtype='object')

# Add column names.
# The first three columns are fixed; every remaining column is an allele call.
# The allele names are derived from the actual width of the report instead of
# assuming exactly seven allele columns, so a report with a different number
# of loci no longer fails with a column-length mismatch. A ten-column report
# still gets allele1..allele7, exactly as before.
fixed_mlst_columns = ['genome_id', 'schema', 'mlst']
n_allele_columns = mlst.shape[1] - len(fixed_mlst_columns)
if n_allele_columns < 0:
    raise ValueError(
        f'MLST report {MLST!r} has {mlst.shape[1]} column(s); expected at least '
        f'{len(fixed_mlst_columns)} ({", ".join(fixed_mlst_columns)})'
    )
mlst.columns = fixed_mlst_columns + [
    f'allele{locus}' for locus in range(1, n_allele_columns + 1)
]

# Reduce the reported file path to a bare genome id: keep only the file name
# and strip the '.fna' extension text from it.
mlst['genome_id'] = mlst['genome_id'].apply(lambda x: os.path.basename(x).replace('.fna', ''))
mlst

# Enrich metadata

For now, the only enrichment is the MLST sequence type. Add other annotations as needed.

In [None]:
# Read the genome metadata table: the first CSV column becomes the index and
# every value is kept as a string.
mash_scrubbed_metadata = pd.read_csv(
    METADATA,
    index_col=0,
    dtype='object',
)

# Quick sanity check: table dimensions first, then the leading rows.
display(mash_scrubbed_metadata.shape)
display(mash_scrubbed_metadata.head())

In [None]:
# Attach the MLST sequence type of every genome to the metadata table.

# Build the genome_id -> sequence-type lookup once, instead of re-indexing the
# whole MLST table for every metadata row.
mlst_by_genome = mlst.set_index('genome_id')['mlst']

# A genome listed more than once makes the lookup ambiguous; fail with a clear
# message rather than an opaque error in the middle of the enrichment.
if not mlst_by_genome.index.is_unique:
    duplicated_ids = (
        mlst_by_genome.index[mlst_by_genome.index.duplicated()].unique().tolist()
    )
    raise ValueError(
        f'{len(duplicated_ids)} genome_id value(s) occur more than once in the '
        f'MLST report, e.g. {duplicated_ids[:10]}'
    )

# Look genomes up by the string form of their id (the MLST ids are strings
# derived from file names).
metadata_genome_ids = mash_scrubbed_metadata['genome_id'].astype(str)

# Every metadata genome must be present in the MLST report. Report all missing
# ids at once instead of stopping at the first failed lookup.
missing_ids = (
    metadata_genome_ids[~metadata_genome_ids.isin(mlst_by_genome.index)]
    .unique()
    .tolist()
)
if missing_ids:
    raise KeyError(
        f'{len(missing_ids)} genome_id value(s) from the metadata are absent '
        f'from the MLST report, e.g. {missing_ids[:10]}'
    )

sequence_types = metadata_genome_ids.map(mlst_by_genome)

# if non-exact mlst allele match ('-' in the report), set to -1
mash_scrubbed_metadata['mlst'] = sequence_types.where(sequence_types != '-', -1)

mash_scrubbed_metadata.head()

In [None]:
# Display the metadata table as it currently stands, for a visual check
# before it is written to disk.
mash_scrubbed_metadata

In [None]:
# Persist the enriched metadata table to the temp folder (index included).
enriched_metadata_path = os.path.join(temp_folder, '2d_enriched_metadata.csv')
mash_scrubbed_metadata.to_csv(enriched_metadata_path)