# Import statements

In [2]:
import os
from pathlib import Path
from helpers import *

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd

In [3]:
""" ============================================ DO NOT EDIT BELOW ============================================ """
""" environment variable extraction """
try:
    VARS_SET
except NameError:
    VARS_SET = True
    _cwd = %pwd
    _parent_cwd = Path(_cwd).parent
    _start_path = %env PATH
        
%env PATH=/usr/bin:$_start_path

env: PATH=/usr/bin:/home/youn/mambaforge/envs/chronostrain2/bin


# Important global vars

In [4]:
# Directory that receives the per-gene FASTA files and the marker index file.
# It is resolved relative to the current working directory (two levels up, then data/...).
gene_basedir = Path().resolve().parent.parent / "data" / "chronostrain_seeds" / "Klebsiella_pneumoniae"
refseq_metadata_description = "Klebsiella pneumoniae HS11286"  # This goes into the metadata of the FASTA records.
mlst_id_prefix = "K_pneumoniae"


# Reference genome, expected in the current working directory. Fail early, with a
# readable message, if the notebook is being run from the wrong place.
reference_genome_fasta = Path() / "K_Pneumoniae_Reference.fasta"
assert reference_genome_fasta.exists(), (
    f"Reference genome FASTA not found: {reference_genome_fasta.resolve()}"
)

# Later cells open files inside gene_basedir for writing; create the directory
# (after the sanity check above) so a fresh checkout does not fail with FileNotFoundError.
gene_basedir.mkdir(parents=True, exist_ok=True)

# Extract MLST typing genes.

As of September 9, 2024, the MLST scheme used for K. pneumoniae is the Pasteur scheme (see https://pubmlst.org/bigsdb?db=pubmlst_mlst_seqdef).

In [7]:
# Split the multi-record MLST FASTA into one FASTA file per record inside
# gene_basedir, collecting one index row (Name / Fasta / Metadata) per record.
mlst_fasta = Path() / "K_Pneumoniae_Reference_MLST.fasta"

mlst_rows = []
for record in SeqIO.parse(mlst_fasta, "fasta"):
    locus_name = f'{record.id}'
    filename = f'{locus_name}.fasta'

    # One single-record FASTA file per MLST gene.
    with (gene_basedir / filename).open('wt') as f:
        SeqIO.write([record], f, 'fasta')

    mlst_rows.append(dict(Name=locus_name, Fasta=filename, Metadata='MLST'))

mlst_index = pd.DataFrame(mlst_rows)

In [8]:
# Display the MLST index table (columns: Name, Fasta, Metadata) as the cell output.
mlst_index

Unnamed: 0,Name,Fasta,Metadata
0,gapA,gapA.fasta,MLST
1,infB,infB.fasta,MLST
2,mdh,mdh.fasta,MLST
3,pgi,pgi.fasta,MLST
4,phoE,phoE.fasta,MLST
5,rpoB,rpoB.fasta,MLST
6,tonB,tonB.fasta,MLST


# Extract MetaPhlAn markers.

In [9]:
# Species-level taxon label whose markers are pulled from the MetaPhlAn database.
taxon_label = 's__Klebsiella_pneumoniae'
metaphlan_version_id = "mpa_vJun23_CHOCOPhlAnSGB_202307"

# Root folder holding the MetaPhlAn databases. Defaults to the original location;
# set the METAPHLAN_DB_DIR environment variable to point elsewhere without editing
# the notebook.
metaphlan_db_root = Path(os.environ.get("METAPHLAN_DB_DIR", "/mnt/e/metaphlan_databases"))
metaphlan_pkl_path = metaphlan_db_root / metaphlan_version_id / f"{metaphlan_version_id}.pkl"

In [10]:
def extract_from_metaphlan(input_metaphlan_pkl: Path, taxon: "str | None" = None) -> list:
    """Collect the marker seed records of one taxon from a MetaPhlAn database.

    Args:
        input_metaphlan_pkl: Path to the MetaPhlAn ``.pkl`` file, handed to
            ``MetaphlanParser``.
        taxon: Taxon label passed to ``retrieve_marker_seeds``. When omitted, the
            notebook-level ``taxon_label`` is used, which is what the function
            always did before this parameter existed.

    Returns:
        A list with one record per marker, in the order the parser yields them.
        One progress line is printed per marker.
    """
    if taxon is None:
        # Backward-compatible default: fall back to the notebook's global setting.
        taxon = taxon_label

    parser = MetaphlanParser(input_metaphlan_pkl)

    # Extract reference seqs
    metaphlan_gene_records = []
    for marker_name, record in parser.retrieve_marker_seeds(taxon):
        marker_len = len(record.seq)
        print(f"Found marker `{marker_name}` (length {marker_len})")
        metaphlan_gene_records.append(record)
    return metaphlan_gene_records

In [11]:
# Pull the marker seed records for `taxon_label` out of the MetaPhlAn database;
# the call prints one "Found marker ..." line per record.
metaphlan_genes = extract_from_metaphlan(metaphlan_pkl_path)

Searching for marker seeds from MetaPhlAn database: mpa_vJun23_CHOCOPhlAnSGB_202307.
Target # of markers: 31
Found marker `UniRef90_A0A0E1C8Z8|1__3|SGB10115` (length 450)
Found marker `UniRef90_W9BMW7|1__3|SGB10115` (length 450)
Found marker `UniRef90_A0A1C1SAV0|1__7|SGB10115` (length 1050)
Found marker `UniRef90_W9B8W5|1__4|SGB10115` (length 600)
Found marker `UniRef90_A0A0C7KAK3|7__11|SGB10115` (length 700)
Found marker `UniRef90_A0A0C7KE20|1__5|SGB10115` (length 750)
Found marker `UniRef90_A0A1D3KQ48|1__3|SGB10115` (length 450)
Found marker `UniRef90_A0A0C7KAK3|1__4|SGB10115` (length 550)
Found marker `UniRef90_A0A086IAA7|1__6|SGB10115` (length 900)
Found marker `UniRef90_A0A2X1QC44|1__4|SGB10115` (length 600)
Found marker `UniRef90_A0A1D3U7D5|2__5|SGB10115` (length 550)
Found marker `UniRef90_A0A447WF26|1__5|SGB10115` (length 700)
Found marker `UniRef90_A0A3S5DTC2|5__9|SGB10115` (length 650)
Found marker `UniRef90_A0A0C7KIL4|8__15|SGB10115` (length 1100)
Found marker `UniRef90_A0A2

In [12]:
# Write each individual MetaPhlAn gene to fasta.
metaphlan_rows = []
for metaphlan_gene in metaphlan_genes:
    record = SeqRecord(
        seq=metaphlan_gene.seq,
        id=metaphlan_gene.id,
        # These records are MetaPhlAn markers; the previous "MLST" label was a
        # copy-paste slip from the MLST section and mislabelled the FASTA headers.
        description=f"MetaPhlAn {taxon_label} marker gene version {metaphlan_version_id}"
    )

    # Marker ids have three "|"-separated fields (e.g. `UniRef90_...|1__3|SGB10115`).
    uniref_id, middle_tag, sgb_id = metaphlan_gene.id.split("|")
    # Turn this into something posix-friendly.
    marker_name = f'{uniref_id}-{middle_tag}-{sgb_id}'
    filename = f'{marker_name}.fasta'
    with open(gene_basedir / filename, 'wt') as f:
        SeqIO.write([record], f, 'fasta')

    metaphlan_rows.append({'Name': marker_name, 'Fasta': filename, 'Metadata': f'MetaPhlAn {metaphlan_version_id}'})

metaphlan_index = pd.DataFrame(metaphlan_rows)

In [18]:
# Display the MetaPhlAn marker index as the cell output.
# To inspect a single marker, append e.g.: .loc[metaphlan_index['Name'] == 'UniRef90_W9B8W5-1__4-SGB10115']
metaphlan_index

Unnamed: 0,Name,Fasta,Metadata
0,UniRef90_A0A0E1C8Z8-1__3-SGB10115,UniRef90_A0A0E1C8Z8-1__3-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
1,UniRef90_W9BMW7-1__3-SGB10115,UniRef90_W9BMW7-1__3-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
2,UniRef90_A0A1C1SAV0-1__7-SGB10115,UniRef90_A0A1C1SAV0-1__7-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
3,UniRef90_W9B8W5-1__4-SGB10115,UniRef90_W9B8W5-1__4-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
4,UniRef90_A0A0C7KAK3-7__11-SGB10115,UniRef90_A0A0C7KAK3-7__11-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
5,UniRef90_A0A0C7KE20-1__5-SGB10115,UniRef90_A0A0C7KE20-1__5-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
6,UniRef90_A0A1D3KQ48-1__3-SGB10115,UniRef90_A0A1D3KQ48-1__3-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
7,UniRef90_A0A0C7KAK3-1__4-SGB10115,UniRef90_A0A0C7KAK3-1__4-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
8,UniRef90_A0A086IAA7-1__6-SGB10115,UniRef90_A0A086IAA7-1__6-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
9,UniRef90_A0A2X1QC44-1__4-SGB10115,UniRef90_A0A2X1QC44-1__4-SGB10115.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307


# Write the index files.

In [16]:
# Stack the MLST rows on top of the MetaPhlAn rows and save the combined table as a
# tab-separated file with neither a header line nor the row index.
marker_index_path = gene_basedir / "marker_seed_index.tsv"
concat_index = pd.concat([mlst_index, metaphlan_index])
concat_index.to_csv(
    marker_index_path,
    sep='\t',
    index=False,
    header=False,
)