# Import statements

In [1]:
from pathlib import Path
from helpers import *

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd

In [2]:
""" ============================================ DO NOT EDIT BELOW ============================================ """
""" environment variable extraction """
# Capture the working directory and the original PATH once per kernel session.
# VARS_SET is a sentinel: referencing it raises NameError only the first time this
# cell runs, so re-running the cell does not re-capture a PATH that was already
# modified by the %env line at the bottom.
try:
    VARS_SET
except NameError:
    VARS_SET = True
    # IPython magics: %pwd yields the current working directory and
    # `%env PATH` yields the current value of the PATH environment variable.
    _cwd = %pwd
    _parent_cwd = Path(_cwd).parent
    _start_path = %env PATH
        
# Prepend /usr/bin to the PATH captured above ($_start_path is expanded by IPython).
# Because _start_path is captured only once, re-running never stacks extra prefixes.
%env PATH=/usr/bin:$_start_path

env: PATH=/usr/bin:/home/youn/mambaforge/envs/chronostrain2/bin


# Important global vars

**IMPORTANT:** edit these values before running the rest of the notebook!

In [4]:
# Output directory: receives the per-gene FASTA files and the marker index file.
gene_basedir = Path().resolve().parent.parent.joinpath("data", "chronostrain_seeds", "Escherichia_coli")
# Free-text description that goes into the metadata of the FASTA records.
refseq_metadata_description = "Escherichia coli K12 MG1655"
mlst_id_prefix = "E_Coli"


# Reference genome FASTA, looked up relative to the current working directory.
# The notebook stops here if the file is missing.
reference_genome_fasta = Path("E_Coli_Reference.fasta")
assert reference_genome_fasta.exists()

# Extract MLST typing genes.

As of Sept. 9, 2024, E. coli has two MLST schemes (Achtman and Pasteur). See https://pubmlst.org/bigsdb?db=pubmlst_mlst_seqdef

In [13]:
# Directory holding the previously prepared ("custom") E. coli marker seeds.
paper_marker_dir = Path().resolve().parent.parent.joinpath("data", "chronostrain_seeds", "Escherichia_coli_custom")

# The index TSV has no header row, so its columns get the integer labels 0, 1, 2.
paper_df = pd.read_csv(
    paper_marker_dir / "marker_seed_index.tsv",
    header=None,
    sep='\t',
)

# Keep only the rows whose third column (label 2) starts with "MLST".
paper_mlst_slice = paper_df[paper_df[2].str.startswith("MLST")]
paper_mlst_slice

Unnamed: 0,0,1,2
0,adk,Escherichia_coli_1_adk.fasta,MLST:Escherichia_coli_1
1,fumC,Escherichia_coli_1_fumC.fasta,MLST:Escherichia_coli_1
2,gyrB,Escherichia_coli_1_gyrB.fasta,MLST:Escherichia_coli_1
3,icd,Escherichia_coli_1_icd.fasta,MLST:Escherichia_coli_1
4,mdh,Escherichia_coli_1_mdh.fasta,MLST:Escherichia_coli_1
5,purA,Escherichia_coli_1_purA.fasta,MLST:Escherichia_coli_1
6,recA,Escherichia_coli_1_recA.fasta,MLST:Escherichia_coli_1
7,dinB,Escherichia_coli_2_dinB.fasta,MLST:Escherichia_coli_2
8,icdA,Escherichia_coli_2_icdA.fasta,MLST:Escherichia_coli_2
9,pabB,Escherichia_coli_2_pabB.fasta,MLST:Escherichia_coli_2


In [14]:
import shutil

# shutil.copy does not create missing directories, so make sure the
# destination exists before copying anything into it.
gene_basedir.mkdir(parents=True, exist_ok=True)

# Copy every MLST gene FASTA listed in `paper_mlst_slice` into `gene_basedir`
# under the simplified name "<gene>.fasta", and collect one index row per gene.
# Column 0 of the slice is the gene name, column 1 the source FASTA filename.
mlst_records = []
for _, row in paper_mlst_slice.iterrows():
    gene_name = row[0]
    filename = f'{gene_name}.fasta'
    shutil.copy(paper_marker_dir / row[1], gene_basedir / filename)
    mlst_records.append({'Name': f'{gene_name}', 'Fasta': filename, 'Metadata': 'MLST'})

# Explicit columns keep the frame well-formed even if no MLST rows were found.
mlst_index = pd.DataFrame(mlst_records, columns=['Name', 'Fasta', 'Metadata'])
display(mlst_index)

Unnamed: 0,Name,Fasta,Metadata
0,adk,adk.fasta,MLST
1,fumC,fumC.fasta,MLST
2,gyrB,gyrB.fasta,MLST
3,icd,icd.fasta,MLST
4,mdh,mdh.fasta,MLST
5,purA,purA.fasta,MLST
6,recA,recA.fasta,MLST
7,dinB,dinB.fasta,MLST
8,icdA,icdA.fasta,MLST
9,pabB,pabB.fasta,MLST


# Extract MetaPhlAn markers.

In [17]:
# MetaPhlAn database release to pull markers from, and the species-level taxon to select.
metaphlan_version_id = "mpa_vJun23_CHOCOPhlAnSGB_202307"
taxon_label = 's__Escherichia_coli'
# The database pickle lives at <db root>/<version>/<version>.pkl.
metaphlan_pkl_path = Path("/mnt/e/metaphlan_databases") / metaphlan_version_id / f"{metaphlan_version_id}.pkl"

In [18]:
def extract_from_metaphlan(input_metaphlan_pkl: Path, taxon: "str | None" = None):
    """Collect the marker seed records for one taxon from a MetaPhlAn database.

    Args:
        input_metaphlan_pkl: Path to the MetaPhlAn database pickle, passed
            straight to ``MetaphlanParser``.
        taxon: Taxon label handed to ``retrieve_marker_seeds``. When omitted,
            the notebook-level ``taxon_label`` is used, which matches the
            original single-argument behaviour.

    Returns:
        A list of the records yielded by ``retrieve_marker_seeds``, in the
        order they were yielded. One line per marker is printed as a side effect.
    """
    if taxon is None:
        # Fall back to the global configured in the notebook.
        taxon = taxon_label

    parser = MetaphlanParser(input_metaphlan_pkl)

    # Extract reference seqs
    metaphlan_gene_records = []
    for marker_name, record in parser.retrieve_marker_seeds(taxon):
        print(f"Found marker `{marker_name}` (length {len(record.seq)})")
        metaphlan_gene_records.append(record)
    return metaphlan_gene_records

In [19]:
# Load the marker records for `taxon_label` from the MetaPhlAn database pickle;
# the call prints one "Found marker ..." line per record.
metaphlan_genes = extract_from_metaphlan(metaphlan_pkl_path)

Searching for marker seeds from MetaPhlAn database: mpa_vJun23_CHOCOPhlAnSGB_202307.
Target # of markers: 136
Found marker `UniRef90_P75933|1__4|SGB10068` (length 600)
Found marker `UniRef90_Q0T2M6|4__9|SGB10068` (length 800)
Found marker `UniRef90_P33916|5__8|SGB10068` (length 500)
Found marker `UniRef90_P25718|1__4|SGB10068` (length 550)
Found marker `UniRef90_Q8XAU9|1__3|SGB10068` (length 450)
Found marker `UniRef90_P52614|1__4|SGB10068` (length 550)
Found marker `UniRef90_P25798|4__8|SGB10068` (length 650)
Found marker `UniRef90_P0AFI1|2__6|SGB10068` (length 650)
Found marker `UniRef90_P07017|2__6|SGB10068` (length 650)
Found marker `UniRef90_B7UQY2|3__6|SGB10068` (length 500)
Found marker `UniRef90_P21362|1__3|SGB10068` (length 450)
Found marker `UniRef90_Q47141|1__5|SGB10068` (length 750)
Found marker `UniRef90_P0ABR6|1__9|SGB10068` (length 1350)
Found marker `UniRef90_P76254|1__5|SGB10068` (length 700)
Found marker `UniRef90_Q8XA71|5__8|SGB10068` (length 550)
Found marker `UniRe

In [23]:
# Write each individual MetaPhlAn gene to fasta.

# Make sure the output directory exists before writing into it.
gene_basedir.mkdir(parents=True, exist_ok=True)

metaphlan_records = []
for metaphlan_gene in metaphlan_genes:
    record = SeqRecord(
        seq=metaphlan_gene.seq,
        id=metaphlan_gene.id,
        # These are MetaPhlAn markers, not MLST genes; the description previously said "MLST".
        description=f"MetaPhlAn {taxon_label} marker gene version {metaphlan_version_id}"
    )

    # Marker IDs look like "<uniref>|<tag>|<sgb>"; unpacking raises ValueError
    # if an ID does not have exactly three "|"-separated parts.
    uniref_id, middle_tag, sgb_id = metaphlan_gene.id.split("|")
    # Turn this into something posix-friendly.
    marker_name = f'{uniref_id}-{middle_tag}-{sgb_id}'
    filename = f'{marker_name}.fasta'
    with open(gene_basedir / filename, 'wt') as f:
        SeqIO.write([record], f, 'fasta')

    metaphlan_records.append({'Name': marker_name, 'Fasta': filename, 'Metadata': f'MetaPhlAn:{metaphlan_version_id}'})

# Explicit columns keep the frame well-formed even if no markers were found.
metaphlan_index = pd.DataFrame(metaphlan_records, columns=['Name', 'Fasta', 'Metadata'])

In [24]:
# Display the MetaPhlAn marker index (Name, Fasta, Metadata columns) for a visual check.
metaphlan_index

Unnamed: 0,Name,Fasta,Metadata
0,UniRef90_P75933-1__4-SGB10068,UniRef90_P75933-1__4-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
1,UniRef90_Q0T2M6-4__9-SGB10068,UniRef90_Q0T2M6-4__9-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
2,UniRef90_P33916-5__8-SGB10068,UniRef90_P33916-5__8-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
3,UniRef90_P25718-1__4-SGB10068,UniRef90_P25718-1__4-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
4,UniRef90_Q8XAU9-1__3-SGB10068,UniRef90_Q8XAU9-1__3-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
...,...,...,...
131,UniRef90_P32139-1__6-SGB10068,UniRef90_P32139-1__6-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
132,UniRef90_P30855-6__14-SGB10068,UniRef90_P30855-6__14-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
133,UniRef90_P75908-6__9-SGB10068,UniRef90_P75908-6__9-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
134,UniRef90_P37627-6__9-SGB10068,UniRef90_P37627-6__9-SGB10068.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307


# Write the index files.

In [25]:
# Stack the MLST rows on top of the MetaPhlAn rows, then write the combined
# index as a header-less, index-less TSV next to the gene FASTA files.
concat_index = pd.concat((mlst_index, metaphlan_index))
concat_index.to_csv(
    gene_basedir / "marker_seed_index.tsv",
    sep='\t',
    header=False,
    index=False,
)