# Import statements

In [1]:
from pathlib import Path
from typing import Optional
from helpers import *

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd

In [2]:
""" ============================================ DO NOT EDIT BELOW ============================================ """
""" environment variable extraction """
# One-time capture of the starting environment.
# `VARS_SET` is used as a sentinel: referencing it raises NameError only the
# first time this cell runs in a kernel, so the body of the `except` branch
# executes once and re-running the cell does not re-read PATH.
try:
    VARS_SET
except NameError:
    VARS_SET = True
    # Working directory of the kernel (IPython `%pwd` magic) and its parent.
    # NOTE(review): `_cwd` / `_parent_cwd` are not referenced by the cells visible here.
    _cwd = %pwd
    _parent_cwd = Path(_cwd).parent
    # Snapshot of PATH as it was before this notebook modified it.
    _start_path = %env PATH
        
# Prepend /usr/bin to the *original* PATH. Because `_start_path` is captured
# only once above, re-running this cell sets the same value every time rather
# than prepending /usr/bin again.
%env PATH=/usr/bin:$_start_path

env: PATH=/usr/bin:/home/youn/mambaforge/envs/chronostrain2/bin


# Important global vars

**IMPORTANT:** edit these values to match your organism and directory layout before running the rest of the notebook.

In [3]:
# --- User configuration: edit these for the target organism ---

# Directory that receives the per-gene FASTA files and the marker index file.
# Built from the current working directory: two levels up, then data/chronostrain_seeds/<species>.
gene_basedir = Path().resolve().parent.parent / "data" / "chronostrain_seeds" / "Enterococcus_faecium"
# Free-text description intended for the metadata of the FASTA records.
refseq_metadata_description = "Enterococcus faecium SRR24"
# Prefix for MLST identifiers.
# NOTE(review): not referenced by the cells visible in this notebook — confirm it is still needed.
mlst_id_prefix = "E_Faecium"


# Reference genome FASTA, looked up relative to the current working directory.
reference_genome_fasta = Path() / "E_Faecium_Reference.fasta"
# Fail early and report the resolved path: the lookup depends on the working
# directory, so a bare assertion gives no clue about where the file was expected.
assert reference_genome_fasta.exists(), (
    f"Reference genome FASTA not found: {reference_genome_fasta.resolve()}"
)

# Extract MLST typing genes.

## Enterococcus faecium MLST genes

To save time and avoid ambiguity (the MLST website lists hundreds of loci, but the official XML scheme lists only 7 genes), we'll use a FASTA file pre-generated from the official MLST locus scheme file.

In [7]:
# Split the combined MLST scheme FASTA into one FASTA file per gene and build
# the matching index table (one row per gene: name, file name, metadata tag).
mlst_fasta = Path() / "E_Faecium_Schema_MLST.fasta"

# The per-gene files are written into `gene_basedir`; create it (and any
# missing parents) so the first run on a fresh checkout does not fail with
# FileNotFoundError.
gene_basedir.mkdir(parents=True, exist_ok=True)

# Collect rows in a plain list and build the DataFrame once at the end; a
# separate name keeps `mlst_index` from being a list in one line and a
# DataFrame in the next.
mlst_index_rows = []
for record in SeqIO.parse(mlst_fasta, "fasta"):
    # The record id doubles as the gene name and as the file stem.
    filename = f'{record.id}.fasta'
    with open(gene_basedir / filename, 'wt') as f:
        SeqIO.write([record], f, 'fasta')

    mlst_index_rows.append({'Name': record.id, 'Fasta': filename, 'Metadata': 'MLST'})

# Explicit columns keep the schema stable even if the input FASTA had no records.
mlst_index = pd.DataFrame(mlst_index_rows, columns=['Name', 'Fasta', 'Metadata'])

In [8]:
# Render the MLST index table (columns: Name, Fasta, Metadata) for a visual check.
display(mlst_index)

Unnamed: 0,Name,Fasta,Metadata
0,adk,adk.fasta,MLST
1,atpA,atpA.fasta,MLST
2,ddl,ddl.fasta,MLST
3,gdh,gdh.fasta,MLST
4,gyd,gyd.fasta,MLST
5,pstS,pstS.fasta,MLST
6,purK,purK.fasta,MLST


# Extract MetaPhlAn markers.

In [9]:
# MetaPhlAn database release the markers are taken from.
metaphlan_version_id = "mpa_vJun23_CHOCOPhlAnSGB_202307"
# Taxon label passed to the marker lookup.
taxon_label = 's__Enterococcus_faecium'
# Database pickle location: <database root>/<version id>/<version id>.pkl
# NOTE(review): absolute, machine-specific path — adjust for your environment.
metaphlan_pkl_path = Path(f"/mnt/e/metaphlan_databases/{metaphlan_version_id}/{metaphlan_version_id}.pkl")

In [10]:
def extract_from_metaphlan(input_metaphlan_pkl: Path, taxon: Optional[str] = None):
    """Collect the marker seed records of one taxon from a MetaPhlAn database.

    Prints one "Found marker" line (name and sequence length) per marker.

    Args:
        input_metaphlan_pkl: Path to the MetaPhlAn database pickle; handed to
            `MetaphlanParser` unchanged.
        taxon: Taxon label to look up. When omitted (None), the notebook-level
            `taxon_label` is used, which matches the previous behaviour of this
            function.

    Returns:
        A list with the record of every marker yielded by
        `MetaphlanParser.retrieve_marker_seeds` for the taxon, in yield order.
    """
    if taxon is None:
        # Backward-compatible default: fall back to the global configured in
        # the notebook, resolved at call time so later edits to it are honoured.
        taxon = taxon_label

    parser = MetaphlanParser(input_metaphlan_pkl)

    # Extract reference seqs
    metaphlan_gene_records = []
    for marker_name, record in parser.retrieve_marker_seeds(taxon):
        marker_len = len(record.seq)
        print(f"Found marker `{marker_name}` (length {marker_len})")
        metaphlan_gene_records.append(record)
    return metaphlan_gene_records

In [11]:
# Pull the marker records from the configured MetaPhlAn pickle.
# The call prints one "Found marker" line per marker it collects.
metaphlan_genes = extract_from_metaphlan(metaphlan_pkl_path)

Searching for marker seeds from MetaPhlAn database: mpa_vJun23_CHOCOPhlAnSGB_202307.
Target # of markers: 166
Found marker `UniRef90_S4DR35|1__4|SGB7967` (length 550)
Found marker `UniRef90_A0A132PAA3|1__3|SGB7967` (length 450)
Found marker `UniRef90_A0A132Z5H9|1__7|SGB7967` (length 1050)
Found marker `UniRef90_A0A132Z370|8__13|SGB7967` (length 800)
Found marker `UniRef90_A0A449FVC2|3__6|SGB7967` (length 500)
Found marker `UniRef90_A0A132P5M5|1__5|SGB7967` (length 700)
Found marker `UniRef90_A0A132Z5C0|1__5|SGB7967` (length 700)
Found marker `UniRef90_A0A133MT97|3__6|SGB7967` (length 500)
Found marker `UniRef90_A0A132Z349|1__4|SGB7967` (length 550)
Found marker `UniRef90_A0A449F934|1__7|SGB7967` (length 1050)
Found marker `UniRef90_A0A286V0J6|2__5|SGB7967` (length 500)
Found marker `UniRef90_A0A133CH55|1__3|SGB7967` (length 450)
Found marker `UniRef90_S4F8R3|1__12|SGB7967` (length 1750)
Found marker `UniRef90_UPI0008A587DC|4__7|SGB7967` (length 500)
Found marker `UniRef90_A0A132Z1X2|1_

In [15]:
# Write each individual MetaPhlAn gene to fasta and build the matching index
# table (one row per marker: name, file name, metadata tag).

# The marker files are written into `gene_basedir`; create it (and any missing
# parents) so this cell does not fail with FileNotFoundError on a fresh checkout.
gene_basedir.mkdir(parents=True, exist_ok=True)

# Rows are collected in a plain list and converted once at the end, so
# `metaphlan_index` is only ever a DataFrame.
metaphlan_index_rows = []
for metaphlan_gene in metaphlan_genes:
    record = SeqRecord(
        seq=metaphlan_gene.seq,
        id=metaphlan_gene.id,
        # These records are MetaPhlAn markers; the description previously
        # labelled them "MLST", contradicting the `MetaPhlAn:` tag in the index.
        description=f"MetaPhlAn {taxon_label} marker gene version {metaphlan_version_id}"
    )

    # Marker ids have three "|"-separated parts; the unpacking raises
    # ValueError if an id does not follow that layout.
    uniref_id, middle_tag, sgb_id = metaphlan_gene.id.split("|")
    # Turn this into something posix-friendly ("|" is awkward in file names).
    marker_name = f'{uniref_id}-{middle_tag}-{sgb_id}'
    filename = f'{marker_name}.fasta'
    with open(gene_basedir / filename, 'wt') as f:
        SeqIO.write([record], f, 'fasta')

    metaphlan_index_rows.append({'Name': marker_name, 'Fasta': filename, 'Metadata': f'MetaPhlAn:{metaphlan_version_id}'})

# Explicit columns keep the schema stable even if no markers were found.
metaphlan_index = pd.DataFrame(metaphlan_index_rows, columns=['Name', 'Fasta', 'Metadata'])

In [16]:
# Bare last expression: shows the MetaPhlAn index frame as this cell's output.
metaphlan_index

Unnamed: 0,Name,Fasta,Metadata
0,UniRef90_S4DR35-1__4-SGB7967,UniRef90_S4DR35-1__4-SGB7967.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
1,UniRef90_A0A132PAA3-1__3-SGB7967,UniRef90_A0A132PAA3-1__3-SGB7967.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
2,UniRef90_A0A132Z5H9-1__7-SGB7967,UniRef90_A0A132Z5H9-1__7-SGB7967.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
3,UniRef90_A0A132Z370-8__13-SGB7967,UniRef90_A0A132Z370-8__13-SGB7967.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
4,UniRef90_A0A449FVC2-3__6-SGB7967,UniRef90_A0A449FVC2-3__6-SGB7967.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
...,...,...,...
161,UniRef90_A0A133N557-1__7-SGB7968,UniRef90_A0A133N557-1__7-SGB7968.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
162,UniRef90_J8ZGM0-2__9-SGB7968,UniRef90_J8ZGM0-2__9-SGB7968.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
163,UniRef90_A0A133MTM1-4__7-SGB7968,UniRef90_A0A133MTM1-4__7-SGB7968.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307
164,UniRef90_A0A2A7SXH8-1__5-SGB7968,UniRef90_A0A2A7SXH8-1__5-SGB7968.fasta,MetaPhlAn:mpa_vJun23_CHOCOPhlAnSGB_202307


# Write the index files.

In [17]:
# Stack the two index tables, MLST rows first, then the MetaPhlAn rows.
concat_index = pd.concat([mlst_index, metaphlan_index])

# Persist the combined index inside the gene directory as a tab-separated
# file with neither a header line nor the DataFrame index column.
marker_index_path = gene_basedir / "marker_seed_index.tsv"
concat_index.to_csv(marker_index_path, sep='\t', index=False, header=False)