# Import statements

In [21]:
import shutil
from pathlib import Path
from helpers import *

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd

In [11]:
""" ============================================ DO NOT EDIT BELOW ============================================ """
""" environment variable extraction """
# One-time capture of the kernel's starting state. Referencing the bare name
# `VARS_SET` raises NameError the first time this cell runs in a kernel, which
# sends execution into the `except` branch; later runs skip it.
try:
    VARS_SET
except NameError:
    VARS_SET = True
    # Working directory of the kernel, as reported by the `%pwd` magic.
    _cwd = %pwd
    # NOTE(review): `_cwd` / `_parent_cwd` are not used anywhere in the visible cells.
    _parent_cwd = Path(_cwd).parent
    # Remember the PATH the kernel started with.
    _start_path = %env PATH
        
# Prepend /usr/bin to the *captured* PATH. Because `_start_path` is only read
# once, re-running this cell does not keep growing PATH.
# Keep comments off the magic lines themselves: trailing text would become part
# of the magic's arguments.
%env PATH=/usr/bin:$_start_path

env: PATH=/usr/bin:/home/youn/mambaforge/envs/chronostrain2/bin


# Important global vars

In [36]:
# Directory that receives the per-gene FASTA files and the marker index file.
# Resolved two levels above the current working directory.
gene_basedir = Path().resolve().parent.parent / "data" / "chronostrain_seeds" / "Staphylococcus_aureus"
# Free-text description that goes into the metadata of the MLST FASTA records.
refseq_metadata_description = "Staphylococcus aureus CP000253.1 NCTC 8325"
# Prefix of the MLST record IDs; also used as a sub-directory name for primer-search scratch files.
mlst_id_prefix = "S_Aureus"


# Reference genome the MLST primers are searched against (relative to the working directory).
reference_genome_fasta = Path() / "S_Aureus_Reference.fasta"
# Fail early, and say which path was looked up, instead of a bare AssertionError.
assert reference_genome_fasta.exists(), (
    f"Reference genome FASTA not found: {reference_genome_fasta.resolve()}"
)

# Extract MLST typing genes.

## Staphylococcus aureus MLST primers
https://pubmlst.org/organisms/staphylococcus-aureus/primers

The S. aureus MLST scheme uses internal fragments of the following seven house-keeping genes:

    arcC (Carbamate kinase)
    aroE (Shikimate dehydrogenase)
    glpF (Glycerol kinase)
    gmk (Guanylate kinase)
    pta (Phosphate acetyltransferase)
    tpi (Triosephosphate isomerase)
    yqi (Acetyl coenzyme A acetyltransferase)

PCR/Sequencing primers

    arc up - 5' TTGATTCACCAGCGCGTATTGTC -3'
    arc dn - 5' AGGTATCTGCTTCAATCAGCG -3'
    aro up - 5' ATCGGAAATCCTATTTCACATTC -3'
    aro dn - 5' GGTGTTGTATTAATAACGATATC -3'
    glp up - 5' CTAGGAACTGCAATCTTAATCC -3'
    glp dn - 5' TGGTAAAATCGCATGTCCAATTC -3'
    gmk up - 5' ATCGTTTTATCGGGACCATC -3'
    gmk dn - 5' TCATTAACTACAACGTAATCGTA -3'
    pta up - 5' GTTAAAATCGTATTACCTGAAGG -3'
    pta dn - 5' GACCCTTTTGTTGAAAAGCTTAA -3'
    tpi up - 5' TCGTTCATTCTGAACGTCGTGAA -3'
    tpi dn - 5' TTTGCACCTTCTAACAATTGTAC -3'
    yqi up - 5' CAGCATACAGGACACCTATTGGC -3'
    yqi dn - 5' CGTTGAGGAATCGATACTGGAAC -3'

## Code

In [2]:
# Sanity check: print the version of the `primersearch` executable found on PATH.
# NOTE(review): presumably this is the tool `get_primerhit_as_gene` shells out to — confirm in helpers.
!primersearch --version

EMBOSS:6.6.0.0


In [5]:
def perform_primer_search(gene_name: str, forward_primer: str, rev_primer: str, tmp_basedir: Path):
    """
    Locate one MLST locus on the reference genome from its primer pair.

    A scratch directory `<tmp_basedir>/<mlst_id_prefix>/<gene_name>` is created
    (including missing parents) and handed to `get_primerhit_as_gene`, which
    searches `reference_genome_fasta` with a 5% primer mismatch allowance.

    :param gene_name: name of the locus; passed on as the cluster name.
    :param forward_primer: forward ("up") primer sequence.
    :param rev_primer: reverse ("dn") primer sequence.
    :param tmp_basedir: root directory for scratch files; not cleaned up here.
    :return: whatever `get_primerhit_as_gene` returns for this primer pair.
    """
    scratch_dir = tmp_basedir / mlst_id_prefix / gene_name
    scratch_dir.mkdir(parents=True, exist_ok=True)

    search_kwargs = dict(
        chrom_path=reference_genome_fasta,
        cluster_name=gene_name,
        primer1=forward_primer,
        primer2=rev_primer,
        mismatch_pct=5,
        tmp_dir=scratch_dir,
    )
    return get_primerhit_as_gene(**search_kwargs)


# ====== MLST loci: locate each typing fragment on the reference genome via its primer pair.
# One row per locus: (gene name, forward/"up" primer, reverse/"dn" primer).
MLST_PRIMERS = [
    ("arcC", "TTGATTCACCAGCGCGTATTGTC", "AGGTATCTGCTTCAATCAGCG"),
    ("aroE", "ATCGGAAATCCTATTTCACATTC", "GGTGTTGTATTAATAACGATATC"),
    ("glpF", "CTAGGAACTGCAATCTTAATCC", "TGGTAAAATCGCATGTCCAATTC"),
    ("gmk", "ATCGTTTTATCGGGACCATC", "TCATTAACTACAACGTAATCGTA"),
    ("pta", "GTTAAAATCGTATTACCTGAAGG", "GACCCTTTTGTTGAAAAGCTTAA"),
    ("tpi", "TCGTTCATTCTGAACGTCGTGAA", "TTTGCACCTTCTAACAATTGTAC"),
    ("yqi", "CAGCATACAGGACACCTATTGGC", "CGTTGAGGAATCGATACTGGAAC"),
]

tmp_basedir = Path() / "__tmp"
mlst_genes = []
try:
    for gene_name, forward_primer, rev_primer in MLST_PRIMERS:
        mlst_genes.append(perform_primer_search(gene_name, forward_primer, rev_primer, tmp_basedir))
finally:
    # Remove the scratch directory even when a search fails, so a re-run starts clean.
    if tmp_basedir.exists():
        shutil.rmtree(tmp_basedir)

In [33]:
# Write each individual MLST gene to its own single-record FASTA file and collect one index row per gene.
# open() does not create missing directories, so make sure the output directory exists first.
gene_basedir.mkdir(parents=True, exist_ok=True)

mlst_index_rows = []
for mlst_gene in mlst_genes:
    record = SeqRecord(
        seq=mlst_gene.seq,
        id=f"{mlst_id_prefix}_{mlst_gene.name}",
        description=f"{refseq_metadata_description} Reference: imputed using MLST primer"
    )

    filename = f'{mlst_gene.name}.fasta'
    with open(gene_basedir / filename, 'wt') as f:
        SeqIO.write([record], f, 'fasta')

    mlst_index_rows.append({'Name': f'{mlst_gene.name}', 'Fasta': filename, 'Metadata': 'MLST'})

# Build the frame from a separately named list so re-running the cell never sees a half-converted
# `mlst_index`; explicit columns keep the schema stable even when no gene was found.
mlst_index = pd.DataFrame(mlst_index_rows, columns=['Name', 'Fasta', 'Metadata'])

In [26]:
# Display the MLST marker index (one row per gene: name, FASTA file name, metadata tag).
mlst_index

Unnamed: 0,Name,Fasta,Metadata
0,arcC,arcC.fasta,MLST
1,aroE,aroE.fasta,MLST
2,glpF,glpF.fasta,MLST
3,gmk,gmk.fasta,MLST
4,pta,pta.fasta,MLST
5,tpi,tpi.fasta,MLST
6,yqi,yqi.fasta,MLST


# Extract MetaPhlAn markers.

In [34]:
# Taxon whose marker seeds are requested from the MetaPhlAn database.
taxon_label = 's__Staphylococcus_aureus'
# Database release; used both as the directory name and as the pickle's file stem.
metaphlan_version_id = "mpa_vJun23_CHOCOPhlAnSGB_202307"
# NOTE(review): machine-specific absolute location — adjust to wherever the database lives locally.
metaphlan_pkl_path = Path("/mnt/e/metaphlan_databases") / metaphlan_version_id / f"{metaphlan_version_id}.pkl"

In [35]:
def extract_from_metaphlan(input_metaphlan_pkl: Path):
    """
    Collect the marker seed records that the MetaPhlAn database holds for `taxon_label`.

    The name and sequence length of every marker is printed as it is encountered.

    :param input_metaphlan_pkl: path handed to `MetaphlanParser`.
    :return: list of the records yielded by `retrieve_marker_seeds`, in iteration order.
    """
    marker_seeds = MetaphlanParser(input_metaphlan_pkl).retrieve_marker_seeds(taxon_label)

    seed_records = []
    for marker_name, record in marker_seeds:
        print(f"Found marker `{marker_name}` (length {len(record.seq)})")
        seed_records.append(record)
    return seed_records

In [13]:
# Pull the marker seed records for `taxon_label` out of the MetaPhlAn database pickle.
metaphlan_genes = extract_from_metaphlan(metaphlan_pkl_path)

In [27]:
# Write each individual MetaPhlAn gene to its own single-record FASTA file and collect one index row per marker.
# open() does not create missing directories, so make sure the output directory exists first.
gene_basedir.mkdir(parents=True, exist_ok=True)

metaphlan_index_rows = []
for metaphlan_gene in metaphlan_genes:
    record = SeqRecord(
        seq=metaphlan_gene.seq,
        id=metaphlan_gene.id,
        # These records are MetaPhlAn markers, not MLST loci; label them accordingly.
        description=f"MetaPhlAn {taxon_label} marker gene version {metaphlan_version_id}"
    )

    # The marker ID is expected to hold exactly three `|`-separated fields; anything else raises ValueError here.
    uniref_id, middle_tag, sgb_id = metaphlan_gene.id.split("|")
    # Turn this into something posix-friendly.
    marker_name = f'{uniref_id}-{middle_tag}-{sgb_id}'
    filename = f'{marker_name}.fasta'
    with open(gene_basedir / filename, 'wt') as f:
        SeqIO.write([record], f, 'fasta')

    metaphlan_index_rows.append({'Name': marker_name, 'Fasta': filename, 'Metadata': f'MetaPhlAn {metaphlan_version_id}'})

# Build the frame from a separately named list; explicit columns keep the schema stable
# even when no marker was found.
metaphlan_index = pd.DataFrame(metaphlan_index_rows, columns=['Name', 'Fasta', 'Metadata'])

In [28]:
# Display the MetaPhlAn marker index (one row per marker: name, FASTA file name, metadata tag).
metaphlan_index

Unnamed: 0,Name,Fasta,Metadata
0,UniRef90_W8TVH0-2__6-SGB7852,UniRef90_W8TVH0-2__6-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
1,UniRef90_W8TSN1-1__3-SGB7852,UniRef90_W8TSN1-1__3-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
2,UniRef90_Q5HHT2-2__6-SGB7852,UniRef90_Q5HHT2-2__6-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
3,UniRef90_D7URR4-1__3-SGB7852,UniRef90_D7URR4-1__3-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
4,UniRef90_A0A0H3JRH2-1__3-SGB7852,UniRef90_A0A0H3JRH2-1__3-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
...,...,...,...
195,UniRef90_A0A0U1MN04-1__4-SGB7852,UniRef90_A0A0U1MN04-1__4-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
196,UniRef90_A0A2K4AGG7-1__4-SGB7852,UniRef90_A0A2K4AGG7-1__4-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
197,UniRef90_A0A2Y1JNL3-3__6-SGB7852,UniRef90_A0A2Y1JNL3-3__6-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307
198,UniRef90_A0A0E1X8G4-2__6-SGB7852,UniRef90_A0A0E1X8G4-2__6-SGB7852.fasta,MetaPhlAn mpa_vJun23_CHOCOPhlAnSGB_202307


# Write the index files.

In [31]:
# Stack the MLST rows on top of the MetaPhlAn rows and save them as a tab-separated
# index file, written without a header row and without the index column.
marker_index_path = gene_basedir / "marker_seed_index.tsv"
concat_index = pd.concat([mlst_index, metaphlan_index])
concat_index.to_csv(marker_index_path, sep='\t', index=False, header=False)