# File paths and environment variables

In [19]:
from pathlib import Path
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


""" ============================================ EDIT THESE SETTINGS BASED ON USER'S CHOICE. ============================================ """
""" RefSeq catalog settings"""
# Root of the on-disk layout: every directory constant below (reference genomes, BLAST DB,
# marker seeds, chronostrain files) is built from TARGET_DIR.
TARGET_DIR = Path("/mnt/e/ecoli_db")  # the base directory for everything else.
TARGET_TAXA = "Enterobacteriaceae"  # the taxonomic identifier to download. Can be species, genus or even family.
# TARGET_TAXA, NCBI_REFSEQ_DIR and REFSEQ_INDEX are also exported as environment variables
# by the %env lines at the bottom of this cell.
NCBI_REFSEQ_DIR = TARGET_DIR / "ref_genomes"
REFSEQ_INDEX = NCBI_REFSEQ_DIR / "index.tsv"

""" RefSeq BLAST database """
# Passed to `chronostrain make-db` as `-bd` (directory) and `-b` (database name) in Step 4.
BLAST_DB_DIR = TARGET_DIR / "blast_db"
BLAST_DB_NAME = "Enterobacteriaceae_refseq"  # Blast DB to create.

""" Marker seeds """
# MARKER_SEED_DIR receives the per-gene FASTA files and per-source TSVs written in Step 3;
# MARKER_SEED_INDEX is the combined TSV assembled in Step 3.5.
MARKER_SEED_DIR = TARGET_DIR / "marker_seeds"
MARKER_SEED_INDEX = MARKER_SEED_DIR / "marker_seed_index.tsv"

""" chronostrain-specific settings """
NUM_CORES = 8  # number of cores to use (e.g. for blastn); passed to make-db as --threads.
MIN_PCT_IDTY = 75  # accept BLAST hits as markers above this threshold; passed to make-db as --min-pct-idty.
CHRONOSTRAIN_DB_DIR = TARGET_DIR / "chronostrain_files"  # The directory to use for chronostrain's database files.
CHRONOSTRAIN_TARGET_JSON = CHRONOSTRAIN_DB_DIR / "ecoli.json"  # the desired final product.
# Machine-specific absolute path; it is appended to PATH at the bottom of this cell.
DASHING2_DIR = Path("/home/youn/work/bin")  # Directory that contains the dashing2 executable.

""" MetaPhlAn settings """
# Machine-specific absolute path; handed to extract_metaphlan_markers.py via `-i` in Step 3.4.
METAPHLAN_DB_PATH = Path("/mnt/e/metaphlan_databases/mpa_vJan21_CHOCOPhlAnSGB_202103/mpa_vJan21_CHOCOPhlAnSGB_202103.pkl") # MetaPhlan 3 or newer
# Handed to extract_metaphlan_markers.py via `-t` in Step 3.4.
METAPHLAN_TAXONOMIC_KEY = 's__Escherichia_coli'

""" ============================================ DO NOT EDIT BELOW ============================================ """
""" environment variable extraction """
# Run-once guard: VARS_SET is undefined on a fresh kernel, so the NameError branch executes
# exactly once. Re-running this cell therefore reuses the originally recorded directory and
# PATH instead of capturing the values this cell itself has already modified.
try:
    VARS_SET
except NameError:
    VARS_SET = True
    # Snapshot the kernel's starting directory and the unmodified PATH.
    _cwd = %pwd
    _parent_cwd = Path(_cwd).parent
    _start_path = %env PATH

# Work in parent directory, where all the helper scripts and settings.sh are.
%cd "$_parent_cwd"
# Export the catalog settings so that child processes started with `!` inherit them.
# (Keep comments on their own lines: text after a %env value would become part of the value.)
%env TARGET_TAXA=$TARGET_TAXA
%env NCBI_REFSEQ_DIR=$NCBI_REFSEQ_DIR
%env REFSEQ_INDEX=$REFSEQ_INDEX
# Need basic executables, such as "which" and "basename" (required by primersearch)
# PATH is rebuilt from the snapshot each time: /usr/bin first, dashing2's directory last.
%env PATH=/usr/bin:$_start_path:$DASHING2_DIR

/home/youn/work/chronostrain/examples/database
env: TARGET_TAXA=Enterobacteriaceae
env: NCBI_REFSEQ_DIR=/mnt/e/ecoli_db/ref_genomes
env: REFSEQ_INDEX=/mnt/e/ecoli_db/ref_genomes/index.tsv
env: PATH=/usr/bin:/home/youn/mambaforge/envs/chronostrain/bin:/home/youn/work/bin


In [10]:
# Sanity check: both external executables must resolve on the PATH exported by the config cell.
!primersearch --version
!dashing2 --version

EMBOSS:6.6.0.0
#Calling Dashing2 version v2.1.19 with command '/home/youn/work/chronostrain/examples/database/dashing2 --version'
dashing2 has several subcommands: sketch, cmp, wsketch, and contain.
Usage can be seen in those subcommands. (e.g., `dashing2 sketch -h`)

	sketch: converts FastX into k-mer sets/sketches, and sketches BigWig and BED files; also contains functionality from cmp, for one-step sketch and comparisons
This is probably the most common subcommand to use.

	cmp: compares previously sketched/decomposed k-mer sets and emits results. alias: dist

	contain: Takes a k-mer database (built with dashing2 sketch --save-kmers), then computes coverage for all k-mer references using input streams.
	wsketch: Takes a tuple of [1-3] input binary files [(u32 or u64), (float or double), (u32 or u64)] and performs weighted minhash sketching.
Three files are treated as Compressed Sparse Row (CSR)-format, where the third file contains indptr values, specifying the lengths of consecutiv

# Recipe starts here.

In [2]:
# Make sure every working directory exists before any step writes into it.
# Existing directories are left untouched; missing parents are created as needed.
for _workdir in (TARGET_DIR, BLAST_DB_DIR, NCBI_REFSEQ_DIR, MARKER_SEED_DIR):
    _workdir.mkdir(parents=True, exist_ok=True)

### Step 1: Download RefSeq catalog.

In [3]:
# Runs the download helper from the working directory selected in the config cell.
# NOTE(review): presumably the script reads TARGET_TAXA / NCBI_REFSEQ_DIR / REFSEQ_INDEX from the
# environment exported there — confirm against the script. The recorded run below was interrupted.
!bash download_ncbi2.sh

^C

Aborted!


### Step 2: Build Blast database.

In [None]:
# Builds the BLAST database that Step 4 later passes to `chronostrain make-db`.
# NOTE(review): BLAST_DB_DIR / BLAST_DB_NAME are not exported via %env in the config cell;
# presumably the script gets them from settings.sh — confirm they match the Python values.
!bash create_blast_db.sh

### Step 3: Build the marker seed catalog.

**GOAL**: a FASTA file of marker seeds (one multi-fasta file per marker gene), and a single TSV file that catalogs them.

However, to get there, we need to take a few steps...

#### Step 3.1: Download MLST schema.

In [4]:
# Fetches the MLST scheme loci for the given taxon. The `-o` file (mlst_seeds.tsv) is one of the
# three TSVs concatenated into the marker seed index in Step 3.5.
# NOTE(review): `-w` looks like a scratch/work directory for the downloaded alleles — confirm in the helper.
!python python_helpers/mlst_download.py -t "Escherichia coli" -w "$TARGET_DIR"/mlst_schema -o "$MARKER_SEED_DIR"/mlst_seeds.tsv

Downloading marker seeds from MLST schema.
Targeting 1 taxa using MLST scheme.
Fetching URL resource https://pubmlst.org/static/data/dbases.xml
Got a response of size 152.35 KB.
Schema type id: 1
Handling locus adk
Fetching URL resource https://rest.pubmlst.org/db/pubmlst_ecoli_achtman_seqdef/loci/adk/alleles_fasta
Got a response of size 652.95 KB.
Handling locus fumC
Fetching URL resource https://rest.pubmlst.org/db/pubmlst_ecoli_achtman_seqdef/loci/fumC/alleles_fasta
Got a response of size 931.99 KB.
Handling locus gyrB
Fetching URL resource https://rest.pubmlst.org/db/pubmlst_ecoli_achtman_seqdef/loci/gyrB/alleles_fasta
Got a response of size 550.44 KB.
Handling locus icd
Fetching URL resource https://rest.pubmlst.org/db/pubmlst_ecoli_achtman_seqdef/loci/icd/alleles_fasta
Got a response of size 835.36 KB.
Handling locus mdh
Fetching URL resource https://rest.pubmlst.org/db/pubmlst_ecoli_achtman_seqdef/loci/mdh/alleles_fasta
Got a response of size 567.82 KB.
Handling locus purA
Fetch

#### Step 3.2: Non-standard genes: run GFF-annotated gene search with primers across entire catalog.

In [7]:
"""
Search #1:
-------------
Search for O-antigen gene cluster.
JumpSTART 5'--3' primer = CATGGTAGCTGTAAAGCCAGGGGCGGTAGCGTG
GND 5'--3' primer = CATGCTGCCATACCGACGACGCCGATCTGTTGCTTKGACA
"""

# Note: Here, we use GFF annotation to extract the specific genes from the cluster.
# The two primers passed as -p1/-p2 are the JumpSTART and GND primers quoted above.
# The resulting O_antigen_cluster.feather is loaded again in Step 3.3.
!python python_helpers/identify_gene_cluster_primersearch.py \
    -i "$REFSEQ_INDEX" \
    -t "$TARGET_DIR"/_tmp \
    -o "$MARKER_SEED_DIR"/O_antigen_cluster.feather \
    -g "Escherichia" -s "coli" \
    -p1 "CATGGTAGCTGTAAAGCCAGGGGCGGTAGCGTG" \
    -p2 "CATGCTGCCATACCGACGACGCCGATCTGTTGCTTKGACA" \
    -n "O_antigen_cluster" \
    --use-gff

Performing primer-based search for O_antigen_cluster in Escherichia coli. (FWD=CATGGTAGCTGTAAAGCCAGGGGCGGTAGCGTG, REV=CATGCTGCCATACCGACGACGCCGATCTGTTGCTTKGACA, len approx. None)
Will use GFF to extract annotated genes.
100%|███████████████████████████████████| 2063/2063 [21:37<00:00,  1.59genome/s]
Wrote 23983 dataframe records to O_antigen_cluster.feather


In [9]:
"""
Search #2:
------------
fim genes: fim*
H antigen: fliC, flk* fll* flm*
Shigatoxin: stx*
"""
# One `-p` per gene-name pattern listed above; the resulting misc_genes.feather is loaded in Step 3.3.
# NOTE(review): `-p` presumably matches gene names by prefix (the docstring writes them as `fim*`) — confirm
# in the helper. In the recorded run, the raw counts shown in Step 3.3 contain no `fll`/`flm` genes.
!python python_helpers/extract_genes_by_name.py \
    -i "$REFSEQ_INDEX" \
    -o "$MARKER_SEED_DIR"/misc_genes.feather \
    -g "Escherichia" -s "coli" \
    -p "fim" -p "fliC" -p "flk" -p "fll" -p "flm" -p "stx"

100%|███████████████████████████████████████| 2063/2063 [08:17<00:00,  4.15it/s]


#### Step 3.3: Convert previous step to marker seed (multi-)FASTA files.

ChronoStrain reads marker seeds from FASTA files. The previous step merely produces a raw catalog of annotated genes.

We need to generate a FASTA file, but some cleanup has to be done, since sometimes genes are mis-annotated. (example: "stx1A" sometimes shows up as "stxA1")

In [13]:
# Seed NumPy's global RNG: the DataFrame.sample() calls below pass no random_state of their own,
# so they draw from it.
np.random.seed(1234)  # for reproducibility

# ========== Load the dataframe.
misc_df = pd.read_feather(MARKER_SEED_DIR / "misc_genes.feather")
misc_df['GeneFull'] = misc_df['Gene']  # keep the annotation exactly as found; 'Gene' is normalized below.
# Read the catalog through the configured REFSEQ_INDEX (not a hard-coded copy of the path),
# so that changing TARGET_DIR in the settings cell is sufficient.
index_df = pd.read_csv(REFSEQ_INDEX, sep='\t')
misc_df = misc_df.merge(index_df[['Accession', 'Species']], on='Accession')
misc_df = misc_df.loc[misc_df['Species'] == 'coli']  # this is probably no longer needed as of latest version of this notebook.

print("Raw counts:")
display(misc_df.groupby("Gene")['Accession'].count().rename("Counts Per Gene"))

# STX labels are inconsistent. Fix them *before* subsampling, so that the 1000-record cap below
# applies to each normalized gene name rather than to each spelling variant separately.
stx_label_fixes = {
    'stx1A': 'stxA1',
    'stx2A': 'stxA2',
    'stx1B': 'stxB1',
    'stx2B': 'stxB2',
}
# assign() returns a new frame, which avoids writing into the filtered .loc[] result above.
misc_df = misc_df.assign(Gene=misc_df['Gene'].replace(stx_label_fixes))

# Subsample if more than 1000 hits.
misc_df = misc_df.groupby("Gene").apply(
    lambda x: x.sample(n=min(1000, x.shape[0]))
)
# groupby().apply() leaves a (Gene, original row) MultiIndex; replace it with a plain 0..n-1 index.
misc_df = misc_df.reset_index(drop=True)

# print statistics
print("After fixing & subsampling: ")
display(misc_df.groupby("Gene")['Accession'].count().rename("Counts Per Gene"))
display(misc_df)

Raw counts:


Gene
fim41a       1
fimA      4701
fimB      1625
fimC      3420
fimD      1719
fimE      1765
fimF      1858
fimG      1872
fimH      3469
fimI      1771
fimZ      1642
fliC       220
flk       2049
stx1A        1
stx1B        1
stx2A        1
stx2B        1
stxA1      166
stxA2       49
stxB1        2
stxB2       18
Name: Counts Per Gene, dtype: int64

After fixing & subsampling: 


Gene
fim41a       1
fimA      1000
fimB      1000
fimC      1000
fimD      1000
fimE      1000
fimF      1000
fimG      1000
fimH      1000
fimI      1000
fimZ      1000
fliC       220
flk       1000
stxA1      167
stxA2       50
stxB1        3
stxB2       19
Name: Counts Per Gene, dtype: int64

Unnamed: 0,Accession,Gene,GeneSeq,GeneFull,Species
0,NZ_CP026929.2,fim41a,ATGAAAAAGACTCTGATTGCACTGGCTGTGGCTGCATCAGCGGCAG...,fim41a,coli
1,NZ_AP021998.1,fimA,ATGAAAATTAAAACTCTGGCAATCGTTGTTCTGTCGGCTCTGTCCC...,fimA,coli
2,NZ_CP061337.1,fimA,ATGAAAATTAAAACTCTGGCAATCGTTGTTCTGTCGGCTCTGTCCC...,fimA,coli
3,NZ_CP076245.1,fimA,ATGAAACTCAAACATGTTGGTATGATTGTCGTTTCTGTGTTGGCGA...,fimA,coli
4,NZ_CP010140.1,fimA,ATGAAATTAAGATTTATTTCGTCTGCGCTGGCTGCCGCACTATTCG...,fimA,coli
...,...,...,...,...,...
11455,NZ_AP018488.1,stxB2,ATGAAGAAGATGTTTATGGCGGTTTTATTTGCATTAGTTTCTGTTA...,stxB2,coli
11456,NZ_CP028379.1,stxB2,ATGAAGATGATGTTTATGGCGGTTTTATTTGCATTAGTTTCTGTTA...,stxB2,coli
11457,NZ_CP031343.1,stxB2,ATGAAGATGATGTTTATGGCGGTTTTATTTGCATTAGTTTCTGTTA...,stxB2,coli
11458,NZ_CP076706.1,stxB2,ATGAAGAAGATGTTTATGGTGGCTTTATTTGCGTTAGTTTCTGTTA...,stxB2,coli


In [15]:
# Seed NumPy's global RNG: the DataFrame.sample() calls below pass no random_state of their own,
# so they draw from it.
np.random.seed(1235)  # for reproducibility

# ========== Load the dataframe.
serotype_O_df = pd.read_feather(MARKER_SEED_DIR / "O_antigen_cluster.feather")
serotype_O_df['GeneFull'] = serotype_O_df['Gene']  # keep the annotation exactly as found.

# Read the catalog through the configured REFSEQ_INDEX (not a hard-coded copy of the path),
# so that changing TARGET_DIR in the settings cell is sufficient.
index_df = pd.read_csv(REFSEQ_INDEX, sep='\t')
serotype_O_df = serotype_O_df.merge(index_df[['Accession', 'Species']], on='Accession')
serotype_O_df = serotype_O_df.loc[serotype_O_df['Species'] == 'coli']

print("Raw counts: ")
display(serotype_O_df.groupby("Gene")['Accession'].count())


# Subsample if more than 1000 hits.
# (Kept ahead of the putative-gene filter on purpose: reordering would change which rows the
# seeded RNG draws for the large groups.)
serotype_O_df = serotype_O_df.groupby("Gene").apply(
    lambda x: x.sample(n=min(1000, x.shape[0]))
)
# groupby().apply() leaves a (Gene, original row) MultiIndex; replace it with a plain 0..n-1 index.
serotype_O_df = serotype_O_df.reset_index(drop=True)


# Remove all putative genes (e.g. WM47_RS18200). Annotated genes typically follow a 3-letter format (followed by a specifier).
# Locus-tag style names are recognized by the underscore they contain.
ecoli_serotype_O_genes = {
    gene_name
    for gene_name in serotype_O_df['Gene'].dropna().unique()
    if "_" not in gene_name
}
serotype_O_df = serotype_O_df.loc[serotype_O_df['Gene'].isin(ecoli_serotype_O_genes)]


# print statistics
print("After fixing & subsampling: ")
display(serotype_O_df.groupby("Gene")['Accession'].count())
display(serotype_O_df)

Raw counts: 


Gene
A0I22_RS25315       1
A0I22_RS25325       1
A0I22_RS25340       1
A0I22_RS25350       1
A0I22_RS25355       1
                 ... 
wzy              1324
yfgO                1
ypfH                1
ypfJ                1
ypfN                1
Name: Accession, Length: 11232, dtype: int64

After fixing & subsampling: 


Gene
acs        1
arnB       1
asnB      11
bamC       1
bcp        1
        ... 
wzy     1000
yfgO       1
ypfH       1
ypfJ       1
ypfN       1
Name: Accession, Length: 95, dtype: int64

Unnamed: 0,Accession,Gene,GeneSeq,GeneFull,Species
11137,NZ_CP016546.1,acs,ATGAGCCAAATTCACAAACACACCATTCCTGCCAACATCGCAGACC...,acs,coli
11138,NC_002695.2,arnB,ATGAAATATATACCAGTTTACCAACCGTCATTGACAGGAAAAGAAA...,arnB,coli
11139,NZ_CP019051.1,asnB,ATGTTGCATAAAATTAGACATCGTGGACCCGATTCATTCGGGATTT...,asnB,coli
11140,NZ_CP043825.1,asnB,ATGTGTGGATTAGCTGGTTTCCTTGAATCAAATTTAAATGATAATA...,asnB,coli
11141,NZ_CP031910.1,asnB,ATGTGTGGATTAGCTGGTTTCCTTGAATCAAATTTAAATGATAATA...,asnB,coli
...,...,...,...,...,...
22197,NZ_CP031919.1,wzy,GTGAATATAAAGAAAGATAAGTTTATAAATGGAGTGCTTTTTTTTT...,wzy,coli
22198,NZ_CP015085.1,yfgO,ATGCTCGAAATGTTGATGCAATGGTATCGCCGCCGTTTTAGCGACC...,yfgO,coli
22199,NZ_CP015085.1,ypfH,ATGAAACATGACCATTTTGTTGTTCAAAGCCCGGATAAACCAGCAC...,ypfH,coli
22200,NZ_CP015085.1,ypfJ,ATGCGTTGGCAAGGGCGACGTGAAAGTGACAATGTTGAAGACAGGC...,ypfJ,coli


In [16]:
print("Concatenating.")
concat_gene_df = pd.concat([misc_df, serotype_O_df], ignore_index=True)
num_custom_genes = len(pd.unique(concat_gene_df['Gene']))
print("# CUSTOM GENES TOTAL = {}".format(num_custom_genes))


# ======= Write FASTA files.
# One multi-FASTA per gene; each record id is "<gene>_<row index in concat_gene_df>", and the
# description records the source accession plus the gene label as originally annotated.
print("Generating FASTA files.")
for gene, section in concat_gene_df.groupby("Gene"):
    # A trailing "*" (wildcard gene name) is spelled out as "_any" in the file name.
    gene_name_for_file = f"{gene[:-1]}_any" if gene.endswith("*") else gene
    target_fasta = MARKER_SEED_DIR / f'{gene_name_for_file}.fasta'

    print("Handling {} ({} records) --> {}".format(gene, section.shape[0], target_fasta))
    seed_records = (
        SeqRecord(
            Seq(seed_row['GeneSeq']),
            id="{}_{}".format(gene, seed_idx),
            description="Src={}:{}".format(seed_row['Accession'], seed_row['GeneFull'])
        )
        for seed_idx, seed_row in section.iterrows()
    )
    with open(target_fasta, 'wt') as f:
        SeqIO.write(seed_records, f, 'fasta')

            
# ======= Human-readable metadata (to be written to TSV)
def assign_misc_metadata(gene_name):
    """Return the human-readable category label for a manually curated marker gene.

    Rules are checked in order; the first match wins:
      * name starts with "fim"                                  -> "Fimbrial_gene"
      * name starts with "stx"                                  -> "ShigaToxin"
      * name is "fliC" or starts with "flk" / "fll" / "flm"      -> "H-antigen"
      * name occurs in the 'Gene' column of ``serotype_O_df``    -> "O-antigen"

    :param gene_name: gene name (str) to classify.
    :return: the category label (str).
    :raises ValueError: if none of the rules match.

    Note: the O-antigen rule reads the notebook-level ``serotype_O_df`` at call time, so that
    dataframe must still be defined when a name reaches that rule.
    """
    if gene_name.startswith("fim"):
        return "Fimbrial_gene"
    if gene_name.startswith("stx"):
        return "ShigaToxin"
    if gene_name == "fliC" or gene_name.startswith(("flk", "fll", "flm")):
        return "H-antigen"
    # Direct membership test on the column; no need to build a set on every call.
    if serotype_O_df['Gene'].eq(gene_name).any():
        return "O-antigen"
    raise ValueError("Unknown gene category {}".format(gene_name))


# ======= write index TSV file.
# One row per gene: <name used for the FASTA file> <TAB> <path to that FASTA file> <TAB> <category>.
tsv_path = MARKER_SEED_DIR / "manual_seeds.tsv"
print(f"Writing TSV file: {tsv_path}")

with open(tsv_path, 'wt') as metadata_f:
    for gene, _ in concat_gene_df.groupby("Gene"):
        # Must mirror the file naming used when the FASTA files were written: a trailing "*"
        # becomes "_any". (Previously the wildcard branch assigned to a misspelled variable,
        # so such a gene would have reused the name left over from the preceding gene.)
        if gene.endswith("*"):
            prefix = gene[:-1]
            gene_name_for_file = f"{prefix}_any"
        else:
            gene_name_for_file = gene

        target_fasta = MARKER_SEED_DIR / f'{gene_name_for_file}.fasta'
        print(
            "{}\t{}\t{}".format(
                gene_name_for_file, target_fasta, assign_misc_metadata(gene)
            ),
            file=metadata_f
        )

# Free the large intermediate dataframes. Note that assign_misc_metadata() reads serotype_O_df,
# so after this point the loading cells must be re-run before this cell can be executed again.
print("Cleaning up.")
del concat_gene_df
del misc_df
del serotype_O_df

Concatenating.
# CUSTOM GENES TOTAL = 112
Generating FASTA files.
Handling acs (1 records) --> /mnt/e/ecoli_db/marker_seeds/acs.fasta
Handling arnB (1 records) --> /mnt/e/ecoli_db/marker_seeds/arnB.fasta
Handling asnB (11 records) --> /mnt/e/ecoli_db/marker_seeds/asnB.fasta
Handling bamC (1 records) --> /mnt/e/ecoli_db/marker_seeds/bamC.fasta
Handling bcp (1 records) --> /mnt/e/ecoli_db/marker_seeds/bcp.fasta
Handling bepA (1 records) --> /mnt/e/ecoli_db/marker_seeds/bepA.fasta
Handling cpsB (172 records) --> /mnt/e/ecoli_db/marker_seeds/cpsB.fasta
Handling cpsG (570 records) --> /mnt/e/ecoli_db/marker_seeds/cpsG.fasta
Handling dapA (1 records) --> /mnt/e/ecoli_db/marker_seeds/dapA.fasta
Handling dapE (1 records) --> /mnt/e/ecoli_db/marker_seeds/dapE.fasta
Handling fcf1 (1 records) --> /mnt/e/ecoli_db/marker_seeds/fcf1.fasta
Handling fcl (111 records) --> /mnt/e/ecoli_db/marker_seeds/fcl.fasta
Handling fdhF (1 records) --> /mnt/e/ecoli_db/marker_seeds/fdhF.fasta
Handling fim41a (1 reco

#### Step 3.4: Add MetaPhlAn markers.

In [17]:
# Pulls the marker sequences for METAPHLAN_TAXONOMIC_KEY out of the MetaPhlAn database pickle.
# The `-o` file (metaphlan_seeds.tsv) is one of the three TSVs concatenated in Step 3.5.
!python python_helpers/extract_metaphlan_markers.py \
    -t "$METAPHLAN_TAXONOMIC_KEY" \
    -i "$METAPHLAN_DB_PATH" \
    -o "$MARKER_SEED_DIR"/metaphlan_seeds.tsv

Searching for marker seeds from MetaPhlAn database: mpa_vJan21_CHOCOPhlAnSGB_202103.
Target # of markers: 64
Found metaphlan marker ID SGB10068__DKCKNNOD_02965.
Found marker `SGB10068__DKCKNNOD_02965` (length 669)
Found metaphlan marker ID SGB10068__BOKFPOAE_00433.
Found marker `SGB10068__BOKFPOAE_00433` (length 759)
Found metaphlan marker ID SGB10068__KKFLDMED_01050.
Found marker `SGB10068__KKFLDMED_01050` (length 720)
Found metaphlan marker ID SGB10068__PPJOBBMP_00387.
Found marker `SGB10068__PPJOBBMP_00387` (length 657)
Found metaphlan marker ID SGB10068__ACFBEDFH_01712.
Found marker `SGB10068__ACFBEDFH_01712` (length 1110)
Found metaphlan marker ID SGB10068__GIPCMNHK_02102.
Found marker `SGB10068__GIPCMNHK_02102` (length 714)
Found metaphlan marker ID SGB10068__NEHLOBGE_03055.
Found marker `SGB10068__NEHLOBGE_03055` (length 648)
Found metaphlan marker ID SGB10068__EMBPIGEI_03969.
Found marker `SGB10068__EMBPIGEI_03969` (length 597)
Found metaphlan marker ID SGB10068__NIEECKMC_04630

#### Step 3.5: process and combine marker TSV files.

In [18]:
# Concatenate the per-source seed tables, in this order, into the single marker seed index.
# Done in Python rather than with `!cat ... > file`: the shell redirect creates the index even
# when an input is missing (so the existence check could never fail), and the unquoted
# $MARKER_SEED_INDEX would break on a path containing spaces.
seed_tsv_paths = [
    MARKER_SEED_DIR / "mlst_seeds.tsv",
    MARKER_SEED_DIR / "manual_seeds.tsv",
    MARKER_SEED_DIR / "metaphlan_seeds.tsv",
]
missing_seed_tsvs = [str(seed_tsv) for seed_tsv in seed_tsv_paths if not seed_tsv.is_file()]
if missing_seed_tsvs:
    raise FileNotFoundError(
        "Missing marker seed TSV file(s): {}".format(", ".join(missing_seed_tsvs))
    )

# Byte-for-byte concatenation, exactly what `cat a b c > index` produces.
with open(MARKER_SEED_INDEX, 'wb') as index_f:
    for seed_tsv in seed_tsv_paths:
        index_f.write(seed_tsv.read_bytes())

# Verify first, then report success.
assert MARKER_SEED_INDEX.exists()
print("Created Marker seed index: {}".format(MARKER_SEED_INDEX))

Created Marker seed index: /mnt/e/ecoli_db/marker_seeds/marker_seed_index.tsv


### Step 4: Run ChronoStrain's make-db command.

By the end of the previous step, we have:

1) FASTA files for each gene, listing out seed sequence(s).
2) A TSV file (marker_seed_index.tsv) containing a list of gene names and the paths to each of these FASTA files.

In [20]:
# Build the ChronoStrain database JSON from the marker seed index, the RefSeq index and the BLAST DB.
# `env` sets JAX_PLATFORM_NAME and CHRONOSTRAIN_DB_DIR for this one command only.
# All option values come from the settings cell, except --ident-threshold, which is fixed here at 0.998.
# (Comments cannot be placed between the backslash-continued lines below.)
!env \
    JAX_PLATFORM_NAME=cpu \
    CHRONOSTRAIN_DB_DIR="$CHRONOSTRAIN_DB_DIR" \
    chronostrain -c chronostrain.ini \
      make-db \
      -m $MARKER_SEED_INDEX \
      -r $REFSEQ_INDEX \
      -b $BLAST_DB_NAME -bd $BLAST_DB_DIR \
      --min-pct-idty $MIN_PCT_IDTY \
      --ident-threshold 0.998 \
      -o $CHRONOSTRAIN_TARGET_JSON \
      --threads $NUM_CORES

2023-09-21 15:59:43,178 [INFO - chronostrain.cli] - Creating symbolic links to reference catalog (target dir: /mnt/e/ecoli_db/chronostrain_files)
2023-09-21 15:59:44,543 [INFO - chronostrain.cli] - Building raw DB using BLAST.
2023-09-21 15:59:44,546 [INFO - chronostrain.cli] - Running blastn on adk.
2023-09-21 16:00:08,809 [INFO - chronostrain.cli] - Running blastn on fumC.
2023-09-21 16:00:09,769 [INFO - chronostrain.cli] - Running blastn on gyrB.
2023-09-21 16:00:10,866 [INFO - chronostrain.cli] - Running blastn on icd.
2023-09-21 16:00:12,057 [INFO - chronostrain.cli] - Running blastn on mdh.
2023-09-21 16:00:13,079 [INFO - chronostrain.cli] - Running blastn on purA.
2023-09-21 16:00:14,264 [INFO - chronostrain.cli] - Running blastn on recA.
2023-09-21 16:00:15,450 [INFO - chronostrain.cli] - Running blastn on dinB.
2023-09-21 16:00:16,219 [INFO - chronostrain.cli] - Running blastn on icdA.
2023-09-21 16:00:17,431 [INFO - chronostrain.cli] - Running blastn on pabB.
2023-09-21 16:00