In [1]:
import pathlib

import pandas as pd

## Parse the NCBI BLAST outputs

1. We BLAST the query sequences in `sequences.fna` against the NCBI 16S reference database using the web tool
2. The outputs are downloaded into `ncbi16s_desc.csv` and `ncbi16s_hits.csv`

In [2]:
data_path = pathlib.Path("../../data/processed/kchip")
blast_path = data_path / "blast"

In [3]:
# Headers for the hits and desc files
ncbi_hits_header = "qid,Accession,pidentity,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore".split(",")
ncbi_desc_header = "Description,Scientific Name,Max Score,Total Score,Query Cover,E value,Per. ident,Acc. Len,Accession Full".split(",")

In [4]:
ncbi_blast_hits_file = blast_path / "ncbi16s_hits.csv"
ncbi_blast_desc_file = blast_path / "ncbi16s_desc.csv"
ncbi_blast_hits = pd.read_csv(ncbi_blast_hits_file, sep=",", names=ncbi_hits_header)
ncbi_blast_desc = pd.read_csv(ncbi_blast_desc_file, sep=",", skiprows=1, names=ncbi_desc_header)

In [5]:
# Filter the blast hits based on E-value
ncbi_blast_hits = ncbi_blast_hits[ncbi_blast_hits.evalue <= 1e-50]
ncbi_blast_hits["qid"] = [qid.split("_")[0] for qid in ncbi_blast_hits.qid]
ncbi_blast_hits

Unnamed: 0,qid,Accession,pidentity,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,CR-75,NR_180446.1,98.587,1415,13,7,1,1411,52,1463,0.0,2495
1,CR-75,NR_180451.1,98.303,1414,19,5,1,1411,49,1460,0.0,2473
2,CR-75,NR_126208.1,98.292,1405,19,5,10,1411,1,1403,0.0,2457
3,CR-75,NR_028894.1,98.092,1415,20,6,1,1411,32,1443,0.0,2457
4,CR-75,NR_028687.1,98.092,1415,20,6,1,1411,46,1457,0.0,2457
...,...,...,...,...,...,...,...,...,...,...,...,...
843,O-G1,NR_042356.1,97.660,1410,29,4,1,1406,17,1426,0.0,2418
844,O-G1,NR_159317.1,98.014,1410,26,2,1,1408,25,1434,0.0,2451
845,O-G1,NR_180640.1,97.872,1410,28,2,1,1408,52,1461,0.0,2436
846,O-G1,NR_041927.1,97.943,1410,27,2,1,1408,21,1430,0.0,2451


In [6]:
# Display the number of hits for each query sequence
ncbi_blast_hits.groupby("qid")["Accession"].count()

qid
CR-75                    100
K-A10                     54
K-A4                      54
K-A7                      79
K-C10                     47
K-D4                       8
K-D8                      71
K-F4                      42
K-F8                      51
K-H10                      7
M-B5                      58
M-C10                     82
M-G11                     81
O-G1                      73
escherichia-coli-k-12     41
Name: Accession, dtype: int64

We need to parse the accession id from the accession string in the description file

In [7]:
ncbi_blast_desc["Accession"] = ncbi_blast_desc["Accession Full"].apply(lambda x: x.split(",")[-1].strip('")'))
ncbi_blast_desc

Unnamed: 0,Description,Scientific Name,Max Score,Total Score,Query Cover,E value,Per. ident,Acc. Len,Accession Full,Accession
0,Scandinavium goeteborgense strain CCUG 66741 1...,Scandinavium goeteborgense,2495,2495,100%,0.0,98.59,1542,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_180446.1
1,Enterobacter quasihormaechei strain WCHEs12000...,Enterobacter quasihormaechei,2473,2473,100%,0.0,98.30,1538,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_180451.1
2,Enterobacter hormaechei subsp. xiangfangensis ...,Enterobacter hormaechei subsp. xiangfangensis,2457,2457,99%,0.0,98.29,1429,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_126208.1
3,Citrobacter freundii ATCC 8090 = MTCC 1658 = N...,Citrobacter freundii ATCC 8090 = MTCC 1658 = N...,2457,2457,100%,0.0,98.09,1505,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_028894.1
4,Citrobacter braakii strain 167 16S ribosomal R...,Citrobacter braakii,2457,2457,100%,0.0,98.09,1530,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_028687.1
...,...,...,...,...,...,...,...,...,...,...
95,Raoultella terrigena strain 84 16S ribosomal R...,Raoultella terrigena,2368,2368,100%,0.0,96.96,1454,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_037085.1
96,Cedecea lapagei strain DSM 4587 16S ribosomal ...,Cedecea lapagei,2368,2368,100%,0.0,96.96,1531,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_126319.1
97,Cronobacter universalis strain E797 16S riboso...,Cronobacter universalis,2368,2368,100%,0.0,96.89,1425,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_115943.1
98,Cronobacter sakazakii strain E601 16S ribosoma...,Cronobacter sakazakii,2368,2368,100%,0.0,96.96,1424,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle...",NR_115942.1


Finally, we join the hits and description files together

In [8]:
ncbi_blast_results = ncbi_blast_hits.merge(ncbi_blast_desc, how="left", on="Accession")

In [9]:
ncbi_blast_results

Unnamed: 0,qid,Accession,pidentity,length,mismatch,gapopen,qstart,qend,sstart,send,...,bitscore,Description,Scientific Name,Max Score,Total Score,Query Cover,E value,Per. ident,Acc. Len,Accession Full
0,CR-75,NR_180446.1,98.587,1415,13,7,1,1411,52,1463,...,2495,Scandinavium goeteborgense strain CCUG 66741 1...,Scandinavium goeteborgense,2495,2495,100%,0.0,98.59,1542,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
1,CR-75,NR_180451.1,98.303,1414,19,5,1,1411,49,1460,...,2473,Enterobacter quasihormaechei strain WCHEs12000...,Enterobacter quasihormaechei,2473,2473,100%,0.0,98.30,1538,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
2,CR-75,NR_126208.1,98.292,1405,19,5,10,1411,1,1403,...,2457,Enterobacter hormaechei subsp. xiangfangensis ...,Enterobacter hormaechei subsp. xiangfangensis,2457,2457,99%,0.0,98.29,1429,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
3,CR-75,NR_028894.1,98.092,1415,20,6,1,1411,32,1443,...,2457,Citrobacter freundii ATCC 8090 = MTCC 1658 = N...,Citrobacter freundii ATCC 8090 = MTCC 1658 = N...,2457,2457,100%,0.0,98.09,1505,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
4,CR-75,NR_028687.1,98.092,1415,20,6,1,1411,46,1457,...,2457,Citrobacter braakii strain 167 16S ribosomal R...,Citrobacter braakii,2457,2457,100%,0.0,98.09,1530,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843,O-G1,NR_042356.1,97.660,1410,29,4,1,1406,17,1426,...,2418,Serratia ureilytica strain NiVa 51 16S ribosom...,Serratia ureilytica,2374,2374,99%,0.0,97.10,1479,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
844,O-G1,NR_159317.1,98.014,1410,26,2,1,1408,25,1434,...,2451,Klebsiella grimontii strain SB73 16S ribosomal...,Klebsiella grimontii,2372,2372,100%,0.0,96.96,1454,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
845,O-G1,NR_180640.1,97.872,1410,28,2,1,1408,52,1461,...,2436,Klebsiella pasteurii strain SPARK836C1 16S rib...,Klebsiella pasteurii,2372,2372,100%,0.0,97.03,1540,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
846,O-G1,NR_041927.1,97.943,1410,27,2,1,1408,21,1430,...,2451,Pseudescherichia vulneris strain ATCC 33821 16...,Pseudescherichia vulneris,2370,2370,100%,0.0,96.89,1468,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."


In [10]:
top5_blasthits_per_qid = ncbi_blast_results.groupby("qid", as_index=False).apply(lambda grp: grp.nlargest(5, "bitscore"))

In [11]:
top5_blasthits_per_qid.loc[:, ["qid", "Accession", "Scientific Name", "pidentity"]].head(20)

Unnamed: 0,Unnamed: 1,qid,Accession,Scientific Name,pidentity
0,0,CR-75,NR_180446.1,Scandinavium goeteborgense,98.587
0,1,CR-75,NR_180451.1,Enterobacter quasihormaechei,98.303
0,2,CR-75,NR_126208.1,Enterobacter hormaechei subsp. xiangfangensis,98.292
0,3,CR-75,NR_028894.1,Citrobacter freundii ATCC 8090 = MTCC 1658 = N...,98.092
0,4,CR-75,NR_028687.1,Citrobacter braakii,98.092
1,173,K-A10,NR_113614.1,Klebsiella aerogenes,98.935
1,176,K-A10,NR_102493.2,Klebsiella aerogenes KCTC 2190,98.935
1,179,K-A10,NR_024643.1,Klebsiella aerogenes,98.933
1,177,K-A10,NR_114737.1,Klebsiella aerogenes,98.793
1,157,K-A10,NR_184601.1,Huaxiibacter chinensis,98.651


These don't match the hits from the publication:


![image](strain_source_table.png)

## Download genomes from NCBI using BioPython (Entrez)

In [12]:
from Bio import Entrez

In [13]:
Entrez.email = "kishored@ornl.gov"

In [14]:
import urllib.request

def download_file(ftp_url: str, output_file: pathlib.Path):
    url = ftp_url.replace("ftp://", "http://")
    with urllib.request.urlopen(url) as url_id:
        with open(output_file, "wb") as fid:
            fid.write(url_id.read())

def download_genomes(scientific_name: str, output_folder: pathlib.Path) -> None:
    # Get list of reference or representative sequences for the given organimsm using its name
    term = f"({scientific_name}[ORGN]) AND (reference_genome[filter] OR representative_genome[filter])"
    search_handle = Entrez.esearch(db="assembly", idtype="acc", term=term, retmax=100)
    search_record = Entrez.read(search_handle)
    # Get the the accession number of the first result
    if len(search_record["IdList"]) == 0: # type: ignore
        raise ValueError(f"No reference or representative genomes found for {scientific_name}")
    if len(search_record["IdList"]) > 1: # type: ignore
        print(f"More than one reference or representative genomes found for {scientific_name}. Using the first one.")
    assembly_acc = search_record["IdList"][0] # type: ignore
    # Use document summary to get the ftp path for the assembly
    summary_handle = Entrez.esummary(db="assembly", id=assembly_acc)
    document_summary = Entrez.read(summary_handle)
    url = document_summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_RefSeq'] # type: ignore
    file_id = url.split("/")[-1]
    fna_url = f"{url}/{file_id}_genomic.fna.gz"
    fna_file = output_folder / f"{file_id}.fna.gz"
    download_file(fna_url, fna_file)
    gff_url = f"{url}/{file_id}_genomic.gff.gz"
    gff_file = output_folder / f"{file_id}.gff.gz"
    download_file(gff_url, gff_file)

In [15]:
seq_folder = pathlib.Path("../../data/processed/kchip/sequences/")
for i, row in top5_blasthits_per_qid.iterrows():
    qid = row["qid"]
    scientific_name = " ".join(row["Scientific Name"].split(" ")[:2])
    print(f"{i}: Downloading fna and gff files for {scientific_name} matching {qid}")
    output_folder = seq_folder / f"{qid}"
    output_folder.mkdir(parents=True, exist_ok=True)
    download_genomes(scientific_name, output_folder=output_folder)

(0, 0): Downloading fna and gff files for Scandinavium goeteborgense matching CR-75
(0, 1): Downloading fna and gff files for Enterobacter quasihormaechei matching CR-75
(0, 2): Downloading fna and gff files for Enterobacter hormaechei matching CR-75
(0, 3): Downloading fna and gff files for Citrobacter freundii matching CR-75
(0, 4): Downloading fna and gff files for Citrobacter braakii matching CR-75
(1, 173): Downloading fna and gff files for Klebsiella aerogenes matching K-A10
(1, 176): Downloading fna and gff files for Klebsiella aerogenes matching K-A10
(1, 179): Downloading fna and gff files for Klebsiella aerogenes matching K-A10
(1, 177): Downloading fna and gff files for Klebsiella aerogenes matching K-A10
(1, 157): Downloading fna and gff files for Huaxiibacter chinensis matching K-A10
(2, 239): Downloading fna and gff files for Pantoea endophytica matching K-A4
(2, 218): Downloading fna and gff files for Leclercia adecarboxylata matching K-A4
(2, 222): Downloading fna and g

## Media information

The medium used in the coculture experiment was an M9 minimal medium consisting of 1× M9 salts (Teknova), 1× trace metals (Teknova), 0.1 mM calcium chloride, and 2 mM magnesium sulfate. We additionally added 0.05% (w/v) bovine serum albumin (BSA) to the medium to improve the retention of fluorescent dyes used in the droplet color codes

In the coculture experiment, a total of 40 environmental conditions were used. These included the 33 chosen compounds at 0.5% (w/v), 5 of these compounds (glucose, glycerol, pyruvate, proline, and sucrose) at 0.05% (w/v), an even mix of all 33 compounds [totaling 0.5% (w/v)], and a no-carbon control.

Microbial culture:

All labeled and unlabeled monocultures initially underwent two pre-experiment regrowth cycles (“starter phase” in a rich medium and “preculture phase” in minimal medium) and, at the onset of the experiment (“experiment phase”), were normalized to a starting density of OD600 = 0.02 in carbonless minimal medium. In the starter phase, glycerol stocks of the unlabeled and labeled strains were inoculated into 525 μl (0.8-ml-deep 96-well plate) of LB medium (25°C, 220 rpm, 16 hours). Inoculations from glycerol stocks were conducted via pin replicator [sterilized via 70% (v/v) ethanol bath and flame treatment between inoculations]. In the preculture phase, all cultures were washed in carbonless M9 medium two times and then diluted (1:50) into 1-ml M9 medium with 0.5% (w/v) glucose (25°C, 220 rpm, 24 hours). Last, the experimental phase began by washing cells three times in a carbonless M9 medium to remove residual glucose and normalizing to a starting OD600 of 0.02 (or ~20 cells per droplet depending on the strain).