[MRG] Building databases from assembly_summary.txt #11

Closed
wants to merge 7 commits into from
12 changes: 12 additions & 0 deletions README.md
@@ -0,0 +1,12 @@
# sourmash databases

## Setup

All processing is performed using the conda environment specified in `environment.yml`.
To build and activate this environment, run:

```bash
conda env create --force --file environment.yml

conda activate sourmash_databases
```
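
With the environment active, a database build can then be launched with an invocation along these lines (illustrative; pick a jobs count that suits your machine):

```bash
snakemake -s Snakefile.assembly -j 4
```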
240 changes: 240 additions & 0 deletions Snakefile.assembly
@@ -0,0 +1,240 @@
import os.path
import random

configfile: "config.assembly.yml"
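
# The default target builds, for every combination requested in the config:
# an SBT per (db, domain, bfsize, ksize), an LCA database per (db, domain,
# ksize), and an accession -> taxid table per (db, domain).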

rule all:
input:
expand("outputs/sbt/{db}-{domain}-x{bfsize}-k{ksize}.sbt.zip", db=config['db'], ksize=config['db_ksizes'], domain=config['domains'], bfsize=config['bfsize']),
expand("outputs/lca/{db}-{domain}-k{ksize}-scaled10k.lca.json.gz", db=config['db'], domain=config['domains'], ksize=config['db_ksizes']),
expand("outputs/taxid4index/{db}-{domain}.csv", db=config['db'], domain=config['domains'])


def sigs_in_catalog(w):
    """Input function listing every signature path recorded in the catalog.

    `catalog` is a checkpoint, so the DAG is re-evaluated after it runs and
    its output is guaranteed to exist when this function is called.
    """
    with checkpoints.catalog.get(domain=w.domain, db=w.db).output[0].open('r') as f:
        return [line.strip() for line in f]


rule sbt:
output: "outputs/sbt/{db}-{domain}-x{bfsize}-k{ksize}.sbt.zip"
input:
catalog = "outputs/catalog/{domain}/{db}.txt",
sigs = sigs_in_catalog
params:
ksize="{ksize}",
db="{db}",
domain="{domain}",
bfsize="{bfsize}",
benchmark: "benchmark/sbt/{db}-{domain}-x{bfsize}-k{ksize}.txt"
    shell: """
    # pass the first catalog entry positionally; the remaining signature
    # paths are fed in via --from-file
    sourmash index \
        -k {params.ksize} \
        -x {params.bfsize} \
        --from-file <(tail -n +2 {input.catalog}) \
        {output} $(head -1 {input.catalog})
    """


rule lca:
output:
lca = "outputs/lca/{db}-{domain}-k{ksize}-scaled10k.lca.json.gz",
input:
lineage = "outputs/lca/lineages/{domain}_{db}_lineage.csv",
catalog = "outputs/catalog/{domain}/{db}.txt",
sigs = sigs_in_catalog
params:
ksize="{ksize}",
db="{db}",
domain="{domain}",
benchmark: "benchmark/lca/{db}-{domain}-k{ksize}.txt"
    shell: """
    mkdir -p outputs/lca/reports/

    sourmash lca index \
        -k {params.ksize} \
        --scaled 10000 \
        --report outputs/lca/reports/{params.db}-{params.domain}-k{params.ksize}-scaled10k.txt \
        -f -C 3 --split-identifiers \
        --from-file <(tail -n +2 {input.catalog}) \
        {input.lineage} \
        {output.lca} $(head -1 {input.catalog})
    """

## Rules for building signature catalogs for each index

rule download_catalog:
output:
catalog="outputs/assembly_stats/{domain}/assembly_summary_{db}.txt.gz",
generated="outputs/assembly_stats/{domain}/assembly_summary_{db}.timestamp",
params:
domain="{domain}",
db="{db}"
shell: """
wget --header='Accept-Encoding: gzip' -O {output.catalog} https://ftp.ncbi.nlm.nih.gov/genomes/{params.db}/{params.domain}/assembly_summary.txt
date +%Y%m%d > {output.generated}
"""

checkpoint catalog:
output: "outputs/catalog/{domain}/{db}.txt"
input: "outputs/assembly_stats/{domain}/assembly_summary_{db}.txt.gz"
params:
domain = "{domain}"
run:
import csv
import gzip

basedir = config['sig_store']

        # assembly_summary.txt opens with an introductory comment line followed
        # by the column header, which is prefixed with "# "; skip the first
        # line and those two prefix characters so DictReader sees clean names
        with gzip.open(input[0], 'rt') as fp:
            fp.readline()  # skip the introductory comment line
            fp.read(2)     # consume the "# " prefix of the header line
            data = csv.DictReader(fp, delimiter='\t')

with open(output[0], 'w') as fout:
for row in data:
accession = row['assembly_accession']
path = f"{basedir}/{accession}.sig"
if not os.path.exists(path):
# check if we can download and compute it
try:
url = url_for_accession(accession)
except ValueError:
# TODO: log this somehow
print(f"can't find URL for {accession}")

# TODO: if we decide an older version should be used
# then logic needs to be implemented here
continue

# if URL is valid, let's calculate locally
path = f"outputs/sigs/{params.domain}/{accession}.sig"

# either sig exists in sig_store, or can be computed
fout.write(path + '\n')


## Rules for computing signatures not found in config['sig_store']

def url_for_accession(accession):
    """Build the NCBI URL for a genomic FASTA from an assembly accession.

    For example, "GCF_000005845.2" maps to the directory
    .../genomes/all/GCF/000/005/845, whose listing is then searched for the
    full "<accession>_<asm_name>" entry.
    """
    db, acc = accession.split("_")
    number, version = acc.split(".")
    # split the 9-digit accession number into three-digit path components
    number = "/".join([number[pos:pos + 3] for pos in range(0, len(number), 3)])
    url = f"ftp://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{number}"

from subprocess import CalledProcessError
try:
all_names = shell(f"curl -s -l {url}/", read=True).split('\n')
except CalledProcessError as e:
# TODO: might check here if it was a 404 or 5xx, assuming 404
raise ValueError(f"Can't find URL for {accession}, tried {url}")

    full_name = None
    for name in all_names:
        db_, acc_, *_ = name.split("_")
        if db_ == db and acc == acc_:
            full_name = name
            break

    # without this guard, a missing match would silently produce a URL
    # containing "None"
    if full_name is None:
        raise ValueError(f"Can't find a matching entry for {accession} in {url}")

    url = "https" + url[3:]
    return f"{url}/{full_name}/{full_name}_genomic.fna.gz"


def name_for_accession(domain, accession):
    """Build a signature name of the form
    "<accession> <organism_name>[ <infraspecific_name>], <asm_name>"
    by looking the accession up in the matching assembly summary."""
    import csv
    import gzip

if accession[:3] == "GCA":
db = "genbank"
else:
db = "refseq"
summary = f"outputs/assembly_stats/{domain}/assembly_summary_{db}.txt.gz"

with gzip.open(summary, 'rt') as fp:
fp.readline() # skip first line
fp.read(2) # skip initial comment in header
data = csv.DictReader(fp, delimiter='\t')
for row in data:
if row['assembly_accession'] == accession:
name_parts = [row["assembly_accession"], " ", row['organism_name']]
if row['infraspecific_name']:
name_parts += [" ", row['infraspecific_name']]
name_parts += [', ', row['asm_name']]
return "".join(name_parts)
    # If we get here, the accession wasn't in the summary.
    raise ValueError(f"{accession} not found in {summary}!")


def choose_summary(w):
    if w.accession.startswith("GCF"):
        return f"outputs/assembly_stats/{w.domain}/assembly_summary_refseq.txt.gz"
    elif w.accession.startswith("GCA"):
        return f"outputs/assembly_stats/{w.domain}/assembly_summary_genbank.txt.gz"

    raise ValueError(f"Couldn't decide on an assembly summary for {w.accession}")


rule compute:
output: "outputs/sigs/{domain}/{accession}.sig"
input: choose_summary
params:
ksizes=lambda w: ",".join(str(k) for k in config["db_ksizes"]),
scaled=1000,
name=lambda w: name_for_accession(w.domain, w.accession),
url_path=lambda w: url_for_accession(w.accession),
shell: """
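    # stream the genome straight from NCBI and decompress on the fly; only
    # the signature is written to disk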
sourmash compute -k {params.ksizes} \
--scaled {params.scaled} \
--track-abundance \
--name {params.name:q} \
-o {output} \
<(curl -s {params.url_path} | zcat)
"""

## TODO: write rule to validate SBTs
# - check that all sigs in catalog are in the SBT too
# - maybe run some searches?
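#
# A possible approach (a sketch, not implemented here): load each zipped SBT
# with the sourmash Python API, collect the md5sums of its leaf signatures,
# and assert that every signature file listed in the catalog shows up in
# that set; then run `sourmash search` with a handful of catalog signatures
# against the SBT as a smoke test.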

### Prepare LCA lineages

rule download_lca_scripts:
output:
make_lineage_csv = "scripts/make-lineage-csv.py",
taxdump_utils = "scripts/ncbi_taxdump_utils.py",
shell: """
wget -qO {output.taxdump_utils} https://raw.githubusercontent.com/dib-lab/2018-ncbi-lineages/63e8dc784af092293362f2e8e671ae03d1a84d1d/ncbi_taxdump_utils.py
wget -qO {output.make_lineage_csv} https://raw.githubusercontent.com/dib-lab/2018-ncbi-lineages/63e8dc784af092293362f2e8e671ae03d1a84d1d/make-lineage-csv.py
chmod +x {output.make_lineage_csv}
"""

rule lca_lineage_csv:
output: "outputs/lca/lineages/{domain}_{db}_lineage.csv",
input:
summary = "outputs/assembly_stats/{domain}/assembly_summary_{db}.txt.gz",
make_acc_taxid_mapping = "scripts/make-acc-taxid-mapping.py",
make_lineage_csv = "scripts/make-lineage-csv.py",
taxonomy = expand("outputs/taxonomy/{file}.dmp", file=('nodes', 'names')),
shell: """
{input.make_lineage_csv} {input.taxonomy} \
<({input.make_acc_taxid_mapping} --strip-version {input.summary}) -o {output}
"""

rule taxid4index:
output: "outputs/taxid4index/{db}-{domain}.csv",
input:
summary = "outputs/assembly_stats/{domain}/assembly_summary_{db}.txt.gz",
make_acc_taxid_mapping = "scripts/make-acc-taxid-mapping.py",
shell: """
{input.make_acc_taxid_mapping} {input.summary} > {output}
"""

rule download_taxonomy:
output:
names = 'outputs/taxonomy/names.dmp',
nodes = 'outputs/taxonomy/nodes.dmp',
generated="outputs/taxonomy/timestamp",
shell: """
(cd outputs/taxonomy && \
wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz && \
tar xf taxdump.tar.gz)
date +%Y%m%d > {output.generated}
"""
34 changes: 34 additions & 0 deletions config.assembly.yml
@@ -0,0 +1,34 @@
domains:
- fungi
- archaea
- bacteria
- viral
#- invertebrate
#- plant
- protozoa
#- vertebrate_mammalian
#- vertebrate_other
## only for genbank
#- other
## does it even make sense to make an SBT of the genbank metagenomes?
#- metagenomes

# top level directory for precomputed signatures
sig_store: /data/wort/wort-genomes/sigs

# ksizes to build the databases for
db_ksizes:
- 21
- 31
- 51

# options: genbank, refseq
db:
- refseq
- genbank

# bloom filter sizes for SBTs
bfsize:
- 1e6
- 1e5
- 1e4
10 changes: 10 additions & 0 deletions environment.yml
@@ -0,0 +1,10 @@
name: sourmash_databases
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3.7
- snakemake-minimal=5.20.1
- sourmash-minimal>=3.4,<4
- curl
33 changes: 33 additions & 0 deletions scripts/make-acc-taxid-mapping.py
@@ -0,0 +1,33 @@
#! /usr/bin/env python
"""
Take an NCBI 'assembly_summary.txt' file and print to
stdout the accessions and their associated taxids.
"""

import argparse
import csv
import gzip


def main():
p = argparse.ArgumentParser()
p.add_argument('--strip-version', action='store_true')
p.add_argument('summary')
args = p.parse_args()

with gzip.open(args.summary, 'rt') as fp:
fp.readline() # skip first line
fp.read(2) # skip initial comment in header
data = csv.DictReader(fp, delimiter='\t')
for row in data:
accession = row['assembly_accession']
if args.strip_version:
                # --split-identifiers in `sourmash lca index` doesn't behave
                # well with versions, so strip the version suffix
accession = accession.split('.')[0]
taxid = row['taxid']
print(f'{accession},{taxid}')


if __name__ == '__main__':
main()