Skip to content

Commit

Permalink
Add support for genbank/refseq assemblies (#30)
Browse files Browse the repository at this point in the history
* ncbi genomes computation
* longer name and path
* worker 0.2.7
* expose genomes in urls
* add links to index
  • Loading branch information
luizirber committed Jul 17, 2020
1 parent c074b21 commit 6b6958d
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 52 deletions.
6 changes: 3 additions & 3 deletions Dockerfile.worker
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Build decoct
FROM rust:1-slim-buster as builder
RUN cargo install --git https://github.com/luizirber/decoct --rev 96dee1d
RUN cargo install --git https://github.com/luizirber/decoct --rev 2d3f980cbfd64b028adf34620cc33ff501814a6b

# Build worker image
FROM python:3.8.3-slim-buster
Expand All @@ -20,7 +20,7 @@ RUN apt-get update && \
curl --output sratoolkit.tar.gz https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.7/sratoolkit.2.10.7-ubuntu64.tar.gz && \
tar xf sratoolkit.tar.gz && \
rm sratoolkit.tar.gz && \
apt-get remove -y curl build-essential libssl-dev && \
apt-get remove -y build-essential libssl-dev && \
apt-get autoremove -y && \
rm -rf /var/lib/apt && \
pip uninstall -y micropipenv
Expand Down Expand Up @@ -53,4 +53,4 @@ COPY wort wort
COPY config config

ENV RAYON_NUM_THREADS 3
CMD celery -A wort.blueprints.compute.tasks -Q compute_small,compute_medium --without-gossip --without-mingle --without-heartbeat -l INFO -c 1 worker
CMD celery -A wort.blueprints.compute.tasks -Q compute_small,compute_medium,genomes --without-gossip --without-mingle --without-heartbeat -l INFO -c 1 worker
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""longer name and url for datasets
Revision ID: 93ff4335d83d
Revises: 96052cfeea9b
Create Date: 2020-07-17 00:36:42.235949
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '93ff4335d83d'
down_revision = '96052cfeea9b'
branch_labels = None
depends_on = None


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.add_column('dataset', sa.Column('name', sa.String(length=160), nullable=True))
op.add_column('dataset', sa.Column('path', sa.String(length=340), nullable=True))
# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column('dataset', 'path')
op.drop_column('dataset', 'name')
# ### end Alembic commands ###
25 changes: 23 additions & 2 deletions wort/api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@ paths:
'202':
description: Compute task accepted

'/compute/genomes/{assembly_accession}':
post:
summary: Request to compute a new signature from a GenBank/RefSeq dataset
operationId: wort.blueprints.compute.views.compute_genomes
security:
- token: []
parameters:
- $ref: '#/components/parameters/assembly_accession'
responses:
'202':
description: Compute task accepted

'/view/{public_db}/{dataset_id}':
get:
summary: Return a signature
Expand Down Expand Up @@ -95,7 +107,16 @@ components:
required: true
schema:
type: string
pattern: '\w{3}\d{6,8}'
pattern: '^\w{3}\d{6,8}$'

assembly_accession:
name: assembly_accession
description: Accession number for a GenBank/RefSeq dataset
in: path
required: true
schema:
type: string
pattern: '^\w{3}_\d{9}\.\d{1,2}$'

public_db:
name: public_db
Expand All @@ -104,7 +125,7 @@ components:
required: true
schema:
type: string
pattern: sra|img
pattern: "^sra|^img|^genomes"

dataset_id:
name: dataset_id
Expand Down
80 changes: 37 additions & 43 deletions wort/blueprints/compute/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,37 @@ def compute(sra_id):


@celery.task
def compute_syrah(sra_id):
def compute_genomes(accession, path, name):
import boto3
import botocore
from snakemake import shell

with NamedTemporaryFile("w+t") as f:
conn = boto3.client("s3")
s3 = boto3.resource("s3")

key_path = os.path.join("sigs", accession + ".sig")
try:
s3.Object("wort-genomes", key_path).load()
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "404":
pass # Object does not exist, let's compute it later
else:
# Something else has gone wrong
raise

else:
# The key already exists
return

with NamedTemporaryFile("w+b") as f:
try:
shell(
"fastq-dump -A {sra_id} -Z | syrah | "
"sourmash compute -k 21 --dna - -o {output} --name {sra_id}".format(
sra_id=sra_id, output=f.name
)
f"sourmash compute -k 21,31,51 "
" --scaled 1000 "
" --track-abundance "
" --name {name:q} "
" -o {f.name} "
" <(curl {path})"
)
except CalledProcessError as e:
# We ignore SIGPIPE, since it is informational (and makes sense,
Expand All @@ -90,42 +111,15 @@ def compute_syrah(sra_id):
raise e

f.seek(0)
return f.read()

compressed_fp = BytesIO()
with gzip.GzipFile(fileobj=compressed_fp, mode="wb") as gz:
shutil.copyfileobj(f, gz)

@celery.task(bind=True, ignore_result=True)
def compute_syrah_to_s3(self, sra_id):
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from snakemake import shell

conn = S3Connection()
bucket = conn.get_bucket("soursigs-done")

# Check if file is already on S3
key = bucket.get_key(os.path.join("sigs", sra_id))
if key is None: # result not available yet, compute it
with NamedTemporaryFile("w+t") as f:
try:
shell(
"fastq-dump -A {sra_id} -Z | syrah | "
"sourmash compute -k 21 --dna - -o {output} --name {sra_id}".format(
sra_id=sra_id, output=f.name
)
)
except CalledProcessError as e:
# We ignore SIGPIPE, since it is informational (and makes sense,
# it happens because `head` is closed and `fastq-dump` can't pipe
# its output anymore. More details:
# http://www.pixelbeat.org/programming/sigpipe_handling.html
if e.returncode != 141:
# TODO: save error to bucket, on 'errors/{sra_id}'?
raise e

# save to S3
k = Key(bucket)
k.key = os.path.join("sigs", sra_id)
f.seek(0)
k.set_contents_from_string(f.read())

raise Ignore()
conn.put_object(
Body=compressed_fp.getvalue(),
Bucket="wort-genomes",
Key=key_path,
ContentType="application/json",
ContentEncoding="gzip",
)
28 changes: 28 additions & 0 deletions wort/blueprints/compute/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,31 @@ def compute_sra(sra_id):

task = tasks.compute.apply_async(args=[sra_id], queue=queue)
return jsonify({"status": "Submitted", "task_id": task.id}), 202


def compute_genomes(assembly_accession):
from . import tasks

dataset = Dataset.query.filter_by(id=assembly_accession).first()
if dataset is None:
# We don't have information about it, how to query GenBank/RefSeq for
# info?
return jsonify({"status": "Metadata not available"}), 404

is_computed = dataset.ipfs is not None
if is_computed:
return jsonify({"status": "Signature already calculated"}), 202

# Not computed yet, send to proper queue
if dataset.size_MB <= 300:
queue = "compute_small"
elif dataset.size_MB > 300 and dataset.size_MB < 1600:
queue = "compute_medium"
else:
queue = "compute_large"

task = tasks.compute_genomes.apply_async(
args=[dataset.id, dataset.path, dataset.name],
queue="genomes")
# queue=queue)
return jsonify({"status": "Submitted", "task_id": task.id}), 202
2 changes: 1 addition & 1 deletion wort/blueprints/viewer/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# @viewer.route("/view/<db>/<dataset_id>")
def view_s3(public_db, dataset_id):

if public_db not in ("sra", "img"):
if public_db not in ("sra", "img", "genomes"):
return "Database not supported", 404

import boto3
Expand Down
2 changes: 2 additions & 0 deletions wort/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,5 @@ class Dataset(db.Model):
database_id = db.Column(db.String(20), db.ForeignKey("database.id"))
size_MB = db.Column(db.Integer, nullable=True)
ipfs = db.Column(db.String(60), nullable=True)
path = db.Column(db.String(340), nullable=True)
name = db.Column(db.String(160), nullable=True)
10 changes: 7 additions & 3 deletions wort/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@

<div>
Welcome to wort! It's a database for sourmash signatures, currently focused on
indexing microbial datasets from
<a href='https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?'>NCBI Sequence Read Archive</a>
and the <a href='https://img.jgi.doe.gov/'>JGI Integrated Microbial Genomes and Microbiomes</a> portals.
indexing datasets from the
<a href='https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?'>NCBI Sequence Read Archive</a>,
the <a href='https://img.jgi.doe.gov/'>JGI Integrated Microbial Genomes and Microbiomes</a> portal,
and <a href='https://www.ncbi.nlm.nih.gov/assembly/'>NCBI Assembly</a> resources
(<a href='https://www.ncbi.nlm.nih.gov/genbank'>GenBank</a> and
<a href='https://www.ncbi.nlm.nih.gov/refseq/'>RefSeq</a>).
</div>

<div>
Expand All @@ -18,6 +21,7 @@
<ul>
<li>SRA: <a href='/view/sra/DRR013902/'>DRR013902</a></li>
<li>IMG: <a href='/view/img/2522125045'>2728369338</a></li>
<li>NCBI Assemblies: <a href='/view/genomes/GCF_000246355.1'>GCF_000246355.1</a></li>
</ul>
</p>
</div>
Expand Down

0 comments on commit 6b6958d

Please sign in to comment.