Add support for genbank/refseq assemblies (#30)

* ncbi genomes computation * longer name and path * worker 0.2.7 * expose genomes in urls * add links to index
sourmash-bio · Jul 17, 2020 · 6b6958d · 6b6958d
1 parent c074b21
commit 6b6958d
Show file tree

Hide file tree

Showing 8 changed files with 131 additions and 52 deletions.
diff --git a/Dockerfile.worker b/Dockerfile.worker
@@ -1,6 +1,6 @@
 # Build decoct
 FROM rust:1-slim-buster as builder
-RUN cargo install --git https://github.com/luizirber/decoct --rev 96dee1d
+RUN cargo install --git https://github.com/luizirber/decoct --rev 2d3f980cbfd64b028adf34620cc33ff501814a6b
 
 # Build worker image
 FROM python:3.8.3-slim-buster
@@ -20,7 +20,7 @@ RUN apt-get update && \
     curl --output sratoolkit.tar.gz https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.7/sratoolkit.2.10.7-ubuntu64.tar.gz && \
     tar xf sratoolkit.tar.gz && \
     rm sratoolkit.tar.gz && \
-    apt-get remove -y curl build-essential libssl-dev && \
+    apt-get remove -y build-essential libssl-dev && \
     apt-get autoremove -y && \
     rm -rf /var/lib/apt && \
     pip uninstall -y micropipenv
@@ -53,4 +53,4 @@ COPY wort wort
 COPY config config
 
 ENV RAYON_NUM_THREADS 3
-CMD celery -A wort.blueprints.compute.tasks -Q compute_small,compute_medium --without-gossip --without-mingle --without-heartbeat -l INFO -c 1 worker
+CMD celery -A wort.blueprints.compute.tasks -Q compute_small,compute_medium,genomes --without-gossip --without-mingle --without-heartbeat -l INFO -c 1 worker
diff --git a/migrations/versions/93ff4335d83d_longer_name_and_url_for_datasets.py b/migrations/versions/93ff4335d83d_longer_name_and_url_for_datasets.py
@@ -0,0 +1,30 @@
+"""longer name and url for datasets
+
+Revision ID: 93ff4335d83d
+Revises: 96052cfeea9b
+Create Date: 2020-07-17 00:36:42.235949
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '93ff4335d83d'
+down_revision = '96052cfeea9b'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('dataset', sa.Column('name', sa.String(length=160), nullable=True))
+    op.add_column('dataset', sa.Column('path', sa.String(length=340), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('dataset', 'path')
+    op.drop_column('dataset', 'name')
+    # ### end Alembic commands ###
diff --git a/wort/api.yaml b/wort/api.yaml
@@ -27,6 +27,18 @@ paths:
         '202':
           description: Compute task accepted
 
+  '/compute/genomes/{assembly_accession}':
+    post:
+      summary: Request to compute a new signature from a GenBank/RefSeq dataset
+      operationId: wort.blueprints.compute.views.compute_genomes
+      security:
+        - token: []
+      parameters:
+        - $ref: '#/components/parameters/assembly_accession'
+      responses:
+        '202':
+          description: Compute task accepted
+
   '/view/{public_db}/{dataset_id}':
     get:
       summary: Return a signature
@@ -95,7 +107,16 @@ components:
       required: true
       schema:
         type: string
-        pattern: '\w{3}\d{6,8}'
+        pattern: '^\w{3}\d{6,8}$'
+
+    assembly_accession:
+      name: assembly_accession
+      description: Accession number for a GenBank/RefSeq dataset
+      in: path
+      required: true
+      schema:
+        type: string
+        pattern: '^\w{3}_\d{9}\.\d{1,2}$'
 
     public_db:
       name: public_db
@@ -104,7 +125,7 @@ components:
       required: true
       schema:
         type: string
-        pattern: sra|img
+        pattern: "^sra|^img|^genomes"
 
     dataset_id:
       name: dataset_id

diff --git a/wort/blueprints/compute/tasks.py b/wort/blueprints/compute/tasks.py
@@ -70,16 +70,37 @@ def compute(sra_id):
 
 
 @celery.task
-def compute_syrah(sra_id):
+def compute_genomes(accession, path, name):
+    import boto3
+    import botocore
     from snakemake import shell
 
-    with NamedTemporaryFile("w+t") as f:
+    conn = boto3.client("s3")
+    s3 = boto3.resource("s3")
+
+    key_path = os.path.join("sigs", accession + ".sig")
+    try:
+        s3.Object("wort-genomes", key_path).load()
+    except botocore.exceptions.ClientError as e:
+        if e.response["Error"]["Code"] == "404":
+            pass  # Object does not exist, let's compute it later
+        else:
+            # Something else has gone wrong
+            raise
+
+    else:
+        # The key already exists
+        return
+
+    with NamedTemporaryFile("w+b") as f:
         try:
             shell(
-                "fastq-dump -A {sra_id} -Z | syrah | "
-                "sourmash compute -k 21 --dna - -o {output} --name {sra_id}".format(
-                    sra_id=sra_id, output=f.name
-                )
+               f"sourmash compute -k 21,31,51 "
+                "  --scaled 1000 "
+                "  --track-abundance "
+                "  --name {name:q} "
+                "  -o {f.name} "
+                "  <(curl {path})"
             )
         except CalledProcessError as e:
             # We ignore SIGPIPE, since it is informational (and makes sense,
@@ -90,42 +111,15 @@ def compute_syrah(sra_id):
                 raise e
 
         f.seek(0)
-        return f.read()
 
+        compressed_fp = BytesIO()
+        with gzip.GzipFile(fileobj=compressed_fp, mode="wb") as gz:
+            shutil.copyfileobj(f, gz)
 
-@celery.task(bind=True, ignore_result=True)
-def compute_syrah_to_s3(self, sra_id):
-    from boto.s3.connection import S3Connection
-    from boto.s3.key import Key
-    from snakemake import shell
-
-    conn = S3Connection()
-    bucket = conn.get_bucket("soursigs-done")
-
-    # Check if file is already on S3
-    key = bucket.get_key(os.path.join("sigs", sra_id))
-    if key is None:  # result not available yet, compute it
-        with NamedTemporaryFile("w+t") as f:
-            try:
-                shell(
-                    "fastq-dump -A {sra_id} -Z | syrah | "
-                    "sourmash compute -k 21 --dna - -o {output} --name {sra_id}".format(
-                        sra_id=sra_id, output=f.name
-                    )
-                )
-            except CalledProcessError as e:
-                # We ignore SIGPIPE, since it is informational (and makes sense,
-                # it happens because `head` is closed and `fastq-dump` can't pipe
-                # its output anymore. More details:
-                # http://www.pixelbeat.org/programming/sigpipe_handling.html
-                if e.returncode != 141:
-                    # TODO: save error to bucket, on 'errors/{sra_id}'?
-                    raise e
-
-            # save to S3
-            k = Key(bucket)
-            k.key = os.path.join("sigs", sra_id)
-            f.seek(0)
-            k.set_contents_from_string(f.read())
-
-            raise Ignore()
+        conn.put_object(
+            Body=compressed_fp.getvalue(),
+            Bucket="wort-genomes",
+            Key=key_path,
+            ContentType="application/json",
+            ContentEncoding="gzip",
+        )
diff --git a/wort/blueprints/compute/views.py b/wort/blueprints/compute/views.py
@@ -43,3 +43,31 @@ def compute_sra(sra_id):
 
     task = tasks.compute.apply_async(args=[sra_id], queue=queue)
     return jsonify({"status": "Submitted", "task_id": task.id}), 202
+
+
+def compute_genomes(assembly_accession):
+    from . import tasks
+
+    dataset = Dataset.query.filter_by(id=assembly_accession).first()
+    if dataset is None:
+        # We don't have information about it, how to query GenBank/RefSeq for
+        # info?
+        return jsonify({"status": "Metadata not available"}), 404
+
+    is_computed = dataset.ipfs is not None
+    if is_computed:
+        return jsonify({"status": "Signature already calculated"}), 202
+
+    # Not computed yet, send to proper queue
+    if dataset.size_MB <= 300:
+        queue = "compute_small"
+    elif dataset.size_MB > 300 and dataset.size_MB < 1600:
+        queue = "compute_medium"
+    else:
+        queue = "compute_large"
+
+    task = tasks.compute_genomes.apply_async(
+        args=[dataset.id, dataset.path, dataset.name],
+        queue="genomes")
+#        queue=queue)
+    return jsonify({"status": "Submitted", "task_id": task.id}), 202
diff --git a/wort/blueprints/viewer/views.py b/wort/blueprints/viewer/views.py
@@ -6,7 +6,7 @@
 # @viewer.route("/view/<db>/<dataset_id>")
 def view_s3(public_db, dataset_id):
 
-    if public_db not in ("sra", "img"):
+    if public_db not in ("sra", "img", "genomes"):
         return "Database not supported", 404
 
     import boto3

diff --git a/wort/models.py b/wort/models.py
@@ -71,3 +71,5 @@ class Dataset(db.Model):
     database_id = db.Column(db.String(20), db.ForeignKey("database.id"))
     size_MB = db.Column(db.Integer, nullable=True)
     ipfs = db.Column(db.String(60), nullable=True)
+    path = db.Column(db.String(340), nullable=True)
+    name = db.Column(db.String(160), nullable=True)
diff --git a/wort/templates/index.html b/wort/templates/index.html
@@ -3,9 +3,12 @@
 
 <div>
 Welcome to wort! It's a database for sourmash signatures, currently focused on
-indexing microbial datasets from
-<a href='https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?'>NCBI Sequence Read Archive</a>
-and the <a href='https://img.jgi.doe.gov/'>JGI Integrated Microbial Genomes and Microbiomes</a> portals.
+indexing datasets from the
+<a href='https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?'>NCBI Sequence Read Archive</a>,
+the <a href='https://img.jgi.doe.gov/'>JGI Integrated Microbial Genomes and Microbiomes</a> portal,
+and <a href='https://www.ncbi.nlm.nih.gov/assembly/'>NCBI Assembly</a> resources
+(<a href='https://www.ncbi.nlm.nih.gov/genbank'>GenBank</a> and
+<a href='https://www.ncbi.nlm.nih.gov/refseq/'>RefSeq</a>).
 </div>
 
 <div>
@@ -18,6 +21,7 @@
 <ul>
   <li>SRA: <a href='/view/sra/DRR013902/'>DRR013902</a></li>
   <li>IMG: <a href='/view/img/2522125045'>2728369338</a></li>
+  <li>NCBI Assemblies: <a href='/view/genomes/GCF_000246355.1'>GCF_000246355.1</a></li>
 </ul>
 </p>
 </div>