Merge pull request #71 from genomehubs:rjchallis/issue44

rjchallis/issue44
genomehubs · Apr 9, 2021 · 7eed9ff · 7eed9ff
2 parents f8897a0 + 4f36507
commit 7eed9ff
Show file tree

Hide file tree

Showing 5 changed files with 198 additions and 19 deletions.
diff --git a/src/genomehubs/lib/attributes.py b/src/genomehubs/lib/attributes.py
@@ -37,18 +37,63 @@ def index(es, group, attributes, opts, *, index_type="attribute"):
     return template, stream
 
 
+# def fetch_types(es, opts):
+#     """Fetch all existing types."""
+#     template = index_template(opts, index_type="attribute")
+#     body = {
+#         "id": "attribute_types",
+#         "params": {},
+#     }
+#     entries = stream_template_search_results(
+#         es, index=template["index_name"], body=body
+#     )
+#     return {entry["key"]: entry for entry in entries}
+
+
+def add_attribute_sources(name, obj, attributes):
+    """Generate a list of attribute sources."""
+    for key, value in attributes[name].items():
+        if key.startswith("source"):
+            if key in obj:
+                if not isinstance(obj[key], list):
+                    obj[key] = [obj[key]]
+                obj[key].append(value)
+            else:
+                obj[key] = value
+
+
 def index_types(es, types_name, types, opts):
     """Index types into Elasticsearch."""
+    # TODO: fetch existing types so new sources add to, not overwrite, existing ones
+    try:
+        attributes = fetch_types(es, types_name, opts)
+    except Exception:
+        attributes = {}
     if "attributes" in types:
-        if "defaults" in types and "attributes" in types["defaults"]:
-            for key, value in types["attributes"].items():
+        new_attributes = {}
+        existing_attributes = {}
+        for key, value in types["attributes"].items():
+            if "defaults" in types and "attributes" in types["defaults"]:
                 value = {**types["defaults"]["attributes"], **value}
-                types["attributes"][key] = value
+                # types["attributes"][key] = value
+            if key in attributes:
+                existing_attributes[key] = value
+                add_attribute_sources(key, value, attributes)
+            else:
+                new_attributes[key] = value
         template, stream = index(
-            es, types_name, types["attributes"], opts, index_type="attribute"
+            es, types_name, new_attributes, opts, index_type="attribute"
+        )
+        template, update_stream = index(
+            es,
+            types_name,
+            existing_attributes,
+            opts,
+            index_type="attribute",
         )
         load_mapping(es, template["name"], template["mapping"])
         index_stream(es, template["index_name"], stream)
+        index_stream(es, template["index_name"], update_stream, _op_type="update")
     if "taxon_names" in types:
         if "defaults" in types and "taxon_names" in types["defaults"]:
             for key, value in types["names"].items():

diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
@@ -356,7 +356,9 @@ def add_attributes(
                 else:
                     attribute = {"identifier": validated, "class": key}
                 attribute.update(meta)
-                if source is not None:
+                if "source" in types[key]:
+                    attribute.update({"source": types[key]["source"]})
+                elif source is not None:
                     attribute.update({"source": source})
                 attributes.append(attribute)
     if attribute_values:
@@ -470,6 +472,7 @@ def validate_types_file(types_file, dir_path):
         if key.startswith("display") or key.startswith("taxon"):
             defaults["attributes"].update({key: value})
         elif key.startswith("source"):
+            defaults["attributes"].update({key: value})
             defaults["metadata"].update({key: value})
     types.update({"defaults": defaults})
     data = tofile.open_file_handle(Path(dir_path) / types["file"]["name"])
@@ -550,9 +553,12 @@ def process_row(types, names, row):
     taxon_data = {}
     taxon_types = {}
     if "is_primary_value" in data["metadata"]:
-        data["metadata"]["is_primary_value"] = bool(
-            int(data["metadata"]["is_primary_value"])
-        )
+        try:
+            data["metadata"]["is_primary_value"] = bool(
+                int(data["metadata"]["is_primary_value"])
+            )
+        except ValueError:
+            data["metadata"]["is_primary_value"] = False
     for attr_type in list(["attributes", "identifiers"]):
         if attr_type in data and data[attr_type]:
             (

diff --git a/src/genomehubs/lib/ncbi.py b/src/genomehubs/lib/ncbi.py
@@ -168,7 +168,7 @@ def parse_listing(listing, collection, opts):
     return parsed
 
 
-def refseq_organelle_parser(collections, opts, *args, **kwargs):
+def refseq_organelle_parser(collections, opts):
     """Fetch and parse RefSeq organelle collections."""
     parsed = []
     if isinstance(collections, tuple):
@@ -189,8 +189,8 @@ def refseq_organelle_parser(collections, opts, *args, **kwargs):
 def parse_ncbi_datasets_record(record, parsed):
     """Parse a single NCBI datasets record."""
     obj = {}
-    for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
-        obj[key] = record.get(key, None)
+    for key in ("taxId", "organismName", "commonName", "isolate", "sex"):
+        obj[key] = record.get(key, "None")
     assemblyInfo = record.get("assemblyInfo", {})
     for key in (
         "assemblyLevel",
@@ -204,9 +204,15 @@ def parse_ncbi_datasets_record(record, parsed):
         "submitter",
     ):
         obj[key] = assemblyInfo.get(key, None)
+        if key == "refseqCategory":
+            if obj[key] == "representative genome":
+                obj["primaryValue"] = 1
+            else:
+                obj["primaryValue"] = None
     if obj["refseqAssmAccession"] == "na":
         obj["refseqAssmAccession"] = None
         obj["refseqCategory"] = None
+        obj["primaryValue"] = None
     annotationInfo = record.get("annotationInfo", {})
     if annotationInfo:
         annot = {}
@@ -233,13 +239,77 @@ def parse_ncbi_datasets_record(record, parsed):
     parsed[obj["genbankAssmAccession"]] = obj
 
 
-def ncbi_genome_parser(directory, opts, *args, **kwargs):
+def ncbi_genome_parser(_params, opts, *, types=None, names=None):
     """Parse NCBI Datasets genome report."""
     parsed = {}
     with tofile.open_file_handle(
-        "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
+        "%s/ncbi_dataset/data/assembly_data_report.jsonl" % opts["ncbi-datasets-genome"]
     ) as report:
         for line in report:
             record = ujson.loads(line)
             parse_ncbi_datasets_record(record, parsed)
     return [value for value in parsed.values()]
+
+
+# def parse_ncbi_datasets_summary(record, parsed):
+#     """Parse a single NCBI datasets summary."""
+#     obj = {}
+#     return
+#     for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
+#         obj[key] = record.get(key, None)
+#     assemblyInfo = record.get("assemblyInfo", {})
+#     for key in ("assembly_category", "assembly_level"):
+#         obj[key] = assemblyInfo.get(key, None)
+#     # "assembly_accession": "GCF_900239965.1",
+#     if obj["refseqAssmAccession"] == "na":
+#         obj["refseqAssmAccession"] = None
+#         obj["refseqCategory"] = None
+#     annotationInfo = record.get("annotationInfo", {})
+#     if annotationInfo:
+#         annot = {}
+#         for key in ("name", "releaseDate", "reportUrl", "source"):
+#             annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
+#         if annot and "stats" in annotationInfo:
+#             geneCounts = annotationInfo["stats"].get("geneCounts", None)
+#             for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
+#                 annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
+#             if obj["genbankAssmAccession"] in parsed:
+#                 parsed[obj["genbankAssmAccession"]].update(annot)
+#                 return
+#             obj.update(annot)
+#     bioprojects = []
+#     for lineage in assemblyInfo.get("bioprojectLineage", []):
+#         for bioproject in lineage["bioprojects"]:
+#             bioprojects.append(bioproject["accession"])
+#     obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
+#     assemblyStats = record.get("assemblyStats", {})
+#     obj.update(assemblyStats)
+#     wgsInfo = record.get("wgsInfo", {})
+#     for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
+#         obj[key] = wgsInfo.get(key, None)
+#     parsed[obj["genbankAssmAccession"]] = obj
+
+
+# def ncbi_datasets_summary_parser(_params, opts):
+#     """Fetch and parse NCBI Datasets summary."""
+#     parsed = {}
+#     datasets = check_output(
+#         ["datasets", "summary", "genome", "taxon", opts["ncbi-datasets-summary"]]
+#     )
+#     data = ujson.loads(datasets)
+#     if "assemblies" not in data:
+#         LOGGER.error("unable to fetch assemblies for %s", opts["ncbi-datasets-summary"])
+#         print(data)
+#         sys.exit(1)
+#     for record in data["assemblies"]:
+#         parse_ncbi_datasets_summary(record, parsed)
+#     print(parsed)
+#     quit()
+#     # parsed = {}
+#     # with tofile.open_file_handle(
+#     #     "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
+#     # ) as report:
+#     #     for line in report:
+#     #         record = ujson.loads(line)
+#     #         parse_ncbi_datasets_record(record, parsed)
+#     return [value for value in parsed.values()]
diff --git a/src/genomehubs/lib/parse.py b/src/genomehubs/lib/parse.py
@@ -46,6 +46,8 @@
 from .config import config
 from .hub import load_types
 from .hub import order_parsed_fields
+
+# from .ncbi import ncbi_datasets_summary_parser
 from .ncbi import ncbi_genome_parser
 from .ncbi import refseq_organelle_parser
 from .version import __version__
@@ -60,6 +62,11 @@
         "params": None,
         "types": "assembly",
     },
+    # "ncbi-datasets-summary": {
+    #     "func": ncbi_datasets_summary_parser,
+    #     "params": None,
+    #     "types": "assembly",
+    # },
     "refseq-mitochondria": {
         "func": refseq_organelle_parser,
         "params": ("mitochondrion"),

diff --git a/src/genomehubs/templates/assembly.types.yaml b/src/genomehubs/templates/assembly.types.yaml
@@ -8,8 +8,8 @@ file:
 taxonomy:
   taxon_id:
     header: taxId
-  species:
-    header: speciesName
+  taxon:
+    header: organismName
 names:
   common_name:
     header: commonName
@@ -63,7 +63,11 @@ attributes:
     taxon_display_group: assembly
     taxon_name: gene_count
     taxon_key: gene_count
-    taxon_summary: median
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
     taxon_display_level: 2
     taxon_bins:
       min: 0
@@ -103,7 +107,9 @@ attributes:
     taxon_name: sample_sex
     taxon_key: sample_sex
     taxon_display_name: Sample sex
-    taxon_summary: list
+    taxon_summary:
+      - primary
+      - list
     taxon_display_level: 2
   isolate:
     display_level: 2
@@ -115,6 +121,14 @@ attributes:
     header: assemblyLevel
     display_name: Assembly level
     type: keyword
+    taxon_display_group: assembly
+    taxon_name: assembly_level
+    taxon_key: assembly_level
+    taxon_display_name: Assembly level
+    taxon_summary:
+      - primary
+      - list
+    taxon_display_level: 2
   assembly_type:
     display_level: 2
     header: assemblyType
@@ -133,10 +147,13 @@ attributes:
     taxon_name: assembly_span
     taxon_key: assembly_span
     taxon_display_name: Assembly span
-    taxon_summary: median
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
     taxon_traverse: median
     taxon_traverse_direction: both
-    taxon_traverse_limit: superkingdom
     taxon_display_level: 1
     taxon_bins:
       min: 6
@@ -161,6 +178,22 @@ attributes:
     header: contigN50
     type: long
     units: bases
+    taxon_display_group: assembly
+    taxon_name: contig_n50
+    taxon_key: contig_n50
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
+    taxon_traverse: median
+    taxon_traverse_direction: both
+    taxon_display_level: 2
+    taxon_bins:
+      min: 4
+      max: 9
+      count: 10
+      scale: log10
   contig_l50:
     display_group: metrics
     display_level: 2
@@ -179,6 +212,22 @@ attributes:
     header: scaffoldN50
     type: long
     units: bases
+    taxon_display_group: assembly
+    taxon_name: scaffold_n50
+    taxon_key: scaffold_n50
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
+    taxon_traverse: median
+    taxon_traverse_direction: both
+    taxon_display_level: 2
+    taxon_bins:
+      min: 4
+      max: 9
+      count: 10
+      scale: log10
   scaffold_l50:
     display_group: metrics
     display_level: 2
@@ -242,3 +291,5 @@ attributes:
 metadata:
   source_slug:
     header: genbankAssmAccession
+  is_primary_value:
+    header: primaryValue