From 943cc278d29897de3d86748ffb538cf54f395457 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 3 Mar 2021 08:22:48 +0000
Subject: [PATCH 1/5] begin adding NCBI datasets summary parser

---
 src/genomehubs/lib/ncbi.py  | 66 +++++++++++++++++++++++++++++++++++++
 src/genomehubs/lib/parse.py |  8 +++++
 2 files changed, 74 insertions(+)

diff --git a/src/genomehubs/lib/ncbi.py b/src/genomehubs/lib/ncbi.py
index 8ab16b1d..a2acac17 100644
--- a/src/genomehubs/lib/ncbi.py
+++ b/src/genomehubs/lib/ncbi.py
@@ -3,7 +3,9 @@
 
 import gzip
 import re
+import sys
 from collections import Counter
+from subprocess import check_output
 
 import ujson
 from Bio import SeqIO
@@ -243,3 +245,67 @@ def ncbi_genome_parser(directory, opts):
             record = ujson.loads(line)
             parse_ncbi_datasets_record(record, parsed)
     return [value for value in parsed.values()]
+
+
+def parse_ncbi_datasets_summary(record, parsed):
+    """Parse a single NCBI datasets summary."""
+    obj = {}
+    return
+    for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
+        obj[key] = record.get(key, None)
+    assemblyInfo = record.get("assemblyInfo", {})
+    for key in ("assembly_category", "assembly_level"):
+        obj[key] = assemblyInfo.get(key, None)
+    # "assembly_accession": "GCF_900239965.1",
+    if obj["refseqAssmAccession"] == "na":
+        obj["refseqAssmAccession"] = None
+        obj["refseqCategory"] = None
+    annotationInfo = record.get("annotationInfo", {})
+    if annotationInfo:
+        annot = {}
+        for key in ("name", "releaseDate", "reportUrl", "source"):
+            annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
+        if annot and "stats" in annotationInfo:
+            geneCounts = annotationInfo["stats"].get("geneCounts", None)
+            for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
+                annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
+            if obj["genbankAssmAccession"] in parsed:
+                parsed[obj["genbankAssmAccession"]].update(annot)
+                return
+            obj.update(annot)
+    bioprojects = []
+    for lineage in assemblyInfo.get("bioprojectLineage", []):
+        for bioproject in lineage["bioprojects"]:
+            bioprojects.append(bioproject["accession"])
+    obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
+    assemblyStats = record.get("assemblyStats", {})
+    obj.update(assemblyStats)
+    wgsInfo = record.get("wgsInfo", {})
+    for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
+        obj[key] = wgsInfo.get(key, None)
+    parsed[obj["genbankAssmAccession"]] = obj
+
+
+def ncbi_datasets_summary_parser(_params, opts):
+    """Fetch and parse NCBI Datasets summary."""
+    parsed = {}
+    datasets = check_output(
+        ["datasets", "summary", "genome", "taxon", opts["ncbi-datasets-summary"]]
+    )
+    data = ujson.loads(datasets)
+    if "assemblies" not in data:
+        LOGGER.error("unable to fetch assemblies for %s", opts["ncbi-datasets-summary"])
+        print(data)
+        sys.exit(1)
+    for record in data["assemblies"]:
+        parse_ncbi_datasets_summary(record, parsed)
+    print(parsed)
+    quit()
+    # parsed = {}
+    # with tofile.open_file_handle(
+    #     "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
+    # ) as report:
+    #     for line in report:
+    #         record = ujson.loads(line)
+    #         parse_ncbi_datasets_record(record, parsed)
+    return [value for value in parsed.values()]
diff --git a/src/genomehubs/lib/parse.py b/src/genomehubs/lib/parse.py
index cbef1219..43e47401 100644
--- a/src/genomehubs/lib/parse.py
+++ b/src/genomehubs/lib/parse.py
@@ -7,6 +7,7 @@
     genomehubs parse [--btk] [--btk-root STRING...]
                      [--wikidata PATH] [--wikidata-root STRING...] [--wikidata-xref STRING...]
                      [--gbif] [--gbif-root STRING...] [--gbif-xref STRING...]
+                     [--ncbi-datasets-summary INT]
                      [--ncbi-datasets-genome PATH] [--outfile PATH]
                      [--refseq-mitochondria] [--refseq-organelles]
                      [--refseq-plastids] [--refseq-root NAME]
@@ -21,6 +22,7 @@
     --wikidata PATH              Parse taxa in WikiData dump
     --wikidata-root STRING       WikiData taxon ID of root taxon
     --wikidata-xref STRING       Include link to external reference from WikiData (e.g. NBN, BOLD)
+    --ncbi-datasets-summary INT  Fetch and parse NCBI Datasets summary for a root taxId
     --ncbi-datasets-genome PATH  Parse NCBI Datasets genome directory
     --outfile PATH               Save parsed output to file
     --refseq-mitochondria        Parse mitochondrial genomes from the NCBI RefSeq
@@ -47,6 +49,7 @@
 from .gbif import gbif_parser
 from .hub import load_types
 from .hub import order_parsed_fields
+from .ncbi import ncbi_datasets_summary_parser
 from .ncbi import ncbi_genome_parser
 from .ncbi import refseq_organelle_parser
 from .version import __version__
@@ -62,6 +65,11 @@
         "params": None,
         "types": "assembly",
     },
+    "ncbi-datasets-summary": {
+        "func": ncbi_datasets_summary_parser,
+        "params": None,
+        "types": "assembly",
+    },
     "refseq-mitochondria": {
         "func": refseq_organelle_parser,
         "params": ("mitochondrion"),

From b03e3cac75edeb3d4141cf940acb9c2b7244c39d Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Thu, 8 Apr 2021 13:48:08 +0100
Subject: [PATCH 2/5] update default ncbi datasets config

---
 src/genomehubs/lib/ncbi.py                   | 124 +++++++++----------
 src/genomehubs/lib/parse.py                  |  15 ++-
 src/genomehubs/templates/assembly.types.yaml |  40 +++++-
 3 files changed, 104 insertions(+), 75 deletions(-)

diff --git a/src/genomehubs/lib/ncbi.py b/src/genomehubs/lib/ncbi.py
index a2acac17..f8c0589a 100644
--- a/src/genomehubs/lib/ncbi.py
+++ b/src/genomehubs/lib/ncbi.py
@@ -3,9 +3,7 @@
 
 import gzip
 import re
-import sys
 from collections import Counter
-from subprocess import check_output
 
 import ujson
 from Bio import SeqIO
@@ -191,7 +189,7 @@ def refseq_organelle_parser(collections, opts):
 def parse_ncbi_datasets_record(record, parsed):
     """Parse a single NCBI datasets record."""
     obj = {}
-    for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
+    for key in ("taxId", "organismName", "commonName", "isolate", "sex"):
         obj[key] = record.get(key, None)
     assemblyInfo = record.get("assemblyInfo", {})
     for key in (
@@ -247,65 +245,65 @@ def ncbi_genome_parser(directory, opts):
     return [value for value in parsed.values()]
 
 
-def parse_ncbi_datasets_summary(record, parsed):
-    """Parse a single NCBI datasets summary."""
-    obj = {}
-    return
-    for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
-        obj[key] = record.get(key, None)
-    assemblyInfo = record.get("assemblyInfo", {})
-    for key in ("assembly_category", "assembly_level"):
-        obj[key] = assemblyInfo.get(key, None)
-    # "assembly_accession": "GCF_900239965.1",
-    if obj["refseqAssmAccession"] == "na":
-        obj["refseqAssmAccession"] = None
-        obj["refseqCategory"] = None
-    annotationInfo = record.get("annotationInfo", {})
-    if annotationInfo:
-        annot = {}
-        for key in ("name", "releaseDate", "reportUrl", "source"):
-            annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
-        if annot and "stats" in annotationInfo:
-            geneCounts = annotationInfo["stats"].get("geneCounts", None)
-            for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
-                annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
-            if obj["genbankAssmAccession"] in parsed:
-                parsed[obj["genbankAssmAccession"]].update(annot)
-                return
-            obj.update(annot)
-    bioprojects = []
-    for lineage in assemblyInfo.get("bioprojectLineage", []):
-        for bioproject in lineage["bioprojects"]:
-            bioprojects.append(bioproject["accession"])
-    obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
-    assemblyStats = record.get("assemblyStats", {})
-    obj.update(assemblyStats)
-    wgsInfo = record.get("wgsInfo", {})
-    for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
-        obj[key] = wgsInfo.get(key, None)
-    parsed[obj["genbankAssmAccession"]] = obj
+# def parse_ncbi_datasets_summary(record, parsed):
+#     """Parse a single NCBI datasets summary."""
+#     obj = {}
+#     return
+#     for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
+#         obj[key] = record.get(key, None)
+#     assemblyInfo = record.get("assemblyInfo", {})
+#     for key in ("assembly_category", "assembly_level"):
+#         obj[key] = assemblyInfo.get(key, None)
+#     # "assembly_accession": "GCF_900239965.1",
+#     if obj["refseqAssmAccession"] == "na":
+#         obj["refseqAssmAccession"] = None
+#         obj["refseqCategory"] = None
+#     annotationInfo = record.get("annotationInfo", {})
+#     if annotationInfo:
+#         annot = {}
+#         for key in ("name", "releaseDate", "reportUrl", "source"):
+#             annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
+#         if annot and "stats" in annotationInfo:
+#             geneCounts = annotationInfo["stats"].get("geneCounts", None)
+#             for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
+#                 annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
+#             if obj["genbankAssmAccession"] in parsed:
+#                 parsed[obj["genbankAssmAccession"]].update(annot)
+#                 return
+#             obj.update(annot)
+#     bioprojects = []
+#     for lineage in assemblyInfo.get("bioprojectLineage", []):
+#         for bioproject in lineage["bioprojects"]:
+#             bioprojects.append(bioproject["accession"])
+#     obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
+#     assemblyStats = record.get("assemblyStats", {})
+#     obj.update(assemblyStats)
+#     wgsInfo = record.get("wgsInfo", {})
+#     for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
+#         obj[key] = wgsInfo.get(key, None)
+#     parsed[obj["genbankAssmAccession"]] = obj
 
 
-def ncbi_datasets_summary_parser(_params, opts):
-    """Fetch and parse NCBI Datasets summary."""
-    parsed = {}
-    datasets = check_output(
-        ["datasets", "summary", "genome", "taxon", opts["ncbi-datasets-summary"]]
-    )
-    data = ujson.loads(datasets)
-    if "assemblies" not in data:
-        LOGGER.error("unable to fetch assemblies for %s", opts["ncbi-datasets-summary"])
-        print(data)
-        sys.exit(1)
-    for record in data["assemblies"]:
-        parse_ncbi_datasets_summary(record, parsed)
-    print(parsed)
-    quit()
-    # parsed = {}
-    # with tofile.open_file_handle(
-    #     "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
-    # ) as report:
-    #     for line in report:
-    #         record = ujson.loads(line)
-    #         parse_ncbi_datasets_record(record, parsed)
-    return [value for value in parsed.values()]
+# def ncbi_datasets_summary_parser(_params, opts):
+#     """Fetch and parse NCBI Datasets summary."""
+#     parsed = {}
+#     datasets = check_output(
+#         ["datasets", "summary", "genome", "taxon", opts["ncbi-datasets-summary"]]
+#     )
+#     data = ujson.loads(datasets)
+#     if "assemblies" not in data:
+#         LOGGER.error("unable to fetch assemblies for %s", opts["ncbi-datasets-summary"])
+#         print(data)
+#         sys.exit(1)
+#     for record in data["assemblies"]:
+#         parse_ncbi_datasets_summary(record, parsed)
+#     print(parsed)
+#     quit()
+#     # parsed = {}
+#     # with tofile.open_file_handle(
+#     #     "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
+#     # ) as report:
+#     #     for line in report:
+#     #         record = ujson.loads(line)
+#     #         parse_ncbi_datasets_record(record, parsed)
+#     return [value for value in parsed.values()]
diff --git a/src/genomehubs/lib/parse.py b/src/genomehubs/lib/parse.py
index 43e47401..64f80dbd 100644
--- a/src/genomehubs/lib/parse.py
+++ b/src/genomehubs/lib/parse.py
@@ -7,7 +7,6 @@
     genomehubs parse [--btk] [--btk-root STRING...]
                      [--wikidata PATH] [--wikidata-root STRING...] [--wikidata-xref STRING...]
                      [--gbif] [--gbif-root STRING...] [--gbif-xref STRING...]
-                     [--ncbi-datasets-summary INT]
                      [--ncbi-datasets-genome PATH] [--outfile PATH]
                      [--refseq-mitochondria] [--refseq-organelles]
                      [--refseq-plastids] [--refseq-root NAME]
@@ -22,7 +21,6 @@
     --wikidata PATH              Parse taxa in WikiData dump
     --wikidata-root STRING       WikiData taxon ID of root taxon
     --wikidata-xref STRING       Include link to external reference from WikiData (e.g. NBN, BOLD)
-    --ncbi-datasets-summary INT  Fetch and parse NCBI Datasets summary for a root taxId
     --ncbi-datasets-genome PATH  Parse NCBI Datasets genome directory
     --outfile PATH               Save parsed output to file
     --refseq-mitochondria        Parse mitochondrial genomes from the NCBI RefSeq
@@ -49,7 +47,8 @@
 from .gbif import gbif_parser
 from .hub import load_types
 from .hub import order_parsed_fields
-from .ncbi import ncbi_datasets_summary_parser
+
+# from .ncbi import ncbi_datasets_summary_parser
 from .ncbi import ncbi_genome_parser
 from .ncbi import refseq_organelle_parser
 from .version import __version__
@@ -65,11 +64,11 @@
         "params": None,
         "types": "assembly",
     },
-    "ncbi-datasets-summary": {
-        "func": ncbi_datasets_summary_parser,
-        "params": None,
-        "types": "assembly",
-    },
+    # "ncbi-datasets-summary": {
+    #     "func": ncbi_datasets_summary_parser,
+    #     "params": None,
+    #     "types": "assembly",
+    # },
     "refseq-mitochondria": {
         "func": refseq_organelle_parser,
         "params": ("mitochondrion"),
diff --git a/src/genomehubs/templates/assembly.types.yaml b/src/genomehubs/templates/assembly.types.yaml
index cb716fb8..8c3c210e 100644
--- a/src/genomehubs/templates/assembly.types.yaml
+++ b/src/genomehubs/templates/assembly.types.yaml
@@ -8,8 +8,8 @@ file:
 taxonomy:
   taxon_id:
     header: taxId
-  species:
-    header: speciesName
+  taxon:
+    header: organismName
 names:
   common_name:
     header: commonName
@@ -133,10 +133,12 @@ attributes:
     taxon_name: assembly_span
     taxon_key: assembly_span
     taxon_display_name: Assembly span
-    taxon_summary: median
+    taxon_summary:
+      - median
+      - min
+      - max
     taxon_traverse: median
     taxon_traverse_direction: both
-    taxon_traverse_limit: superkingdom
     taxon_display_level: 1
     taxon_bins:
       min: 6
@@ -161,6 +163,21 @@ attributes:
     header: contigN50
     type: long
     units: bases
+    taxon_display_group: assembly
+    taxon_name: contig_n50
+    taxon_key: contig_n50
+    taxon_summary:
+      - median
+      - min
+      - max
+    taxon_traverse: median
+    taxon_traverse_direction: both
+    taxon_display_level: 2
+    taxon_bins:
+      min: 4
+      max: 9
+      count: 10
+      scale: log10
   contig_l50:
     display_group: metrics
     display_level: 2
@@ -179,6 +196,21 @@ attributes:
     header: scaffoldN50
     type: long
     units: bases
+    taxon_display_group: assembly
+    taxon_name: scaffold_n50
+    taxon_key: scaffold_n50
+    taxon_summary:
+      - median
+      - min
+      - max
+    taxon_traverse: median
+    taxon_traverse_direction: both
+    taxon_display_level: 2
+    taxon_bins:
+      min: 4
+      max: 9
+      count: 10
+      scale: log10
   scaffold_l50:
     display_group: metrics
     display_level: 2

From a3baf237c5d4b8b0049e0386881c0a755cf83ee7 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Thu, 8 Apr 2021 15:32:45 +0100
Subject: [PATCH 3/5] set refseq as primary value

---
 src/genomehubs/lib/hub.py                    |  9 ++++++---
 src/genomehubs/lib/ncbi.py                   | 14 ++++++++++----
 src/genomehubs/templates/assembly.types.yaml |  5 +++++
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index a068f9a1..0dd5a0bd 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -550,9 +550,12 @@ def process_row(types, names, row):
     taxon_data = {}
     taxon_types = {}
     if "is_primary_value" in data["metadata"]:
-        data["metadata"]["is_primary_value"] = bool(
-            int(data["metadata"]["is_primary_value"])
-        )
+        try:
+            data["metadata"]["is_primary_value"] = bool(
+                int(data["metadata"]["is_primary_value"])
+            )
+        except ValueError:
+            data["metadata"]["is_primary_value"] = False
     for attr_type in list(["attributes", "identifiers"]):
         if attr_type in data and data[attr_type]:
             (
diff --git a/src/genomehubs/lib/ncbi.py b/src/genomehubs/lib/ncbi.py
index 26272348..32b4c5ec 100644
--- a/src/genomehubs/lib/ncbi.py
+++ b/src/genomehubs/lib/ncbi.py
@@ -168,7 +168,7 @@ def parse_listing(listing, collection, opts):
     return parsed
 
 
-def refseq_organelle_parser(collections, opts, *args, **kwargs):
+def refseq_organelle_parser(collections, opts):
     """Fetch and parse RefSeq organelle collections."""
     parsed = []
     if isinstance(collections, tuple):
@@ -190,7 +190,7 @@ def parse_ncbi_datasets_record(record, parsed):
     """Parse a single NCBI datasets record."""
     obj = {}
     for key in ("taxId", "organismName", "commonName", "isolate", "sex"):
-        obj[key] = record.get(key, None)
+        obj[key] = record.get(key, "None")
     assemblyInfo = record.get("assemblyInfo", {})
     for key in (
         "assemblyLevel",
@@ -204,9 +204,15 @@ def parse_ncbi_datasets_record(record, parsed):
         "submitter",
     ):
         obj[key] = assemblyInfo.get(key, None)
+        if key == "refseqCategory":
+            if obj[key] == "representative genome":
+                obj["primaryValue"] = 1
+            else:
+                obj["primaryValue"] = None
     if obj["refseqAssmAccession"] == "na":
         obj["refseqAssmAccession"] = None
         obj["refseqCategory"] = None
+        obj["primaryValue"] = None
     annotationInfo = record.get("annotationInfo", {})
     if annotationInfo:
         annot = {}
@@ -233,11 +239,11 @@ def parse_ncbi_datasets_record(record, parsed):
     parsed[obj["genbankAssmAccession"]] = obj
 
 
-def ncbi_genome_parser(directory, opts, *args, **kwargs):
+def ncbi_genome_parser(_params, opts, *, types=None, names=None):
     """Parse NCBI Datasets genome report."""
     parsed = {}
     with tofile.open_file_handle(
-        "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
+        "%s/ncbi_dataset/data/assembly_data_report.jsonl" % opts["ncbi-datasets-genome"]
     ) as report:
         for line in report:
             record = ujson.loads(line)
diff --git a/src/genomehubs/templates/assembly.types.yaml b/src/genomehubs/templates/assembly.types.yaml
index 8c3c210e..78f39216 100644
--- a/src/genomehubs/templates/assembly.types.yaml
+++ b/src/genomehubs/templates/assembly.types.yaml
@@ -134,6 +134,7 @@ attributes:
     taxon_key: assembly_span
     taxon_display_name: Assembly span
     taxon_summary:
+      - primary
       - median
       - min
       - max
@@ -167,6 +168,7 @@ attributes:
     taxon_name: contig_n50
     taxon_key: contig_n50
     taxon_summary:
+      - primary
       - median
       - min
       - max
@@ -200,6 +202,7 @@ attributes:
     taxon_name: scaffold_n50
     taxon_key: scaffold_n50
     taxon_summary:
+      - primary
       - median
       - min
       - max
@@ -274,3 +277,5 @@ attributes:
 metadata:
   source_slug:
     header: genbankAssmAccession
+  is_primary_value:
+    header: primaryValue

From 11c5ede5b34265f103ab2b31d0f5641138218488 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Fri, 9 Apr 2021 13:39:17 +0100
Subject: [PATCH 4/5] set source when taxon data derived from assembly

---
 src/genomehubs/lib/attributes.py | 53 +++++++++++++++++++++++++++++---
 src/genomehubs/lib/hub.py        |  5 ++-
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/src/genomehubs/lib/attributes.py b/src/genomehubs/lib/attributes.py
index aefc5e61..0d191e91 100644
--- a/src/genomehubs/lib/attributes.py
+++ b/src/genomehubs/lib/attributes.py
@@ -37,18 +37,63 @@ def index(es, group, attributes, opts, *, index_type="attribute"):
     return template, stream
 
 
+# def fetch_types(es, opts):
+#     """Fetch all existing types."""
+#     template = index_template(opts, index_type="attribute")
+#     body = {
+#         "id": "attribute_types",
+#         "params": {},
+#     }
+#     entries = stream_template_search_results(
+#         es, index=template["index_name"], body=body
+#     )
+#     return {entry["key"]: entry for entry in entries}
+
+
+def add_attribute_sources(name, obj, attributes):
+    """Merge the source* fields of attributes[name] into obj in place."""
+    for key, value in attributes[name].items():
+        if key.startswith("source"):
+            if key in obj:
+                if not isinstance(obj[key], list):
+                    obj[key] = [obj[key]]
+                obj[key].append(value)
+            else:
+                obj[key] = value
+
+
 def index_types(es, types_name, types, opts):
     """Index types into Elasticsearch."""
+    # TODO: fetch existing types so new sources are added rather than overwritten
+    try:
+        attributes = fetch_types(es, types_name, opts)
+    except Exception:
+        attributes = {}
     if "attributes" in types:
-        if "defaults" in types and "attributes" in types["defaults"]:
-            for key, value in types["attributes"].items():
+        new_attributes = {}
+        existing_attributes = {}
+        for key, value in types["attributes"].items():
+            if "defaults" in types and "attributes" in types["defaults"]:
                 value = {**types["defaults"]["attributes"], **value}
-                types["attributes"][key] = value
+                # types["attributes"][key] = value
+            if key in attributes:
+                existing_attributes[key] = value
+                add_attribute_sources(key, value, attributes)
+            else:
+                new_attributes[key] = value
         template, stream = index(
-            es, types_name, types["attributes"], opts, index_type="attribute"
+            es, types_name, new_attributes, opts, index_type="attribute"
+        )
+        template, update_stream = index(
+            es,
+            types_name,
+            existing_attributes,
+            opts,
+            index_type="attribute",
         )
         load_mapping(es, template["name"], template["mapping"])
         index_stream(es, template["index_name"], stream)
+        index_stream(es, template["index_name"], update_stream, _op_type="update")
     if "taxon_names" in types:
         if "defaults" in types and "taxon_names" in types["defaults"]:
             for key, value in types["names"].items():
diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index 0dd5a0bd..6950ddd8 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -356,7 +356,9 @@ def add_attributes(
                 else:
                     attribute = {"identifier": validated, "class": key}
                 attribute.update(meta)
-                if source is not None:
+                if "source" in types[key]:
+                    attribute.update({"source": types[key]["source"]})
+                elif source is not None:
                     attribute.update({"source": source})
                 attributes.append(attribute)
     if attribute_values:
@@ -470,6 +472,7 @@ def validate_types_file(types_file, dir_path):
         if key.startswith("display") or key.startswith("taxon"):
             defaults["attributes"].update({key: value})
         elif key.startswith("source"):
+            defaults["attributes"].update({key: value})
             defaults["metadata"].update({key: value})
     types.update({"defaults": defaults})
     data = tofile.open_file_handle(Path(dir_path) / types["file"]["name"])

From 4f3650737a4cd6a2ce06a671b04a2f0304998d42 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Fri, 9 Apr 2021 13:58:28 +0100
Subject: [PATCH 5/5] show assembly level in taxon index

---
 src/genomehubs/templates/assembly.types.yaml | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/genomehubs/templates/assembly.types.yaml b/src/genomehubs/templates/assembly.types.yaml
index 78f39216..22228938 100644
--- a/src/genomehubs/templates/assembly.types.yaml
+++ b/src/genomehubs/templates/assembly.types.yaml
@@ -63,7 +63,11 @@ attributes:
     taxon_display_group: assembly
     taxon_name: gene_count
     taxon_key: gene_count
-    taxon_summary: median
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
     taxon_display_level: 2
     taxon_bins:
       min: 0
@@ -103,7 +107,9 @@ attributes:
     taxon_name: sample_sex
     taxon_key: sample_sex
     taxon_display_name: Sample sex
-    taxon_summary: list
+    taxon_summary:
+      - primary
+      - list
     taxon_display_level: 2
   isolate:
     display_level: 2
@@ -115,6 +121,14 @@ attributes:
     header: assemblyLevel
     display_name: Assembly level
     type: keyword
+    taxon_display_group: assembly
+    taxon_name: assembly_level
+    taxon_key: assembly_level
+    taxon_display_name: Assembly level
+    taxon_summary:
+      - primary
+      - list
+    taxon_display_level: 2
   assembly_type:
     display_level: 2
     header: assemblyType