From 4576702bf101a88d78a2f67cfcaf79dfd12935fb Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 3 Mar 2021 09:13:47 +0000
Subject: [PATCH 01/21] Write successful imports to file Fixes #45

---
 src/genomehubs/lib/hub.py   | 28 ++++++++++++++++++++++++++++
 src/genomehubs/lib/index.py | 13 +++++++++++--
 src/genomehubs/lib/taxon.py | 24 +++++-------------------
 3 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index 6da6d4d4..e3f1315a 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -503,3 +503,31 @@ def set_column_indices(types, header):
                     index = headers.get(value["header"], None)
                     if index is not None:
                         value.update({"index": index})
+
+
+def write_imported_rows(rows, opts, *, types, header=None, label="imported"):
+    """Write rows to a file named after the types file.
+
+    ``rows`` is an iterable of rows or a dict of row lists; ``header``, when
+    given, is written first. The directory is ``opts["<index>-exception"]`` if
+    set, otherwise ``<opts["<index>-dir"]>/<label>``.
+    """
+    # NOTE(review): the "-exception" override applies to every label, so the
+    # "imported" and "exceptions" output share one path when it is set.
+    file_key = "%s-exception" % opts["index"]
+    dir_key = "%s-dir" % opts["index"]
+    if file_key in opts and opts[file_key]:
+        outdir = opts[file_key]
+    else:
+        outdir = "%s/%s" % (opts[dir_key], label)
+    os.makedirs(outdir, exist_ok=True)
+    outfile = "%s/%s" % (outdir, types["file"]["name"])
+    data = [] if header is None else [header]
+    header_len = len(data)
+    row_sets = rows.values() if isinstance(rows, dict) else [rows]
+    for row_set in row_sets:
+        data.extend(row_set)
+    LOGGER.info(
+        "Writing %d records to %s file '%s'", len(data) - header_len, label, outfile
+    )
+    tofile.write_file(outfile, data)
diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index 5a1c0445..344c5e0b 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -71,6 +71,7 @@
 from .hub import process_row
 from .hub import set_column_indices
 from .hub import validate_types_file
+from .hub import write_imported_rows
 from .taxon import add_names_and_attributes_to_taxa
 from .taxon import fix_missing_ids
 from .version import __version__
@@ -84,14 +85,16 @@ def index_file(es, types, data, opts):
     rows = csv.reader(
         data, delimiter=delimiters[types["file"]["format"]], quotechar='"'
     )
-    header = None
-    if types["file"].get("header", False):
+    if "header" in types["file"] and types["file"]["header"]:
         header = next(rows)
         set_column_indices(types, header)
+    else:
+        header = None
     with_ids = defaultdict(list)
     taxon_asm_data = defaultdict(list)
     without_ids = defaultdict(list)
     failed_rows = defaultdict(list)
+    imported_rows = []
     blanks = set(["", "NA", "N/A", "None"])
     taxon_types = {}
     for taxonomy_name in opts["taxonomy-source"]:
@@ -112,6 +115,7 @@ def index_file(es, types, data, opts):
                 taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append(
                     taxon_data
                 )
+                imported_rows.append(row)
             else:
                 if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]:
                     without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append(
@@ -147,11 +151,15 @@ def index_file(es, types, data, opts):
             types=types,
             taxon_template=taxon_template,
             failed_rows=failed_rows,
+            imported_rows=imported_rows,
             with_ids=with_ids,
             blanks=blanks,
             header=header,
         )
         if with_ids or create_ids:
+            write_imported_rows(
+                imported_rows, opts, types=types, header=header, label="imported"
+            )
             LOGGER.info("Indexing %d entries", len(with_ids.keys()))
             if opts["index"] == "taxon":
                 docs = add_names_and_attributes_to_taxa(
@@ -164,6 +172,7 @@ def index_file(es, types, data, opts):
                     _op_type="update",
                 )
             elif opts["index"] == "assembly":
+                # TODO: keep track of taxon_id not found exceptions
                 assembly_template = assembly.index_template(taxonomy_name, opts)
                 docs = add_identifiers_and_attributes_to_assemblies(
                     es,
diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index 19187de3..c60e7dd6 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -2,11 +2,9 @@
 
 """Taxon methods."""
 
-import os
 import sys
 from collections import defaultdict
 
-from tolkein import tofile
 from tolkein import tolog
 from tqdm import tqdm
 
@@ -17,6 +15,7 @@
 from .hub import add_attribute_values
 from .hub import chunks
 from .hub import index_templator
+from .hub import write_imported_rows
 from .taxonomy import index_template as taxonomy_index_template
 
 LOGGER = tolog.logger(__name__)
@@ -177,6 +176,7 @@ def fix_missing_ids(
     types,
     taxon_template,
     failed_rows,
+    imported_rows,
     with_ids=None,
     blanks=set(["NA", "None"]),
     header=None,
@@ -211,29 +211,15 @@ def fix_missing_ids(
     if without_ids and failed_rows:
         for key, value in found_ids.items():
             if key in failed_rows:
+                imported_rows += failed_rows[key]
                 del failed_rows[key]
         if failed_rows:
             LOGGER.info(
                 "Unable to associate %d records with taxon IDs", len(failed_rows)
             )
-            data = []
-            exception_key = "%s-exception" % opts["index"]
-            dir_key = "%s-dir" % opts["index"]
-            if exception_key in opts and opts[exception_key]:
-                outdir = opts[exception_key]
-            else:
-                outdir = "%s/exceptions" % opts[dir_key]
-            os.makedirs(outdir, exist_ok=True)
-            outfile = "%s/%s" % (outdir, types["file"]["name"])
-            if header:
-                data.append(header)
-            for rows in failed_rows.values():
-                for row in rows:
-                    data.append(row)
-            LOGGER.info(
-                "Writing %d records to exceptions file '%s", len(data) - 1, outfile
+            write_imported_rows(
+                failed_rows, opts, types=types, header=header, label="exceptions"
             )
-            tofile.write_file(outfile, data)
     return with_ids, without_ids
 
 

From 06011697fbc165dff25cb125233b52d44a7bac15 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 3 Mar 2021 09:16:04 +0000
Subject: [PATCH 02/21] remove v from version number

---
 .bumpversion.cfg       | 7 ++++---
 README.rst             | 4 ++--
 conda-recipe/meta.yaml | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 4505724e..e23c4429 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -2,7 +2,8 @@
 current_version = 2.0.5
 commit = True
 tag = True
-message = 'Bump version: {current_version} → {new_version}'
+tag_name = {new_version}
+message = "Bump version: {current_version} → {new_version}"
 
 [bumpversion:file:setup.py]
 search = version="{current_version}"
@@ -13,8 +14,8 @@ search = version = "{current_version}"
 replace = version = "{new_version}"
 
 [bumpversion:file:README.rst]
-search = v{current_version}.
-replace = v{new_version}.
+search = {current_version}.
+replace = {new_version}.
 
 [bumpversion:file:docs/conf.py]
 search = version = release = "{current_version}"
diff --git a/README.rst b/README.rst
index e2e31325..6ad93b1b 100644
--- a/README.rst
+++ b/README.rst
@@ -42,9 +42,9 @@ GenomeHubs
     :alt: Conda platforms
     :target: https://anaconda.org/tolkit/genomehubs
 
-.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/v2.0.5.svg
+.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.5.svg
     :alt: Commits since latest release
-    :target: https://github.com/genomehubs/genomehubs/compare/v2.0.5...main
+    :target: https://github.com/genomehubs/genomehubs/compare/2.0.5...main
 
 .. |license| image:: https://anaconda.org/tolkit/genomehubs/badges/license.svg
     :alt: MIT License
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index c5427238..ca587a7a 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -6,7 +6,7 @@ package:
   version: {{ version }}
 
 source:
-  git_rev: v{{ version }}
+  git_rev: {{ version }}
   git_url: https://github.com/genomehubs/genomehubs.git
 
 build:

From c6c9c04be1b2feeec94a0620397547478b8c76d5 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 3 Mar 2021 09:17:52 +0000
Subject: [PATCH 03/21] =?UTF-8?q?"Bump=20version:=202.0.5=20=E2=86=92=202.?=
 =?UTF-8?q?0.6"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.cfg              | 2 +-
 README.rst                    | 4 ++--
 conda-recipe/meta.yaml        | 2 +-
 docs/conf.py                  | 2 +-
 scripts/conda_build.sh        | 2 +-
 setup.py                      | 2 +-
 src/genomehubs/lib/version.py | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index e23c4429..6b7aaee3 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.5
+current_version = 2.0.6
 commit = True
 tag = True
 tag_name = {new_version}
diff --git a/README.rst b/README.rst
index 6ad93b1b..c8796d7e 100644
--- a/README.rst
+++ b/README.rst
@@ -42,9 +42,9 @@ GenomeHubs
     :alt: Conda platforms
     :target: https://anaconda.org/tolkit/genomehubs
 
-.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.5.svg
+.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.6.svg
     :alt: Commits since latest release
-    :target: https://github.com/genomehubs/genomehubs/compare/2.0.5...main
+    :target: https://github.com/genomehubs/genomehubs/compare/2.0.6...main
 
 .. |license| image:: https://anaconda.org/tolkit/genomehubs/badges/license.svg
     :alt: MIT License
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index ca587a7a..06d22bc5 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "genomehubs" %}
-{% set version = "2.0.5" %}
+{% set version = "2.0.6" %}
 
 package:
   name: {{ name }}
diff --git a/docs/conf.py b/docs/conf.py
index 014df512..b6dac8a1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -29,7 +29,7 @@
     version = release = get_distribution("genomehubs").version
 except Exception:
     traceback.print_exc()
-    version = release = "2.0.5"
+    version = release = "2.0.6"
 
 pygments_style = "trac"
 templates_path = ["."]
diff --git a/scripts/conda_build.sh b/scripts/conda_build.sh
index 59173100..79064de9 100755
--- a/scripts/conda_build.sh
+++ b/scripts/conda_build.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-VERSION=2.0.5
+VERSION=2.0.6
 
 case $(uname | tr '[:upper:]' '[:lower:]') in
   linux*)
diff --git a/setup.py b/setup.py
index f0952bee..ab311ee9 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def read(*names, **kwargs):
 
 setup(
     name="genomehubs",  # Required
-    version="2.0.5",
+    version="2.0.6",
     description="GenomeHubs",  # Optional
     long_description="%s\n%s"
     % (
diff --git a/src/genomehubs/lib/version.py b/src/genomehubs/lib/version.py
index 5edc0d68..bab7ca25 100644
--- a/src/genomehubs/lib/version.py
+++ b/src/genomehubs/lib/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python3
 """genomehubs version."""
 
-__version__ = "2.0.5"
+__version__ = "2.0.6"

From 0bd67ac114a74452e408aa8b4332313c342e7a77 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 3 Mar 2021 09:20:16 +0000
Subject: [PATCH 04/21] =?UTF-8?q?"Bump=20version:=202.0.6=20=E2=86=92=202.?=
 =?UTF-8?q?0.7"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.cfg              | 2 +-
 README.rst                    | 4 ++--
 conda-recipe/meta.yaml        | 2 +-
 docs/conf.py                  | 2 +-
 scripts/conda_build.sh        | 2 +-
 setup.py                      | 2 +-
 src/genomehubs/lib/version.py | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 6b7aaee3..fcd1fde8 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.6
+current_version = 2.0.7
 commit = True
 tag = True
 tag_name = {new_version}
diff --git a/README.rst b/README.rst
index c8796d7e..e48cb478 100644
--- a/README.rst
+++ b/README.rst
@@ -42,9 +42,9 @@ GenomeHubs
     :alt: Conda platforms
     :target: https://anaconda.org/tolkit/genomehubs
 
-.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.6.svg
+.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.7.svg
     :alt: Commits since latest release
-    :target: https://github.com/genomehubs/genomehubs/compare/2.0.6...main
+    :target: https://github.com/genomehubs/genomehubs/compare/2.0.7...main
 
 .. |license| image:: https://anaconda.org/tolkit/genomehubs/badges/license.svg
     :alt: MIT License
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 06d22bc5..f8add197 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "genomehubs" %}
-{% set version = "2.0.6" %}
+{% set version = "2.0.7" %}
 
 package:
   name: {{ name }}
diff --git a/docs/conf.py b/docs/conf.py
index b6dac8a1..1a6b55b3 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -29,7 +29,7 @@
     version = release = get_distribution("genomehubs").version
 except Exception:
     traceback.print_exc()
-    version = release = "2.0.6"
+    version = release = "2.0.7"
 
 pygments_style = "trac"
 templates_path = ["."]
diff --git a/scripts/conda_build.sh b/scripts/conda_build.sh
index 79064de9..45afbb26 100755
--- a/scripts/conda_build.sh
+++ b/scripts/conda_build.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-VERSION=2.0.6
+VERSION=2.0.7
 
 case $(uname | tr '[:upper:]' '[:lower:]') in
   linux*)
diff --git a/setup.py b/setup.py
index ab311ee9..cd1829ed 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def read(*names, **kwargs):
 
 setup(
     name="genomehubs",  # Required
-    version="2.0.6",
+    version="2.0.7",
     description="GenomeHubs",  # Optional
     long_description="%s\n%s"
     % (
diff --git a/src/genomehubs/lib/version.py b/src/genomehubs/lib/version.py
index bab7ca25..0e87ce51 100644
--- a/src/genomehubs/lib/version.py
+++ b/src/genomehubs/lib/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python3
 """genomehubs version."""
 
-__version__ = "2.0.6"
+__version__ = "2.0.7"

From c83d203e9c1c2a1165d01ccfb82b9065d5433a06 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Thu, 4 Mar 2021 09:23:54 +0000
Subject: [PATCH 05/21] check for blanks when grouping taxa without IDs

---
 src/genomehubs/lib/hub.py   |  2 ++
 src/genomehubs/lib/index.py | 18 +++++++++++-------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index e3f1315a..80a33e4e 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -454,6 +454,8 @@ def process_row(types, row):
     for group in data.keys():
         if group in types:
             for key, meta in types[group].items():
+                if "index" not in meta:
+                    continue
                 try:
                     if isinstance(meta["index"], list):
                         char = meta.get("join", "")
diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index 344c5e0b..75e32426 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -79,6 +79,11 @@
 LOGGER = tolog.logger(__name__)
 
 
+def not_blank(key, obj, blanks):
+    """Return a truthy result if obj[key] is present, truthy and not in blanks."""
+    return key in obj and obj[key] and obj[key] not in blanks
+
+
 def index_file(es, types, data, opts):
     """Index a file."""
     delimiters = {"csv": ",", "tsv": "\t"}
@@ -107,17 +112,16 @@ def index_file(es, types, data, opts):
                 failed_rows["None"].append(row)
                 continue
             taxon_types.update(new_taxon_types)
-            if (
-                "taxon_id" in processed_data["taxonomy"]
-                and processed_data["taxonomy"]["taxon_id"] not in blanks
-            ):
+            if not_blank("taxon_id", processed_data["taxonomy"], blanks):
                 with_ids[processed_data["taxonomy"]["taxon_id"]].append(processed_data)
                 taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append(
                     taxon_data
                 )
                 imported_rows.append(row)
             else:
-                if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]:
+                if "taxonomy" in types and not_blank(
+                    "alt_taxon_id", processed_data["taxonomy"], blanks
+                ):
                     without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append(
                         processed_data
                     )
@@ -125,7 +129,7 @@ def index_file(es, types, data, opts):
                         taxon_data
                     )
                     failed_rows[processed_data["taxonomy"]["alt_taxon_id"]].append(row)
-                elif "subspecies" in processed_data["taxonomy"]:
+                elif not_blank("subspecies", processed_data["taxonomy"], blanks):
                     without_ids[processed_data["taxonomy"]["subspecies"]].append(
                         processed_data
                     )
@@ -133,7 +137,7 @@ def index_file(es, types, data, opts):
                         taxon_data
                     )
                     failed_rows[processed_data["taxonomy"]["subspecies"]].append(row)
-                elif "species" in processed_data["taxonomy"]:
+                elif not_blank("species", processed_data["taxonomy"], blanks):
                     without_ids[processed_data["taxonomy"]["species"]].append(
                         processed_data
                     )

From 638dd3226d62638fc8adbc2c3a6a82e485eb4a65 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Tue, 9 Mar 2021 09:35:51 +0000
Subject: [PATCH 06/21] keep descendant nodes in memory during fill Fixes #50

---
 src/genomehubs/lib/fill.py | 122 +++++++++++++++++++++++++++++--------
 1 file changed, 98 insertions(+), 24 deletions(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index 023c154e..4c25bc4d 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -59,11 +59,6 @@
 from .es_functions import stream_template_search_results
 from .version import __version__
 
-# if platform.system() != "Linux":
-#     from multiprocessing import set_start_method
-
-#     set_start_method("fork")
-
 LOGGER = tolog.logger(__name__)
 
 
@@ -201,8 +196,10 @@ def summarise_attribute_values(
 def summarise_attributes(*, attributes, attrs, meta, parent, parents):
     """Set attribute summary values."""
     changed = False
+    attr_dict = {}
     for node_attribute in attributes:
         if node_attribute["key"] in attrs:
+            attr_dict[node_attribute["key"]] = node_attribute
             summary_value, max_value, min_value = summarise_attribute_values(
                 node_attribute, meta[node_attribute["key"]]
             )
@@ -225,14 +222,25 @@ def summarise_attributes(*, attributes, attrs, meta, parent, parents):
                         parents[parent][node_attribute["key"]]["min"] = min(
                             parents[parent][node_attribute["key"]]["min"], min_value
                         )
-    return changed
+    return changed, attr_dict
 
 
 def set_values_from_descendants(
-    *, attributes, descendant_values, meta, parent, taxon_rank, parents
+    *,
+    attributes,
+    descendant_values,
+    meta,
+    taxon_id,
+    parent,
+    taxon_rank,
+    parents,
+    attr_dict=None,
+    limits=None
 ):
     """Set attribute summary values from descendant values."""
     changed = False
+    if attr_dict is None:
+        attr_dict = {}
     for key, obj in descendant_values.items():
         traverseable = meta[key].get("traverse", False)
         if (
@@ -243,9 +251,11 @@ def set_values_from_descendants(
             traverseable = False
         if not traverseable:
             continue
+        if taxon_id in limits[key]:
+            continue
         traverse_limit = meta[key].get("traverse_limit", None)
         if traverse_limit and taxon_rank == traverse_limit:
-            continue
+            limits[key].add(parent)
         try:
             attribute = next(entry for entry in attributes if entry["key"] == key)
         except StopIteration:
@@ -261,8 +271,7 @@ def set_values_from_descendants(
         if summary_value is not None:
             attribute["aggregation_source"] = "descendant"
             changed = True
-            if traverse_limit and taxon_rank == traverse_limit:
-                continue
+            attr_dict.update({key: attribute})
             if parent is not None:
                 if isinstance(summary_value, list):
                     parents[parent][key]["values"] = list(
@@ -278,7 +287,60 @@ def set_values_from_descendants(
                     parents[parent][key]["min"] = min(
                         parents[parent][key]["min"], min_value
                     )
-    return changed
+    return changed, attr_dict
+
+
+def set_attributes_to_descend(meta):
+    """Return (keys, limits) for attributes to infer from ancestral taxa."""
+    desc_attrs = set()
+    desc_attr_limits = {}
+    for key, value in meta.items():
+        # only traversable attributes with no direction, "down" or "both" descend
+        if not value.get("traverse"):
+            continue
+        if value.get("traverse_direction", "down") not in ("down", "both"):
+            continue
+        desc_attrs.add(key)
+        if "traverse_limit" in value:
+            desc_attr_limits[key] = value["traverse_limit"]
+    return desc_attrs, desc_attr_limits
+
+
+def track_missing_attribute_values(
+    node, missing_attributes, attr_dict, desc_attrs, desc_attr_limits
+):
+    """Fill queued descendants from this node's values and yield completed ones.
+
+    Descendants still lacking values, and this node, are queued under the parent.
+    """
+    source = node["_source"]
+    taxon_id = source["taxon_id"]
+    still_missing = {}
+    if taxon_id in missing_attributes and missing_attributes[taxon_id]:
+        for child_id, obj in missing_attributes[taxon_id].items():
+            for key, attribute in attr_dict.items():
+                if key not in obj["keys"]:
+                    continue
+                # TODO: #51 include ancestral rank in aggregation source
+                filled = {**attribute, "aggregation_source": "ancestor"}
+                obj["attributes"].append(filled)
+                obj["keys"].remove(key)
+            if obj["keys"]:
+                still_missing[child_id] = obj
+            else:
+                # nothing left to fill, so the descendant can be emitted now
+                yield obj["node"]["_id"], obj["node"]["_source"]
+        del missing_attributes[taxon_id]
+    # NOTE(review): with no "parent" key the still_missing entries are dropped
+    # without being yielded - confirm this is intended.
+    if "parent" in source:
+        queued = missing_attributes[source["parent"]]
+        queued.update(still_missing)
+        queued[taxon_id] = {
+            "keys": {key for key in desc_attrs if key not in attr_dict},
+            "attributes": source["attributes"],
+            "node": node,
+        }
 
 
 def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
@@ -299,6 +361,12 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
             lambda: {"max": float("-inf"), "min": float("inf"), "values": []}
         )
     )
+    limits = defaultdict(set)
+    if "traverse-infer-both" in opts and opts["traverse-infer-both"]:
+        desc_attrs, desc_attr_limits = set_attributes_to_descend(meta)
+        missing_attributes = defaultdict(dict)
+    else:
+        desc_attrs = {}
     while root_depth >= 0:
         nodes = stream_nodes_by_root_depth(
             es,
@@ -309,32 +377,45 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
         )
         ctr = 0
         for node in nodes:
+            # TODO: break into sub functions
             ctr += 1
             changed = False
+            attr_dict = {}
             if "attributes" in node["_source"] and node["_source"]["attributes"]:
-                changed = summarise_attributes(
+                changed, attr_dict = summarise_attributes(
                     attributes=node["_source"]["attributes"],
                     attrs=attrs,
                     meta=meta,
                     parent=node["_source"].get("parent", None),
                     parents=parents,
                 )
+            else:
+                node["_source"]["attributes"] = []
             if node["_source"]["taxon_id"] in parents:
-                if "attributes" not in node["_source"]:
-                    node["_source"]["attributes"] = []
-                modified = set_values_from_descendants(
+                modified, attr_dict = set_values_from_descendants(
                     attributes=node["_source"]["attributes"],
                     descendant_values=parents[node["_source"]["taxon_id"]],
                     meta=meta,
+                    taxon_id=node["_source"]["taxon_id"],
                     parent=node["_source"].get("parent", None),
                     parents=parents,
                     taxon_rank=node["_source"]["taxon_rank"],
+                    attr_dict=attr_dict,
+                    limits=limits,
                 )
                 if not changed:
                     changed = modified
+            if desc_attrs:
+                yield from track_missing_attribute_values(
+                    node, missing_attributes, attr_dict, desc_attrs, desc_attr_limits
+                )
             if changed:
                 yield node["_id"], node["_source"]
         root_depth -= 1
+    if desc_attrs:
+        for incomplete in missing_attributes.values():
+            for obj in incomplete.values():
+                yield obj["node"]["_id"], obj["node"]["_source"]
 
 
 def copy_attribute_summary(source, meta):
@@ -399,9 +480,7 @@ def traverse_from_root(es, opts, *, template, root=None, max_depth=None, log=Tru
         root = opts["traverse-root"]
     if max_depth is None:
         max_depth = get_max_depth_by_lineage(
-            es,
-            index=template["index_name"],
-            root=root,
+            es, index=template["index_name"], root=root
         )
     root_depth = max_depth - 1
     meta = template["types"]["attributes"]
@@ -414,11 +493,7 @@ def traverse_from_root(es, opts, *, template, root=None, max_depth=None, log=Tru
         if log:
             LOGGER.info("Filling values at root depth %d" % root_depth)
         nodes = stream_nodes_by_root_depth(
-            es,
-            index=template["index_name"],
-            root=root,
-            depth=root_depth,
-            size=50,
+            es, index=template["index_name"], root=root, depth=root_depth, size=50
         )
         desc_nodes = stream_missing_attributes_at_level(
             es, nodes=nodes, attrs=attrs, template=template
@@ -494,7 +569,6 @@ def main(args):
     options = config("fill", **args)
     if "traverse-infer-both" in options["fill"]:
         options["fill"]["traverse-infer-ancestors"] = True
-        options["fill"]["traverse-infer-descendants"] = True
 
     # Start Elasticsearch
     es = launch_es(options["fill"])

From 2250481b96769245b47b90edc9778a4ae98bfbf3 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Fri, 19 Mar 2021 11:52:36 +0000
Subject: [PATCH 07/21] Load taxon names as xrefs (#55)

---
 src/genomehubs/lib/attributes.py          | 50 +++++++++++++++--------
 src/genomehubs/lib/fill.py                |  1 +
 src/genomehubs/lib/hub.py                 | 22 +++++++++-
 src/genomehubs/lib/index.py               |  3 +-
 src/genomehubs/lib/taxon.py               | 18 ++++----
 src/genomehubs/templates/attributes.json  |  5 +++
 src/genomehubs/templates/identifiers.json | 38 +++++++++++++++++
 src/genomehubs/templates/taxon.json       | 11 +++++
 8 files changed, 119 insertions(+), 29 deletions(-)
 create mode 100644 src/genomehubs/templates/identifiers.json

diff --git a/src/genomehubs/lib/attributes.py b/src/genomehubs/lib/attributes.py
index 831c49bb..aefc5e61 100644
--- a/src/genomehubs/lib/attributes.py
+++ b/src/genomehubs/lib/attributes.py
@@ -12,42 +12,56 @@
 LOGGER = tolog.logger(__name__)
 
 
-def index_template(opts):
+def index_template(opts, *, index_type="attribute"):
     """Index template (includes name, mapping and types)."""
-    parts = ["attributes", opts["hub-name"], opts["hub-version"]]
+    parts = ["%ss" % index_type, opts["hub-name"], opts["hub-version"]]
     template = index_templator(parts, opts)
     return template
 
 
-def stream_attributes(group, attributes):
+def stream_attributes(group, attributes, *, index_type="attribute"):
     """Stream attributes for indexing."""
     for name, obj in attributes.items():
         ret = {"group": group, "name": name}
         for prop, value in obj.items():
             if not prop.startswith("taxon_"):
                 ret.update({prop: value})
-        yield "attribute-%s-%s" % (group, name), ret
+        yield "%s-%s-%s" % (index_type, group, name), ret
 
 
-def index(es, group, attributes, opts):
-    """Index a set of attributes."""
-    LOGGER.info("Indexing attributes")
-    template = index_template(opts)
-    stream = stream_attributes(group, attributes)
+def index(es, group, attributes, opts, *, index_type="attribute"):
+    """Index a set of attributes or names."""
+    LOGGER.info("Indexing %s" % index_type)
+    template = index_template(opts, index_type=index_type)
+    stream = stream_attributes(group, attributes, index_type=index_type)
     return template, stream
 
 
 def index_types(es, types_name, types, opts):
     """Index types into Elasticsearch."""
-    if "attributes" not in types:
-        return
-    if "defaults" in types:
-        for key, value in types["attributes"].items():
-            value = {**types["defaults"]["attributes"], **value}
-            types["attributes"][key] = value
-    template, stream = index(es, types_name, types["attributes"], opts)
-    load_mapping(es, template["name"], template["mapping"])
-    index_stream(es, template["index_name"], stream)
+    if "attributes" in types:
+        if "defaults" in types and "attributes" in types["defaults"]:
+            for key, value in types["attributes"].items():
+                value = {**types["defaults"]["attributes"], **value}
+                types["attributes"][key] = value
+        template, stream = index(
+            es, types_name, types["attributes"], opts, index_type="attribute"
+        )
+        load_mapping(es, template["name"], template["mapping"])
+        index_stream(es, template["index_name"], stream)
+    if "taxon_names" in types:
+        # taxon names are indexed with index_type="identifier"
+        if "defaults" in types and "taxon_names" in types["defaults"]:
+            # merge defaults into each "taxon_names" entry; the guard above only
+            # establishes that "taxon_names" exists, so that key is iterated
+            for key, value in types["taxon_names"].items():
+                value = {**types["defaults"]["taxon_names"], **value}
+                types["taxon_names"][key] = value
+        template, stream = index(
+            es, types_name, types["taxon_names"], opts, index_type="identifier"
+        )
+        load_mapping(es, template["name"], template["mapping"])
+        index_stream(es, template["index_name"], stream)
 
 
 def fetch_types(es, types_name, opts):
diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index 4c25bc4d..c9f9d416 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -254,6 +254,7 @@ def set_values_from_descendants(
         if taxon_id in limits[key]:
             continue
         traverse_limit = meta[key].get("traverse_limit", None)
+        # TODO: #53 catch traverse limits when limit rank is missing
         if traverse_limit and taxon_rank == traverse_limit:
             limits[key].add(parent)
         try:
diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index 80a33e4e..2c842285 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -432,6 +432,22 @@ def validate_types_file(types_file, dir_path):
     return types, data
 
 
+def set_xrefs(taxon_names, types, row, *, meta=None):
+    """Set xrefs for taxon_names."""
+    if meta is None:
+        meta = {}
+    names = []
+    for name_class, value in taxon_names.items():
+        taxon = {"name": value, "class": name_class}
+        if "xref" in types[name_class] and types[name_class]["xref"]:
+            if "source" in meta:
+                taxon.update({"source": meta["source"]})
+            if "source_stub" in meta:
+                taxon.update({"source_stub": meta["source_stub"]})
+        names.append(taxon)
+    return names
+
+
 def process_row(types, row):
     """Process a row of data."""
     data = {
@@ -479,7 +495,7 @@ def process_row(types, row):
     taxon_data = {}
     taxon_types = {}
     for attr_type in list(["attributes", "identifiers"]):
-        if data[attr_type]:
+        if attr_type in data and data[attr_type]:
             (
                 data[attr_type],
                 taxon_data[attr_type],
@@ -492,6 +508,10 @@ def process_row(types, row):
             )
         else:
             data[attr_type] = []
+    if "taxon_names" in data and data["taxon_names"]:
+        data["taxon_names"] = set_xrefs(
+            data["taxon_names"], types["taxon_names"], row, meta=data["metadata"]
+        )
     return data, taxon_data, taxon_types.get("attributes", {})
 
 
diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index 75e32426..c10266b2 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -108,7 +108,8 @@ def index_file(es, types, data, opts):
         for row in tqdm(rows):
             try:
                 processed_data, taxon_data, new_taxon_types = process_row(types, row)
-            except Exception:
+            except Exception as err:
+                print(err)
                 failed_rows["None"].append(row)
                 continue
             taxon_types.update(new_taxon_types)
diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index c60e7dd6..4302c7c7 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -333,15 +333,15 @@ def add_names_to_list(existing, new, *, blanks=set({"NA", "None"})):
     names = defaultdict(dict)
     for entry in existing:
         names[entry["class"]][entry["name"]] = True
-    for name_class, name in new.items():
-        name_class = name_class.replace("_", " ")
+    for entry in new:
+        entry["class"] = entry["class"].lower()  # .replace("_", " ")
         if (
-            name not in blanks
-            and name_class not in names
-            and name not in names[name_class]
+            entry["name"] not in blanks
+            and entry["class"] not in names
+            and entry["name"] not in names[entry["class"]]
         ):
-            existing.append({"name": name, "class": name_class})
-            names[name_class][name] = True
+            existing.append(entry)
+            names[entry["class"]][entry["name"]] = True
 
 
 def add_names_and_attributes_to_taxa(
@@ -363,13 +363,13 @@ def add_names_and_attributes_to_taxa(
         for doc in taxa:
             if doc is not None:
                 taxon_data = data[doc["_source"]["taxon_id"]]
-                taxon_names = {}
+                taxon_names = []
                 attributes = []
                 for entry in taxon_data:
                     if "attributes" in entry:
                         attributes = attributes + entry["attributes"]
                     if "taxon_names" in entry:
-                        taxon_names.update(entry["taxon_names"])
+                        taxon_names += entry["taxon_names"]
                 if "taxon_names" not in doc["_source"]:
                     doc["_source"]["taxon_names"] = []
                 add_names_to_list(
diff --git a/src/genomehubs/templates/attributes.json b/src/genomehubs/templates/attributes.json
index 8f0239e4..9bf5ba34 100644
--- a/src/genomehubs/templates/attributes.json
+++ b/src/genomehubs/templates/attributes.json
@@ -12,6 +12,11 @@
         "ignore_above": 32,
         "meta": { "description": "Attribute name" }
       },
+      "display_name": {
+        "type": "keyword",
+        "index": false,
+        "meta": { "description": "Attribute display name" }
+      },
       "constraint": {
         "type": "object"
       },
diff --git a/src/genomehubs/templates/identifiers.json b/src/genomehubs/templates/identifiers.json
new file mode 100644
index 00000000..392d01e0
--- /dev/null
+++ b/src/genomehubs/templates/identifiers.json
@@ -0,0 +1,38 @@
+{
+  "index_patterns": ["identifiers-*"],
+  "mappings": {
+    "properties": {
+      "group": {
+        "type": "keyword",
+        "ignore_above": 16,
+        "meta": { "description": "Index group (e.g. assembly or taxon)" }
+      },
+      "name_class": {
+        "type": "keyword",
+        "ignore_above": 32,
+        "meta": { "description": "Name class" }
+      },
+      "display_name": {
+        "type": "keyword",
+        "index": false,
+        "meta": { "description": "Display name" }
+      },
+      "display_group": {
+        "type": "keyword",
+        "ignore_above": 32,
+        "null_value": "names",
+        "meta": { "description": "Display group" }
+      },
+      "source": {
+        "type": "keyword",
+        "ignore_above": 32,
+        "meta": { "description": "Source name" }
+      },
+      "source_url_stub": {
+        "type": "keyword",
+        "index": false,
+        "meta": { "description": "URL stub for xref" }
+      }
+    }
+  }
+}
diff --git a/src/genomehubs/templates/taxon.json b/src/genomehubs/templates/taxon.json
index dd606808..103d5a9d 100644
--- a/src/genomehubs/templates/taxon.json
+++ b/src/genomehubs/templates/taxon.json
@@ -80,6 +80,17 @@
             "meta": {
               "description": "Name class (e.g. common name, synonym, etc.)"
             }
+          },
+          "source": {
+            "type": "keyword",
+            "ignore_above": 64,
+            "normalizer": "lowercase",
+            "meta": { "description": "Source DB for taxon name" }
+          },
+          "source_url_stub": {
+            "type": "keyword",
+            "index": false,
+            "meta": { "description": "URL stub for taxon name xref" }
           }
         }
       },

From 3219c70eb4105a86d4e4018d412d1ae3f37092ba Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Fri, 19 Mar 2021 15:06:30 +0000
Subject: [PATCH 08/21] parse wikidata xrefs Fixes #55

---
 src/genomehubs/lib/btk.py                     |  2 +-
 src/genomehubs/lib/gbif.py                    |  2 +-
 src/genomehubs/lib/hub.py                     | 26 +++++--
 src/genomehubs/lib/ncbi.py                    |  4 +-
 src/genomehubs/lib/parse.py                   | 17 ++---
 src/genomehubs/lib/wikidata.py                | 68 +++++++++++--------
 .../{xref.names.yaml => wikidata.names.yaml}  |  8 ++-
 src/genomehubs/templates/xref.types.yaml      | 38 -----------
 8 files changed, 76 insertions(+), 89 deletions(-)
 rename src/genomehubs/templates/{xref.names.yaml => wikidata.names.yaml} (78%)
 delete mode 100644 src/genomehubs/templates/xref.types.yaml

diff --git a/src/genomehubs/lib/btk.py b/src/genomehubs/lib/btk.py
index c6ad1828..53a4d0eb 100644
--- a/src/genomehubs/lib/btk.py
+++ b/src/genomehubs/lib/btk.py
@@ -87,7 +87,7 @@ def describe_btk_files(meta):
     return files
 
 
-def btk_parser(_params, opts):
+def btk_parser(_params, opts, *args, **kwargs):
     """Parse BlobToolKit assemblies."""
     parsed = []
     analyses = []
diff --git a/src/genomehubs/lib/gbif.py b/src/genomehubs/lib/gbif.py
index c379e386..b31b6652 100644
--- a/src/genomehubs/lib/gbif.py
+++ b/src/genomehubs/lib/gbif.py
@@ -87,7 +87,7 @@ def fetch_gbif_identifiers(taxon, *, xrefs=None):
     return identifiers
 
 
-def gbif_parser(_params, opts):
+def gbif_parser(_params, opts, *args, **kwargs):
     """Parse GBIF taxa and identifiers."""
     parsed = []
     for root in opts["gbif-root"]:
diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index 2c842285..64831ed7 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -63,13 +63,16 @@ def index_templator(parts, opts):
     return template
 
 
-def order_parsed_fields(parsed, types, names=None):
-    """Order parsed fields using a template file."""
-    columns = {}
-    fields = {}
-    ctr = 0
-    types = deepcopy(types)
-    if names is not None:
+def add_names_to_types(names, types):
+    """Add names field meta to type field meta."""
+    sources = 0
+    if types is not None:
+        types = deepcopy(types)
+        sources += 1
+    elif names is not None:
+        types = deepcopy(names)
+        sources += 1
+    if sources == 2:
         for group, entries in names.items():
             if group not in types:
                 types[group] = deepcopy(entries)
@@ -80,6 +83,15 @@ def order_parsed_fields(parsed, types, names=None):
                             types[group][field] = deepcopy(attrs)
                         elif types[group][field]["header"] != attrs["header"]:
                             types[group]["names_%s" % field] = deepcopy(attrs)
+    return types
+
+
+def order_parsed_fields(parsed, types, names=None):
+    """Order parsed fields using a template file."""
+    columns = {}
+    fields = {}
+    ctr = 0
+    types = add_names_to_types(names, types)
     for group, entries in types.items():
         for field, attrs in entries.items():
             header = False
diff --git a/src/genomehubs/lib/ncbi.py b/src/genomehubs/lib/ncbi.py
index 8ab16b1d..404409ae 100644
--- a/src/genomehubs/lib/ncbi.py
+++ b/src/genomehubs/lib/ncbi.py
@@ -168,7 +168,7 @@ def parse_listing(listing, collection, opts):
     return parsed
 
 
-def refseq_organelle_parser(collections, opts):
+def refseq_organelle_parser(collections, opts, *args, **kwargs):
     """Fetch and parse RefSeq organelle collections."""
     parsed = []
     if isinstance(collections, tuple):
@@ -233,7 +233,7 @@ def parse_ncbi_datasets_record(record, parsed):
     parsed[obj["genbankAssmAccession"]] = obj
 
 
-def ncbi_genome_parser(directory, opts):
+def ncbi_genome_parser(directory, opts, *args, **kwargs):
     """Parse NCBI Datasets genome report."""
     parsed = {}
     with tofile.open_file_handle(
diff --git a/src/genomehubs/lib/parse.py b/src/genomehubs/lib/parse.py
index cbef1219..758c2495 100644
--- a/src/genomehubs/lib/parse.py
+++ b/src/genomehubs/lib/parse.py
@@ -44,7 +44,6 @@
 
 from .btk import btk_parser
 from .config import config
-from .gbif import gbif_parser
 from .hub import load_types
 from .hub import order_parsed_fields
 from .ncbi import ncbi_genome_parser
@@ -56,7 +55,6 @@
 
 PARSERS = {
     "btk": {"func": btk_parser, "params": None, "types": "btk"},
-    "gbif": {"func": gbif_parser, "params": None, "types": "xref"},
     "ncbi-datasets-genome": {
         "func": ncbi_genome_parser,
         "params": None,
@@ -77,7 +75,7 @@
         "params": ("plastid"),
         "types": "organelle",
     },
-    "wikidata": {"func": wikidata_parser, "params": None, "types": "xref"},
+    "wikidata": {"func": wikidata_parser, "params": None, "types": "wikidata"},
 }
 
 
@@ -91,23 +89,26 @@ def main(args):
             if params is None:
                 params = options["parse"][option]
             LOGGER.info("Parsing %s" % option)
-            parsed = PARSERS[option]["func"](params, options["parse"])
+            types = load_types(PARSERS[option]["types"])
+            names = load_types(PARSERS[option]["types"], part="names")
+            parsed = PARSERS[option]["func"](
+                params, options["parse"], types=types, names=names
+            )
             files = []
             if isinstance(parsed, tuple):
                 parsed, files = parsed
-            types = load_types(PARSERS[option]["types"])
-            names = load_types(PARSERS[option]["types"], part="names")
             data = order_parsed_fields(parsed, types, names)
             tofile.write_file(options["parse"]["outfile"], data)
             filepath = Path(options["parse"]["outfile"])
-            types["file"]["name"] = filepath.name
             outdir = filepath.parent
             suff = re.compile(r"\.[^\.]+$")
             if filepath.name.endswith(".gz"):
                 stem = re.sub(suff, "", filepath.stem)
             else:
                 stem = filepath.stem
-            tofile.write_file("%s/%s.types.yaml" % (outdir, stem), types)
+            if types:
+                types["file"]["name"] = filepath.name
+                tofile.write_file("%s/%s.types.yaml" % (outdir, stem), types)
             if names:
                 names["file"]["name"] = filepath.name
                 tofile.write_file("%s/%s.names.yaml" % (outdir, stem), names)
diff --git a/src/genomehubs/lib/wikidata.py b/src/genomehubs/lib/wikidata.py
index 79a632f8..d3c81d07 100644
--- a/src/genomehubs/lib/wikidata.py
+++ b/src/genomehubs/lib/wikidata.py
@@ -46,28 +46,34 @@
 # }
 
 SOURCES = {
-    "BOLD": {
+    "bold": {
         "property": "P3606",
-        "source": "BOLD Systems taxon ID",
+        "source": "BOLD",
+        "display_name": "BOLD Systems taxon ID",
         "stub": "http://www.boldsystems.org/index.php/TaxBrowser_TaxonPage?taxid=",
     },
-    "GBIF": {
+    "gbif": {
         "property": "P846",
-        "source": "GBIF taxonKey",
+        "source": "GBIF",
+        "display_name": "GBIF taxonKey",
         "stub": "https://www.gbif.org/species/",
     },
-    "NBN": {
+    "nbn": {
         "property": "P3240",
-        "source": "NBN System Key",
+        "source": "NBN",
+        "display_name": "NBN System Key",
         "stub": "https://data.nbn.org.uk/Taxa/",
     },
-    "NCBI": {
+    "ncbi": {
         "property": "P685",
-        "source": "NCBI taxonomy ID",
+        "source": "NCBI",
+        "display_name": "NCBI taxonomy ID",
         "stub": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=",
     },
-    "WIKIDATA": {
-        "source": "Wikidata entity",
+    "wikidata": {
+        "property": None,
+        "source": "Wikidata",
+        "display_name": "Wikidata entity",
         "stub": "https://www.wikidata.org/wiki/",
     },
 }
@@ -189,17 +195,18 @@ def prepare_xref_rows(key, meta, entities):
         "subphylum",
         "phylum",
     ]
-    dbs = ["NCBI", "GBIF", "BOLD", "NBN"]
     lineage = meta["lineage"]
     rows = []
     common = {}
+    for db in SOURCES.keys():
+        common.update({db: "None"})
     entity = key.replace(WD, "")
     for rank in ranks:
         if rank in lineage:
             common.update({rank: lineage[rank]})
-    if SOURCES["NCBI"]["property"] in meta:
-        common.update({"ncbiTaxonId": meta[SOURCES["NCBI"]["property"]]})
-        common.update({"taxonId": meta[SOURCES["NCBI"]["property"]]})
+    if SOURCES["ncbi"]["property"] in meta:
+        common.update({"ncbiTaxonId": meta[SOURCES["ncbi"]["property"]]})
+        common.update({"taxonId": meta[SOURCES["ncbi"]["property"]]})
     else:
         common.update({"taxonId": entity})
     if "P225" in meta:
@@ -209,33 +216,36 @@ def prepare_xref_rows(key, meta, entities):
             common.update({rank: name})
     common.update({"wikidataTaxonId": entity})
     row = {**common}
-    row.update(
-        {
-            "xref": "%s:%s" % ("WIKIDATA", entity),
-            "source": SOURCES["WIKIDATA"]["source"],
-            "sourceStub": SOURCES["WIKIDATA"]["stub"],
-            "sourceSlug": entity,
-        }
-    )
-    rows.append(row)
-    for db in dbs:
-        if SOURCES[db]["property"] in meta:
+    for db in SOURCES.keys():
+        if SOURCES[db]["property"] is None or SOURCES[db]["property"] in meta:
             row = {**common}
-            slug = str(meta[SOURCES[db]["property"]])
+            if SOURCES[db]["property"] is None:
+                slug = entity
+            else:
+                slug = str(meta[SOURCES[db]["property"]])
             row.update(
                 {
-                    "xref": "%s:%s" % (db, slug),
+                    db: slug,
                     "source": SOURCES[db]["source"],
                     "sourceStub": SOURCES[db]["stub"],
-                    "sourceSlug": slug,
                 }
             )
             rows.append(row)
     return rows
 
 
-def wikidata_parser(_params, opts):
+def wikidata_parser(_params, opts, *, types=None, names=None):
     """Parse WikiData taxa and identifiers."""
+    if names is None:
+        names = {}
+    if "taxon_names" not in names:
+        names["taxon_names"] = {}
+    for db, values in SOURCES.items():
+        names["taxon_names"][db] = {
+            "display_name": values["display_name"],
+            "header": db,
+            "xref": True,
+        }
     parsed = []
     entities, ranks = fetch_wikidata_rank_entities()
     roots = opts.get("wikidata-root", None)
diff --git a/src/genomehubs/templates/xref.names.yaml b/src/genomehubs/templates/wikidata.names.yaml
similarity index 78%
rename from src/genomehubs/templates/xref.names.yaml
rename to src/genomehubs/templates/wikidata.names.yaml
index e2e2ddeb..a53bcc4f 100644
--- a/src/genomehubs/templates/xref.names.yaml
+++ b/src/genomehubs/templates/wikidata.names.yaml
@@ -1,9 +1,11 @@
 file:
   format: tsv
   header: true
-taxon_names:
-  wikidata_id:
-    header: wikidataTaxonId
+metadata:
+  source:
+    header: source
+  source_stub:
+    header: sourceStub
 taxonomy:
   taxon_id:
     header: ncbiTaxonId
diff --git a/src/genomehubs/templates/xref.types.yaml b/src/genomehubs/templates/xref.types.yaml
deleted file mode 100644
index 4c0ead06..00000000
--- a/src/genomehubs/templates/xref.types.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-file:
-  display_group: xref
-  format: tsv
-  header: true
-attributes:
-  xref:
-    description: External database references
-    display_group: xrefs
-    display_level: 1
-    display_name: DB xrefs
-    header: xref
-    summary: list
-    traverse: false
-    type: keyword
-metadata:
-  source:
-    header: source
-  source_slug:
-    header: sourceSlug
-  source_url_stub:
-    header: sourceStub
-taxonomy:
-  taxon_id:
-    header: taxonId
-  phylum:
-    header: phylum
-  class:
-    header: class
-  order:
-    header: order
-  family:
-    header: family
-  genus:
-    header: genus
-  species:
-    header: species
-  subspecies:
-    header: subspecies

From 48862b978c6f402a3534906d85e2cbbdc2dcc9d1 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Fri, 19 Mar 2021 15:47:25 +0000
Subject: [PATCH 09/21] add comment character(s) to ignore when indexing Fixes
 #54

---
 src/genomehubs/lib/index.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index c10266b2..5db8e5c9 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -84,11 +84,24 @@ def not_blank(key, obj, blanks):
     return key in obj and obj[key] and obj[key] not in blanks
 
 
+def strip_comments(data, types):
+    """Strip comment lines from a file stream."""
+    comment_chars = {"#"}
+    if "file" in types and "comment" in types["file"]:
+        comment_chars.update(set(types["file"]["comment"]))
+    for row in data:
+        if row[0] in comment_chars:
+            continue
+        yield row
+
+
 def index_file(es, types, data, opts):
     """Index a file."""
     delimiters = {"csv": ",", "tsv": "\t"}
     rows = csv.reader(
-        data, delimiter=delimiters[types["file"]["format"]], quotechar='"'
+        strip_comments(data, types),
+        delimiter=delimiters[types["file"]["format"]],
+        quotechar='"',
     )
     if "header" in types["file"] and types["file"]["header"]:
         header = next(rows)

From 2025061d04d03e981bf416afd283d74563cf262b Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Mon, 22 Mar 2021 14:01:04 +0000
Subject: [PATCH 10/21] Check spelling when indexing Fixes #58

---
 src/genomehubs/lib/fill.py  |   2 -
 src/genomehubs/lib/hub.py   |  49 ++++++++++-
 src/genomehubs/lib/index.py |  10 ++-
 src/genomehubs/lib/taxon.py | 161 ++++++++++++++++++++++++++++++------
 4 files changed, 194 insertions(+), 28 deletions(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index c9f9d416..01ce066d 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -430,8 +430,6 @@ def copy_attribute_summary(source, meta):
     try:
         dest["%s_value" % meta["type"]] = source["%s_value" % meta["type"]]
     except KeyError as err:
-        print(source)
-        print(meta)
         raise (err)
     dest["count"] = source["count"]
     dest["key"] = source["key"]
diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index 64831ed7..22cff7b2 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -562,6 +562,53 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"):
         for row in rows:
             data.append(row)
     LOGGER.info(
-        "Writing %d records to %s file '%s", len(data) - header_len, label, outfile
+        "Writing %d records to %s file '%s'", len(data) - header_len, label, outfile
     )
     tofile.write_file(outfile, data)
+
+
+def write_spellchecked_taxa(spellings, opts, *, types, header=None):
+    """Write spellchecked taxa to file."""
+    imported = []
+    exceptions = []
+    file_key = "%s-exception" % opts["index"]
+    dir_key = "%s-dir" % opts["index"]
+    filepath = Path(types["file"]["name"])
+    extensions = "".join(filepath.suffixes)
+    file_basename = str(filepath).replace(extensions, "")
+    for name, matches in spellings.items():
+        # enable test condition below if importing spellchecked taxa:
+        # if len(matches) == 1:
+        #     imported.append([name, matches[0]])
+        # else:
+        exceptions.append([name] + matches)
+    if imported:
+        label = "imported"
+        if file_key in opts and opts[file_key]:
+            outdir = opts[file_key]
+        else:
+            outdir = "%s/%s" % (opts[dir_key], label)
+        os.makedirs(outdir, exist_ok=True)
+        outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename)
+        LOGGER.info(
+            "Writing %d spelling corrections to %s file '%s'",
+            len(imported),
+            label,
+            outfile,
+        )
+        tofile.write_file(outfile, [["input", "corrected"]] + imported)
+    if exceptions:
+        label = "exceptions"
+        if file_key in opts and opts[file_key]:
+            outdir = opts[file_key]
+        else:
+            outdir = "%s/%s" % (opts[dir_key], label)
+        os.makedirs(outdir, exist_ok=True)
+        outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename)
+        LOGGER.info(
+            "Writing %d spelling suggestions to %s file '%s'",
+            len(exceptions),
+            label,
+            outfile,
+        )
+        tofile.write_file(outfile, [["input", "suggested"]] + exceptions)
diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index 5db8e5c9..32ccea37 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -9,7 +9,8 @@
                      [--es-host URL...] [--assembly-dir PATH]
                      [--assembly-repo URL] [--assembly-exception PATH]
                      [--taxon-dir PATH] [--taxon-repo URL] [--taxon-exception PATH]
-                     [--taxon-lookup STRING] [--file PATH...] [file-dir PATH...]
+                     [--taxon-lookup STRING] [--taxon-spellcheck]
+                     [--file PATH...] [file-dir PATH...]
                      [--remote-file URL...] [--remote-file-dir URL...]
                      [--taxon-id STRING] [--assembly-id STRING] [--analysis-id STRING]
                      [--file-title STRING] [--file-description STRING] [--file-metadata PATH]
@@ -26,7 +27,8 @@
     --assembly-repo URL        Remote git repository containing assembly-level data.
                                Optionally include `~branch-name` suffix.
     --assembly-exception PATH  Path to directory to write assembly data that failed to import.
-    --taxon-lookup STRING      Taxon name class to lookup (scientific|all). [Default: scientific]
+    --taxon-lookup STRING      Taxon name class to lookup (scientific|any). [Default: scientific]
+    --taxon-spellcheck         Flag to use fuzzy matching to match taxon names.
     --taxon-dir PATH           Path to directory containing taxon-level data.
     --taxon-repo URL           Remote git repository containing taxon-level data.
                                Optionally include `~branch-name` suffix.
@@ -72,6 +74,7 @@
 from .hub import set_column_indices
 from .hub import validate_types_file
 from .hub import write_imported_rows
+from .hub import write_spellchecked_taxa
 from .taxon import add_names_and_attributes_to_taxa
 from .taxon import fix_missing_ids
 from .version import __version__
@@ -162,6 +165,7 @@ def index_file(es, types, data, opts):
                 else:
                     failed_rows["None"].append(row)
         LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys()))
+        spellings = {}
         create_ids, without_ids = fix_missing_ids(
             es,
             opts,
@@ -173,7 +177,9 @@ def index_file(es, types, data, opts):
             with_ids=with_ids,
             blanks=blanks,
             header=header,
+            spellings=spellings,
         )
+        write_spellchecked_taxa(spellings, opts, types=types, header=header)
         if with_ids or create_ids:
             write_imported_rows(
                 imported_rows, opts, types=types, header=header, label="imported"
diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index 4302c7c7..372eadfe 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -76,11 +76,13 @@ def lookup_taxa_by_taxon_id(es, values, template, *, return_type="list"):
 
 
 def lookup_missing_taxon_ids(
-    es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"])
+    es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]), spellings=None
 ):
     """Lookup taxon ID based on available taxonomic information."""
     if with_ids is None:
         with_ids = {}
+    if spellings is None:
+        spellings = {}
     # TODO: set this list from types file
     ranks = [
         "subspecies",
@@ -103,7 +105,7 @@ def lookup_missing_taxon_ids(
                 if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
                     continue
                 taxon_ids, name_class = lookup_taxon(
-                    es, obj["taxonomy"][rank], opts, rank=rank
+                    es, obj["taxonomy"][rank], opts, rank=rank, spellings=spellings
                 )
                 if index == 1 and not taxon_ids:
                     break
@@ -180,15 +182,18 @@ def fix_missing_ids(
     with_ids=None,
     blanks=set(["NA", "None"]),
     header=None,
+    spellings=None,
 ):
     """Find or create taxon IDs for rows without."""
     if with_ids is None:
         with_ids = {}
+    if spellings is None:
+        spellings = {}
     if without_ids:
         # TODO: support multiple taxonomies
         LOGGER.info("Looking up %d missing taxon IDs", len(without_ids.keys()))
         with_ids, without_ids, found_ids = lookup_missing_taxon_ids(
-            es, without_ids, opts, with_ids=with_ids, blanks=blanks
+            es, without_ids, opts, with_ids=with_ids, blanks=blanks, spellings=spellings
         )
         # create new taxon IDs
         if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]:
@@ -202,6 +207,7 @@ def fix_missing_ids(
                 data=without_ids,
                 blanks=blanks,
                 taxon_template=taxon_template,
+                spellings=spellings,
             )
             for created_id in created_ids:
                 if created_id in without_ids:
@@ -426,19 +432,53 @@ def lookup_taxon_within_lineage(
     return []
 
 
-def lookup_taxon(
-    es, name, opts, *, rank=None, name_class="scientific", return_type="taxon_id"
-):
-    """Lookup taxon ID."""
-    taxa = []
-    template = index_template(opts["taxonomy-source"][0], opts)
-    body = {
-        "id": "taxon_by_name",
-        "params": {"taxon": name, "rank": rank},
+def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, return_type):
+    """Look up taxon name with fuzzy matching."""
+    taxon_suggest = {
+        "id": "taxon_suggest",
+        "params": {"searchTerm": name, "max_errors": 3},
     }
-    if name_class == "any":
-        body.update({"id": "taxon_by_any_name"})
-    index = template["index_name"]
+    matches = None
+    with tolog.DisableLogger():
+        suggestions = es.search_template(
+            body=taxon_suggest, index=index, rest_total_hits_as_int=True
+        )
+        try:
+            options = suggestions["suggest"]["simple_phrase"][0]["options"]
+            matches = [
+                option["text"]
+                for option in options
+                if option.get("collate_match", False)
+            ]
+        except KeyError:
+            return None
+        except ValueError:
+            return None
+    if matches and len(matches) > 1:
+        taxon_matches = {}
+        scientific_name = None
+        for match in matches:
+            body = {
+                "id": "taxon_by_any_name",
+                "params": {"taxon": match, "rank": rank},
+            }
+            taxa = taxon_lookup(
+                es, body, index, taxonomy_index_template, opts, return_type="taxon"
+            )
+            if len(taxa) > 1:
+                return matches
+            for taxon in taxa:
+                source = taxon["_source"]
+                taxon_matches[source["taxon_id"]] = source["scientific_name"]
+                scientific_name = source["scientific_name"]
+        if len(taxon_matches.keys()) == 1:
+            return [scientific_name]
+    return matches
+
+
+def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type):
+    """Query elasticsearch for a taxon."""
+    taxa = []
     with tolog.DisableLogger():
         res = es.search_template(body=body, index=index, rest_total_hits_as_int=True)
     if "hits" in res and res["hits"]["total"] > 0:
@@ -458,9 +498,71 @@ def lookup_taxon(
                 taxa = [hit["_source"]["taxon_id"] for hit in res["hits"]["hits"]]
             else:
                 taxa = [hit for hit in res["hits"]["hits"]]
-    if not taxa and opts["taxon-lookup"] == "any" and name_class != "any":
+    return taxa
+
+
+def lookup_taxon(
+    es,
+    name,
+    opts,
+    *,
+    rank=None,
+    name_class="scientific",
+    return_type="taxon_id",
+    spellings=None,
+):
+    """Look up a taxon by name; return (taxa, name_class of the last lookup tried)."""
+    if spellings is None:
+        spellings = {}  # local dict only: suggestions are not visible to the caller
+    template = index_template(opts["taxonomy-source"][0], opts)
+    index = template["index_name"]
+    body = {
+        "id": "taxon_by_name",
+        "params": {"taxon": name, "rank": rank},
+    }
+    if name_class in {"any", "spellcheck"}:
+        body.update({"id": "taxon_by_any_name"})
+    if name_class == "spellcheck":
+        matches = spellcheck_taxon(
+            es, name, index, rank, taxonomy_index_template, opts, return_type
+        )
+        if matches:
+            spellings.update({name: matches})
+        return [], name_class  # suggestions are recorded in spellings, not applied
+        # Uncomment code below to use suggestion in current import
+        # if matches and len(matches) == 1:
+        #     body["params"].update({"taxon": matches[0]})
+        # else:
+        #     return [], name_class
+    taxa = taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type)
+    if (
+        not taxa
+        and opts["taxon-lookup"] == "any"
+        and name_class not in {"any", "spellcheck"}
+    ):
         taxa, name_class = lookup_taxon(
-            es, name, opts, rank=rank, name_class="any", return_type=return_type
+            es,
+            name,
+            opts,
+            rank=rank,
+            name_class="any",
+            return_type=return_type,
+            spellings=spellings,
+        )
+    if (
+        not taxa
+        and "taxon-spellcheck" in opts
+        and opts["taxon-spellcheck"]
+        and name_class != "spellcheck"
+    ):
+        taxa, name_class = lookup_taxon(
+            es,
+            name,
+            opts,
+            rank=rank,
+            name_class="spellcheck",
+            return_type=return_type,
+            spellings=spellings,
         )
     return taxa, name_class
 
@@ -533,8 +635,8 @@ def add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon, *, blanks={"NA", "
     return new_taxon
 
 
-def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"])):
-    """Create new taxa using alternate taxon IDs."""
+def set_ranks(taxonomy):
+    """Set ranks for species/subspecies creation."""
     default_ranks = [
         "genus",
         "family",
@@ -543,6 +645,20 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"
         "subphylum",
         "phylum",
     ]
+    if "subspecies" in taxonomy:
+        ranks = ["species"] + default_ranks
+    else:
+        ranks = default_ranks
+    return ranks
+
+
+def create_taxa(
+    es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"]), spellings=None
+):
+    """Create new taxa using alternate taxon IDs."""
+    if spellings is None:
+        spellings = {}
+
     ancestors = {}
     matches = defaultdict(dict)
     pbar = tqdm(total=len(data.keys()))
@@ -556,15 +672,14 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"
         lineage = []
         closest_rank = None
         closest_taxon = None
-        if "subspecies" in obj["taxonomy"]:
-            ranks = ["species"] + default_ranks
-        else:
-            ranks = default_ranks
+        ranks = set_ranks(obj["taxonomy"])
         max_index = len(ranks) - 1
         # max_rank = ranks[max_index]
         for index, rank in enumerate(ranks[: (max_index - 1)]):
             if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
                 continue
+            if obj["taxonomy"][rank] in spellings:
+                break
             intermediates = 0
             for anc_rank in ranks[(index + 1) :]:
                 if (

From fcadf1523dff58e0d3166db85ea6f6ac8da8e31b Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Mon, 22 Mar 2021 15:08:55 +0000
Subject: [PATCH 11/21] ensure spellchecked taxon names are not indexed

---
 src/genomehubs/lib/taxon.py | 59 ++++++-------------------------------
 1 file changed, 9 insertions(+), 50 deletions(-)

diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index 372eadfe..be30670f 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -645,11 +645,17 @@ def set_ranks(taxonomy):
         "subphylum",
         "phylum",
     ]
+    taxon_rank = None
     if "subspecies" in taxonomy:
         ranks = ["species"] + default_ranks
+        taxon_rank = "subspecies"
     else:
         ranks = default_ranks
-    return ranks
+        for rank in ["species"] + default_ranks:
+            if rank in taxonomy:
+                taxon_rank = rank
+                break
+    return ranks, taxon_rank
 
 
 def create_taxa(
@@ -672,13 +678,13 @@ def create_taxa(
         lineage = []
         closest_rank = None
         closest_taxon = None
-        ranks = set_ranks(obj["taxonomy"])
+        ranks, taxon_rank = set_ranks(obj["taxonomy"])
         max_index = len(ranks) - 1
         # max_rank = ranks[max_index]
         for index, rank in enumerate(ranks[: (max_index - 1)]):
             if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
                 continue
-            if obj["taxonomy"][rank] in spellings:
+            if obj["taxonomy"][taxon_rank] in spellings:
                 break
             intermediates = 0
             for anc_rank in ranks[(index + 1) :]:
@@ -765,50 +771,3 @@ def create_taxa(
         stream_taxa(new_taxa),
     )
     return new_taxa.keys()
-
-
-# def parse_taxa(es, types, taxonomy_template):
-#     """Test method to parse taxa."""
-#     taxa = [
-#         {
-#             "taxon_id": 110368,
-#             "assembly_span": 12344567,
-#             "c_value": 2.5,
-#             "sex_determination_system": "N/A",
-#         },
-#         {
-#             "taxon_id": 13037,
-#             "assembly_span": 2345678,
-#             "c_value": 2.3,
-#             "sex_determination_system": "XO",
-#         },
-#         {
-#             "taxon_id": 113334,
-#             "assembly_span": 45678912,
-#             "c_value": 4.6,
-#             "sex_determination_system": "XY",
-#         },
-#     ]
-#     for entry in taxa:
-#         # attributes = {}
-#         taxon_id = str(entry["taxon_id"])
-#         doc = lookup_taxon_by_taxid(es, taxon_id, taxonomy_template)
-#         if doc is None:
-#             LOGGER.warning(
-#                 "No %s taxonomy record for %s",
-#                 taxonomy_template["index_name"],
-#                 taxon_id,
-#             )
-#         attributes = add_attributes(entry, types, attributes=[])[0]
-#         doc.update({"taxon_id": taxon_id, "attributes": attributes})
-#         doc_id = "taxon_id-%s" % taxon_id
-#         yield doc_id, doc
-
-
-# def index(es, opts, *, taxonomy_name="ncbi"):
-#     """Index a set of taxa."""
-#     LOGGER.info("Indexing taxa using %s taxonomy", taxonomy_name)
-#     template = index_template(taxonomy_name, opts)
-#     taxonomy_template = taxonomy_index_template(taxonomy_name, opts)
-#     stream = parse_taxa(es, template["types"], taxonomy_template)
-#     return template, stream

From cbbe66ab9491b9158b2103c311a88573edc65b9f Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Mon, 22 Mar 2021 16:57:30 +0000
Subject: [PATCH 12/21] allow multiple limit ranks (#53)

---
 src/genomehubs/lib/fill.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index c9f9d416..3aff541a 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -255,7 +255,7 @@ def set_values_from_descendants(
             continue
         traverse_limit = meta[key].get("traverse_limit", None)
         # TODO: #53 catch traverse limits when limit rank is missing
-        if traverse_limit and taxon_rank == traverse_limit:
+        if traverse_limit and taxon_rank in traverse_limit:
             limits[key].add(parent)
         try:
             attribute = next(entry for entry in attributes if entry["key"] == key)
@@ -356,6 +356,11 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
         )
     root_depth = max_depth
     meta = template["types"]["attributes"]
+    for key, value in meta.items():
+        if "traverse_limit" in value:
+            if not isinstance(value["traverse_limit"], list):
+                value["traverse_limit"] = [value["traverse_limit"]]
+            value["traverse_limit"] = set(value["traverse_limit"])
     attrs = set(meta.keys())
     parents = defaultdict(
         lambda: defaultdict(

From c0c71aaec48130285ee96c56d31c2b6390370d0e Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Tue, 23 Mar 2021 09:57:41 +0000
Subject: [PATCH 13/21] track descendant ranks to fix leaky attributes Fixes
 #53

---
 src/genomehubs/lib/fill.py | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index 3aff541a..e333cf9f 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -234,6 +234,7 @@ def set_values_from_descendants(
     parent,
     taxon_rank,
     parents,
+    descendant_ranks=None,
     attr_dict=None,
     limits=None
 ):
@@ -249,14 +250,17 @@ def set_values_from_descendants(
             and meta[key]["traverse_direction"] == "down"
         ):
             traverseable = False
-        if not traverseable:
-            continue
-        if taxon_id in limits[key]:
+        if not traverseable or taxon_id in limits[key]:
             continue
         traverse_limit = meta[key].get("traverse_limit", None)
-        # TODO: #53 catch traverse limits when limit rank is missing
-        if traverse_limit and taxon_rank in traverse_limit:
-            limits[key].add(parent)
+        if traverse_limit:
+            if (
+                descendant_ranks is not None
+                and traverse_limit in descendant_ranks[taxon_id]
+            ):
+                continue
+            if taxon_rank == traverse_limit:
+                limits[key].add(parent)
         try:
             attribute = next(entry for entry in attributes if entry["key"] == key)
         except StopIteration:
@@ -344,6 +348,12 @@ def track_missing_attribute_values(
         )
 
 
+def track_descendant_ranks(node, descendant_ranks):
+    """Add the node's taxon_rank to its parent's entry (immediate parent only)."""
+    if "parent" in node["_source"]:
+        descendant_ranks[node["_source"]["parent"]].add(node["_source"]["taxon_rank"])
+
+
 def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
     """Traverse a tree, filling in values."""
     if root is None:
@@ -356,11 +366,11 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
         )
     root_depth = max_depth
     meta = template["types"]["attributes"]
-    for key, value in meta.items():
-        if "traverse_limit" in value:
-            if not isinstance(value["traverse_limit"], list):
-                value["traverse_limit"] = [value["traverse_limit"]]
-            value["traverse_limit"] = set(value["traverse_limit"])
+    # for key, value in meta.items():
+    #     if "traverse_limit" in value:
+    #         if not isinstance(value["traverse_limit"], list):
+    #             value["traverse_limit"] = [value["traverse_limit"]]
+    #         value["traverse_limit"] = set(value["traverse_limit"])
     attrs = set(meta.keys())
     parents = defaultdict(
         lambda: defaultdict(
@@ -371,6 +381,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
     if "traverse-infer-both" in opts and opts["traverse-infer-both"]:
         desc_attrs, desc_attr_limits = set_attributes_to_descend(meta)
         missing_attributes = defaultdict(dict)
+        descendant_ranks = defaultdict(set)
     else:
         desc_attrs = {}
     while root_depth >= 0:
@@ -384,6 +395,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
         ctr = 0
         for node in nodes:
             # TODO: break into sub functions
+            track_descendant_ranks(node, descendant_ranks)
             ctr += 1
             changed = False
             attr_dict = {}
@@ -405,6 +417,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
                     taxon_id=node["_source"]["taxon_id"],
                     parent=node["_source"].get("parent", None),
                     parents=parents,
+                    descendant_ranks=descendant_ranks,
                     taxon_rank=node["_source"]["taxon_rank"],
                     attr_dict=attr_dict,
                     limits=limits,

From 05a968dc3ab002ead08a2cc0d86300191e97f720 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Tue, 23 Mar 2021 12:17:55 +0000
Subject: [PATCH 14/21] include ancestral rank in aggregation source Fixes #51

---
 src/genomehubs/lib/fill.py          | 10 +++++++---
 src/genomehubs/templates/taxon.json | 16 ++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index a25ed1aa..17b7c1b4 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -323,10 +323,14 @@ def track_missing_attribute_values(
         for child_id, obj in missing_attributes[node["_source"]["taxon_id"]].items():
             for key, attribute in attr_dict.items():
                 if key in obj["keys"]:
-                    # update aggregation source here
-                    # TODO: #51 include ancestral rank in aggregation source
+                    # update aggregation source to include ancestral rank
                     obj["attributes"].append(
-                        {**attribute, "aggregation_source": "ancestor"}
+                        {
+                            **attribute,
+                            "aggregation_source": "ancestor",
+                            "aggregation_rank": node["_source"]["taxon_rank"],
+                            "aggregation_taxon_id": node["_source"]["taxon_id"],
+                        }
                     )
                     obj["keys"].remove(key)
             if obj["keys"]:
diff --git a/src/genomehubs/templates/taxon.json b/src/genomehubs/templates/taxon.json
index 103d5a9d..27f28cf5 100644
--- a/src/genomehubs/templates/taxon.json
+++ b/src/genomehubs/templates/taxon.json
@@ -280,6 +280,22 @@
               "description": "Summary source (direct, ancestor, descendant)"
             }
           },
+          "aggregation_rank": {
+            "type": "keyword",
+            "ignore_above": 16,
+            "normalizer": "lowercase",
+            "meta": {
+              "description": "Source rank for ancestor derived values"
+            }
+          },
+          "aggregation_taxon_id": {
+            "type": "keyword",
+            "ignore_above": 16,
+            "normalizer": "lowercase",
+            "meta": {
+              "description": "Source taxon_id for ancestor derived values"
+            }
+          },
           "comment": {
             "type": "text",
             "index": false

From d7b3f49094ede40e12e51a12fcbb8e919567af36 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Tue, 23 Mar 2021 15:09:16 +0000
Subject: [PATCH 15/21] Use preferred values in aggregation Fixes #62

---
 src/genomehubs/lib/fill.py | 43 +++++++++++++++++++++++++++++---------
 src/genomehubs/lib/hub.py  |  4 ++++
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index 17b7c1b4..2231ef52 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -119,7 +119,15 @@ def stream_descendant_nodes_missing_attributes(es, *, index, attributes, root, s
                 yield result
 
 
-def apply_summary(summary, values, *, max_value=None, min_value=None):
+def apply_summary(
+    summary,
+    values,
+    *,
+    primary_values=None,
+    summary_types=None,
+    max_value=None,
+    min_value=None
+):
     """Apply summary statistic functions."""
     summaries = {
         "count": len,
@@ -134,6 +142,10 @@ def apply_summary(summary, values, *, max_value=None, min_value=None):
         "list": list,
     }
     flattened = []
+    if summary == "primary":
+        if primary_values:
+            values = primary_values
+        summary = summary_types[0]
     for v in values:
         if isinstance(v, list):
             flattened += v
@@ -159,9 +171,14 @@ def summarise_attribute_values(
         return None, None, None
     if "summary" in meta:
         value_type = "%s_value" % meta["type"]
+        primary_values = []
         if "values" in attribute:
             if values is None:
-                values = [value[value_type] for value in attribute["values"]]
+                values = []
+                for value in attribute["values"]:
+                    values.append(value[value_type])
+                    if "is_primary_value" in value and value["is_primary_value"]:
+                        primary_values.append(value[value_type])
             else:
                 values += [value[value_type] for value in attribute["values"]]
         if not values:
@@ -171,16 +188,23 @@ def summarise_attribute_values(
         traverse_value = None
         if not isinstance(meta["summary"], list):
             meta["summary"] = [meta["summary"]]
-        for summary in meta["summary"]:
+        for index, summary in enumerate(meta["summary"]):
             value, max_value, min_value = apply_summary(
-                summary, values, max_value=max_value, min_value=min_value
+                summary,
+                values,
+                primary_values=primary_values,
+                summary_types=meta["summary"][index + 1 :] + ["median"],
+                max_value=max_value,
+                min_value=min_value,
             )
             if idx == 0:
-                attribute[value_type] = value
-                attribute["count"] = len(values)
-                attribute["aggregation_method"] = summary
-                attribute["aggregation_source"] = "direct"
-                traverse_value = value
+                if value is not None:
+                    attribute[value_type] = value
+                    attribute["count"] = len(values)
+                    attribute["aggregation_method"] = summary
+                    attribute["aggregation_source"] = "direct"
+                    traverse_value = value
+                idx += 1
             elif traverse and summary == traverse:
                 traverse_value = value
             if summary != "list":
@@ -188,7 +212,6 @@ def summarise_attribute_values(
                     summary = "median"
             else:
                 traverse_value = list(set(traverse_value))
-            idx += 1
         return traverse_value, max_value, min_value
     return None, None, None
 
diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index 22cff7b2..bd1d5cc0 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -506,6 +506,10 @@ def process_row(types, row):
                     raise err
     taxon_data = {}
     taxon_types = {}
+    if "is_primary_value" in data["metadata"]:
+        data["metadata"]["is_primary_value"] = bool(
+            int(data["metadata"]["is_primary_value"])
+        )
     for attr_type in list(["attributes", "identifiers"]):
         if attr_type in data and data[attr_type]:
             (

From bfddbc4aedaabfbfca588ff941c6b53955b6a5ac Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 24 Mar 2021 09:11:37 +0000
Subject: [PATCH 16/21] include max and min in attributes

---
 src/genomehubs/lib/fill.py | 43 +++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index 2231ef52..e3d77dc5 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -212,6 +212,9 @@ def summarise_attribute_values(
                     summary = "median"
             else:
                 traverse_value = list(set(traverse_value))
+        if isinstance(max_value, float) or isinstance(max_value, int):
+            attribute["max"] = max_value
+            attribute["min"] = min_value
         return traverse_value, max_value, min_value
     return None, None, None
 
@@ -238,13 +241,19 @@ def summarise_attributes(*, attributes, attrs, meta, parent, parents):
                             summary_value
                         )
                     if max_value is not None:
-                        parents[parent][node_attribute["key"]]["max"] = max(
-                            parents[parent][node_attribute["key"]]["max"], max_value
-                        )
+                        if parents[parent][node_attribute["key"]]["max"] is not None:
+                            parents[parent][node_attribute["key"]]["max"] = max(
+                                parents[parent][node_attribute["key"]]["max"], max_value
+                            )
+                        else:
+                            parents[parent][node_attribute["key"]]["max"] = max_value
                     if min_value is not None:
-                        parents[parent][node_attribute["key"]]["min"] = min(
-                            parents[parent][node_attribute["key"]]["min"], min_value
-                        )
+                        if parents[parent][node_attribute["key"]]["min"] is not None:
+                            parents[parent][node_attribute["key"]]["min"] = min(
+                                parents[parent][node_attribute["key"]]["min"], min_value
+                            )
+                        else:
+                            parents[parent][node_attribute["key"]]["min"] = min_value
     return changed, attr_dict
 
 
@@ -308,13 +317,19 @@ def set_values_from_descendants(
                 else:
                     parents[parent][key]["values"].append(summary_value)
                 if max_value is not None:
-                    parents[parent][key]["max"] = max(
-                        parents[parent][key]["max"], max_value
-                    )
+                    if parents[parent][key]["max"] is not None:
+                        parents[parent][key]["max"] = max(
+                            parents[parent][key]["max"], max_value
+                        )
+                    else:
+                        parents[parent][key]["max"] = max_value
                 if min_value is not None:
-                    parents[parent][key]["min"] = min(
-                        parents[parent][key]["min"], min_value
-                    )
+                    if parents[parent][key]["min"] is not None:
+                        parents[parent][key]["min"] = min(
+                            parents[parent][key]["min"], min_value
+                        )
+                    else:
+                        parents[parent][key]["min"] = min_value
     return changed, attr_dict
 
 
@@ -400,9 +415,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
     #         value["traverse_limit"] = set(value["traverse_limit"])
     attrs = set(meta.keys())
     parents = defaultdict(
-        lambda: defaultdict(
-            lambda: {"max": float("-inf"), "min": float("inf"), "values": []}
-        )
+        lambda: defaultdict(lambda: {"max": None, "min": None, "values": []})
     )
     limits = defaultdict(set)
     if "traverse-infer-both" in opts and opts["traverse-infer-both"]:

From c13edd5b9e25369d5a3c5ab1148038b1c77a6c31 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 24 Mar 2021 15:48:07 +0000
Subject: [PATCH 17/21] fix rows to imported and exceptions files

---
 src/genomehubs/lib/taxon.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index be30670f..b33b0a72 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -214,7 +214,7 @@ def fix_missing_ids(
                     with_ids[created_id] = without_ids[created_id]
                     found_ids[created_id] = True
                     del without_ids[created_id]
-    if without_ids and failed_rows:
+    if failed_rows:
         for key, value in found_ids.items():
             if key in failed_rows:
                 imported_rows += failed_rows[key]
@@ -670,11 +670,16 @@ def create_taxa(
     pbar = tqdm(total=len(data.keys()))
     taxon_ids = set({})
     new_taxa = {}
-    for alt_taxon_id, rows in data.items():
+    for rows in data.values():
         obj = rows[0]
         pbar.update(1)
-        if "taxonomy" not in obj:
+        if (
+            "taxonomy" not in obj
+            or "alt_taxon_id" not in obj["taxonomy"]
+            or obj["taxonomy"]["alt_taxon_id"] in blanks
+        ):
             continue
+        alt_taxon_id = obj["taxonomy"]["alt_taxon_id"]
         lineage = []
         closest_rank = None
         closest_taxon = None
@@ -684,7 +689,7 @@ def create_taxa(
         for index, rank in enumerate(ranks[: (max_index - 1)]):
             if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
                 continue
-            if obj["taxonomy"][taxon_rank] in spellings:
+            if obj["taxonomy"][rank] in spellings:
                 break
             intermediates = 0
             for anc_rank in ranks[(index + 1) :]:

From 05cad18fd5db9ac8cde0e236dc3c2eecac9b1a4b Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Thu, 25 Mar 2021 10:37:48 +0000
Subject: [PATCH 18/21] fix alt_taxon_id spellcheck import/exceptions

---
 src/genomehubs/lib/taxon.py | 91 +++++++++++++++++++++++++++----------
 1 file changed, 67 insertions(+), 24 deletions(-)

diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index b33b0a72..8cf8ae14 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -658,6 +658,41 @@ def set_ranks(taxonomy):
     return ranks, taxon_rank
 
 
+def create_new_taxon(
+    alt_taxon_id,
+    closest_taxon,
+    closest_rank,
+    lineage,
+    new_taxa,
+    taxon_ids,
+    obj,
+    matches,
+    taxa,
+    ancestors,
+):
+    """Create a new taxon and any required ancestral taxa, updating dicts in place."""
+    if closest_taxon is not None:  # no closest taxon given: create nothing
+        for intermediate in reversed(lineage):
+            taxon_id = generate_ancestral_taxon_id(
+                intermediate["name"],
+                intermediate["rank"],
+                alt_taxon_id=alt_taxon_id,
+                taxon_ids=taxon_ids,
+            )
+            new_taxon = create_descendant_taxon(
+                taxon_id, intermediate["rank"], intermediate["name"], closest_taxon
+            )
+            new_taxa.update({new_taxon["_source"]["taxon_id"]: new_taxon["_source"]})
+            matches[intermediate["name"]][obj["taxonomy"][closest_rank]] = taxa
+            closest_rank = intermediate["rank"]
+            closest_taxon = new_taxon  # passed on as closest_taxon for the next taxon
+        ancestors[alt_taxon_id] = closest_taxon
+        added_taxon = add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon)
+        matches[added_taxon["_source"]["scientific_name"]][
+            closest_taxon["_source"]["scientific_name"]
+        ] = [added_taxon]
+
+
 def create_taxa(
     es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"]), spellings=None
 ):
@@ -678,34 +713,49 @@ def create_taxa(
             or "alt_taxon_id" not in obj["taxonomy"]
             or obj["taxonomy"]["alt_taxon_id"] in blanks
         ):
+            # row has no alt_taxon_id
             continue
         alt_taxon_id = obj["taxonomy"]["alt_taxon_id"]
         lineage = []
         closest_rank = None
         closest_taxon = None
+        # fetch ancestral ranks and current taxon rank
         ranks, taxon_rank = set_ranks(obj["taxonomy"])
+        if (
+            taxon_rank not in obj["taxonomy"]
+            or obj["taxonomy"][taxon_rank] in blanks
+            or obj["taxonomy"][taxon_rank] in spellings
+        ):
+            # taxon name is missing or may be mis-spelled
+            continue
         max_index = len(ranks) - 1
-        # max_rank = ranks[max_index]
+        # loop through lineage to find existing ancestral taxa
         for index, rank in enumerate(ranks[: (max_index - 1)]):
             if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
+                # row has no name at this rank
                 continue
             if obj["taxonomy"][rank] in spellings:
+                # ancestral taxon name may be mis-spelled
                 break
             intermediates = 0
+            # loop through higher ranks to disambiguate name clashes
             for anc_rank in ranks[(index + 1) :]:
                 if (
                     anc_rank not in obj["taxonomy"]
                     or obj["taxonomy"][anc_rank] in blanks
                 ):
+                    # row has no name at this rank
                     continue
                 if (
                     obj["taxonomy"][rank] in matches
                     and obj["taxonomy"][anc_rank] in matches[obj["taxonomy"][rank]]
                 ):
+                    # this taxon has been seen before
                     taxa = matches[obj["taxonomy"][rank]][obj["taxonomy"][anc_rank]]
                     ancestors.update({alt_taxon_id: taxa[0]})
                     break
                 else:
+                    #  find existing ancestral taxa within a lineage
                     taxa = lookup_taxon_within_lineage(
                         es,
                         obj["taxonomy"][rank],
@@ -717,10 +767,10 @@ def create_taxa(
                     )
                 if taxa:
                     if len(taxa) == 1:
+                        #  unambiguous match to a single existing taxon
                         ancestors.update({alt_taxon_id: taxa[0]})
                         matches[obj["taxonomy"][rank]][obj["taxonomy"][anc_rank]] = taxa
                         break
-                # elif anc_rank == max_rank and intermediates == 0:
                 elif intermediates == 0:
                     taxa, name_class = lookup_taxon(
                         es,
@@ -747,32 +797,25 @@ def create_taxa(
                     closest_taxon = matches[obj["taxonomy"][anc_rank]]["all"][0]
                 break
             lineage.append({"rank": rank, "name": obj["taxonomy"][rank]})
-        if closest_taxon is not None:
-            for intermediate in reversed(lineage):
-                taxon_id = generate_ancestral_taxon_id(
-                    intermediate["name"],
-                    intermediate["rank"],
-                    alt_taxon_id=alt_taxon_id,
-                    taxon_ids=taxon_ids,
-                )
-                new_taxon = create_descendant_taxon(
-                    taxon_id, intermediate["rank"], intermediate["name"], closest_taxon
-                )
-                new_taxa.update(
-                    {new_taxon["_source"]["taxon_id"]: new_taxon["_source"]}
-                )
-                matches[intermediate["name"]][obj["taxonomy"][closest_rank]] = taxa
-                closest_rank = intermediate["rank"]
-                closest_taxon = new_taxon
-            ancestors[alt_taxon_id] = closest_taxon
-            added_taxon = add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon)
-            matches[added_taxon["_source"]["scientific_name"]][
-                closest_taxon["_source"]["scientific_name"]
-            ] = [added_taxon]
+        # create a new taxon if a closest ancestral taxon could be found
+        create_new_taxon(
+            alt_taxon_id,
+            closest_taxon,
+            closest_rank,
+            lineage,
+            new_taxa,
+            taxon_ids,
+            obj,
+            matches,
+            taxa,
+            ancestors,
+        )
     pbar.close()
+    # add new taxa to the index
     index_stream(
         es,
         taxon_template["index_name"],
         stream_taxa(new_taxa),
     )
+    # return a list of alt_taxon_ids for the created taxa
     return new_taxa.keys()

From bde7c56aef9c35c7efae9dcea78c83224c2b97bc Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Tue, 6 Apr 2021 13:43:11 +0100
Subject: [PATCH 19/21] Set default traverse limit to class Fixes #64

---
 src/genomehubs/lib/fill.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index e3d77dc5..d042c554 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -6,7 +6,7 @@
 Usage:
     genomehubs fill [--hub-name STRING] [--hub-path PATH] [--hub-version PATH]
                     [--config-file PATH...] [--config-save PATH]
-                    [--es-host URL...]
+                    [--es-host URL...] [--traverse-limit STRING]
                     [--traverse-infer-ancestors] [--traverse-infer-descendants]
                     [--traverse-infer-both] [--traverse-threads INT]
                     [--traverse-depth INT] [--traverse-root STRING]
@@ -25,6 +25,7 @@
     --traverse-infer-descendants  Flag to enable tree traversal from root to tips.
     --traverse-infer-both         Flag to enable tree traversal from tips to root and
                                   back to tips.
+    --traverse-limit STRING       Maximum rank to ascend to during traversal. [Default: class]
     --traverse-root ID            Root taxon id for tree traversal.
     --traverse-threads INT        Number of threads to use for tree traversal. [Default: 1]
     --traverse-weight STRING      Weighting scheme for setting values during tree
@@ -265,6 +266,7 @@ def set_values_from_descendants(
     taxon_id,
     parent,
     taxon_rank,
+    traverse_limit,
     parents,
     descendant_ranks=None,
     attr_dict=None,
@@ -284,7 +286,7 @@ def set_values_from_descendants(
             traverseable = False
         if not traverseable or taxon_id in limits[key]:
             continue
-        traverse_limit = meta[key].get("traverse_limit", None)
+        traverse_limit = meta[key].get("traverse_limit", traverse_limit)
         if traverse_limit:
             if (
                 descendant_ranks is not None
@@ -333,7 +335,7 @@ def set_values_from_descendants(
     return changed, attr_dict
 
 
-def set_attributes_to_descend(meta):
+def set_attributes_to_descend(meta, traverse_limit):
     """Set which attributes should have values inferred from ancestral taxa."""
     desc_attrs = set()
     desc_attr_limits = {}
@@ -346,6 +348,8 @@ def set_attributes_to_descend(meta):
                 desc_attrs.add(key)
                 if "traverse_limit" in value:
                     desc_attr_limits.update({key: value["traverse_limit"]})
+                else:
+                    desc_attr_limits.update({key: traverse_limit})
     return desc_attrs, desc_attr_limits
 
 
@@ -408,18 +412,15 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
         )
     root_depth = max_depth
     meta = template["types"]["attributes"]
-    # for key, value in meta.items():
-    #     if "traverse_limit" in value:
-    #         if not isinstance(value["traverse_limit"], list):
-    #             value["traverse_limit"] = [value["traverse_limit"]]
-    #         value["traverse_limit"] = set(value["traverse_limit"])
     attrs = set(meta.keys())
     parents = defaultdict(
         lambda: defaultdict(lambda: {"max": None, "min": None, "values": []})
     )
     limits = defaultdict(set)
     if "traverse-infer-both" in opts and opts["traverse-infer-both"]:
-        desc_attrs, desc_attr_limits = set_attributes_to_descend(meta)
+        desc_attrs, desc_attr_limits = set_attributes_to_descend(
+            meta, opts["traverse-limit"]
+        )
         missing_attributes = defaultdict(dict)
         descendant_ranks = defaultdict(set)
     else:
@@ -459,6 +460,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None):
                     parents=parents,
                     descendant_ranks=descendant_ranks,
                     taxon_rank=node["_source"]["taxon_rank"],
+                    traverse_limit=opts["traverse-limit"],
                     attr_dict=attr_dict,
                     limits=limits,
                 )

From 32f55ab793c66fc3ee84b2c8c50203d4ba91ad2e Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 7 Apr 2021 09:21:24 +0100
Subject: [PATCH 20/21] Write taxon IDs to imported/exceptions files Fixes #66

---
 src/genomehubs/lib/hub.py   | 128 ++++++++++++++++++++++++++----------
 src/genomehubs/lib/index.py |  39 ++++++-----
 src/genomehubs/lib/taxon.py |  22 ++++---
 3 files changed, 132 insertions(+), 57 deletions(-)

diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index bd1d5cc0..f0c3596f 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -1,9 +1,11 @@
 #!/usr/bin/env python3
 """Hub functions."""
 
+import csv
 import os
 import re
 import sys
+from collections import defaultdict
 from copy import deepcopy
 from pathlib import Path
 
@@ -422,6 +424,36 @@ def add_attribute_values(existing, new, *, raw=True):
                 )
 
 
+def strip_comments(data, types):
+    """Strip comment lines from a file stream."""
+    comment_chars = {"#"}
+    if "file" in types and "comment" in types["file"]:
+        comment_chars.update(set(types["file"]["comment"]))
+    for row in data:
+        if row[0] in comment_chars:
+            continue
+        yield row
+
+
+def process_names_file(types, names_file):
+    """Process a taxon names file."""
+    data = tofile.open_file_handle(names_file)
+    names = defaultdict(dict)
+    if data is None:
+        return names
+    delimiters = {"csv": ",", "tsv": "\t"}
+    rows = csv.reader(
+        strip_comments(data, types),
+        delimiter=delimiters[types["file"]["format"]],
+        quotechar='"',
+    )
+    next(rows)
+    for row in rows:
+        name = row[3] if len(row) > 3 else row[1]
+        names[row[2]][row[1]] = {"name": name, "taxon_id": row[0]}
+    return names
+
+
 def validate_types_file(types_file, dir_path):
     """Validate types file."""
     try:
@@ -441,7 +473,8 @@ def validate_types_file(types_file, dir_path):
             defaults["metadata"].update({key: value})
     types.update({"defaults": defaults})
     data = tofile.open_file_handle(Path(dir_path) / types["file"]["name"])
-    return types, data
+    names = process_names_file(types, Path(dir_path) / "names" / types["file"]["name"])
+    return types, data, names
 
 
 def set_xrefs(taxon_names, types, row, *, meta=None):
@@ -460,16 +493,8 @@ def set_xrefs(taxon_names, types, row, *, meta=None):
     return names
 
 
-def process_row(types, row):
-    """Process a row of data."""
-    data = {
-        "attributes": {},
-        "identifiers": {},
-        "metadata": {},
-        "taxon_names": {},
-        "taxonomy": {},
-        "taxon_attributes": {},
-    }
+def set_row_defaults(types, data):
+    """Set default values for a row."""
     for key in types["defaults"].keys():
         if key in types:
             for entry in types[key].values():
@@ -479,6 +504,10 @@ def process_row(types, row):
                 }
         elif key == "metadata":
             data["metadata"] = {**types["defaults"]["metadata"]}
+
+
+def process_row_values(row, types, data):
+    """Process row values."""
     for group in data.keys():
         if group in types:
             for key, meta in types[group].items():
@@ -504,6 +533,20 @@ def process_row(types, row):
                 except Exception as err:
                     LOGGER.warning("Cannot parse row '%s'" % str(row))
                     raise err
+
+
+def process_row(types, names, row):
+    """Process a row of data."""
+    data = {
+        "attributes": {},
+        "identifiers": {},
+        "metadata": {},
+        "taxon_names": {},
+        "taxonomy": {},
+        "taxon_attributes": {},
+    }
+    set_row_defaults(types, data)
+    process_row_values(row, types, data)
     taxon_data = {}
     taxon_types = {}
     if "is_primary_value" in data["metadata"]:
@@ -524,10 +567,18 @@ def process_row(types, row):
             )
         else:
             data[attr_type] = []
-    if "taxon_names" in data and data["taxon_names"]:
+    if data["taxon_names"]:
         data["taxon_names"] = set_xrefs(
             data["taxon_names"], types["taxon_names"], row, meta=data["metadata"]
         )
+    if data["taxonomy"] and names:
+        for key in names.keys():
+            if key in data["taxonomy"]:
+                if data["taxonomy"][key] in names[key]:
+                    data["taxonomy"]["taxon_id"] = names[key][data["taxonomy"][key]][
+                        "taxon_id"
+                    ]
+                    data["taxonomy"][key] = names[key][data["taxonomy"][key]]["name"]
     return data, taxon_data, taxon_types.get("attributes", {})
 
 
@@ -571,23 +622,18 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"):
     tofile.write_file(outfile, data)
 
 
-def write_spellchecked_taxa(spellings, opts, *, types, header=None):
+def write_spellchecked_taxa(spellings, opts, *, types):
     """Write spellchecked taxa to file."""
-    imported = []
     exceptions = []
     file_key = "%s-exception" % opts["index"]
     dir_key = "%s-dir" % opts["index"]
     filepath = Path(types["file"]["name"])
     extensions = "".join(filepath.suffixes)
     file_basename = str(filepath).replace(extensions, "")
-    for name, matches in spellings.items():
-        # enable test condition below if importing spellchecked taxa:
-        # if len(matches) == 1:
-        #     imported.append([name, matches[0]])
-        # else:
-        exceptions.append([name] + matches)
-    if imported:
-        label = "imported"
+    for name, obj in spellings.items():
+        exceptions.append([obj["taxon_id"], name, obj["rank"]] + obj["matches"])
+    if exceptions:
+        label = "exceptions"
         if file_key in opts and opts[file_key]:
             outdir = opts[file_key]
         else:
@@ -595,24 +641,40 @@ def write_spellchecked_taxa(spellings, opts, *, types, header=None):
         os.makedirs(outdir, exist_ok=True)
         outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename)
         LOGGER.info(
-            "Writing %d spelling corrections to %s file '%s'",
-            len(imported),
+            "Writing %d spelling suggestions to %s file '%s'",
+            len(exceptions),
             label,
             outfile,
         )
-        tofile.write_file(outfile, [["input", "corrected"]] + imported)
-    if exceptions:
-        label = "exceptions"
+        tofile.write_file(
+            outfile, [["taxon_id", "input", "rank", "suggested"]] + exceptions
+        )
+
+
+def write_imported_taxa(taxa, opts, *, types):
+    """Write imported taxa to file."""
+    imported = []
+    file_key = "%s-exception" % opts["index"]
+    dir_key = "%s-dir" % opts["index"]
+    filepath = Path(types["file"]["name"])
+    extensions = "".join(filepath.suffixes)
+    file_basename = str(filepath).replace(extensions, "")
+    for name, arr in taxa.items():
+        prefix = "#" if len(arr) > 1 else ""
+        for obj in arr:
+            imported.append(
+                ["%s%s" % (prefix, str(obj["taxon_id"])), name, obj["rank"]]
+            )
+    if imported:
         if file_key in opts and opts[file_key]:
             outdir = opts[file_key]
         else:
-            outdir = "%s/%s" % (opts[dir_key], label)
+            outdir = "%s/imported" % opts[dir_key]
         os.makedirs(outdir, exist_ok=True)
-        outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename)
+        outfile = "%s/%s" % (outdir, "%s.taxon_ids.tsv" % file_basename)
         LOGGER.info(
-            "Writing %d spelling suggestions to %s file '%s'",
-            len(exceptions),
-            label,
+            "Writing %d taxon_ids to imported file '%s'",
+            len(imported),
             outfile,
         )
-        tofile.write_file(outfile, [["input", "suggested"]] + exceptions)
+        tofile.write_file(outfile, [["taxon_id", "input", "rank"]] + imported)
diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index 32ccea37..75766801 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -72,8 +72,10 @@
 from .files import index_metadata
 from .hub import process_row
 from .hub import set_column_indices
+from .hub import strip_comments
 from .hub import validate_types_file
 from .hub import write_imported_rows
+from .hub import write_imported_taxa
 from .hub import write_spellchecked_taxa
 from .taxon import add_names_and_attributes_to_taxa
 from .taxon import fix_missing_ids
@@ -87,18 +89,19 @@ def not_blank(key, obj, blanks):
     return key in obj and obj[key] and obj[key] not in blanks
 
 
-def strip_comments(data, types):
-    """Strip comment lines from a file stream."""
-    comment_chars = {"#"}
-    if "file" in types and "comment" in types["file"]:
-        comment_chars.update(set(types["file"]["comment"]))
-    for row in data:
-        if row[0] in comment_chars:
-            continue
-        yield row
+def summarise_imported_taxa(docs, imported_taxa):
+    """Summarise taxon information from a stream of taxon docs."""
+    for entry_id, entry in docs:
+        imported_taxa[entry["scientific_name"]].append(
+            {
+                "taxon_id": entry["taxon_id"],
+                "rank": entry["taxon_rank"],
+            }
+        )
+        yield entry_id, entry
 
 
-def index_file(es, types, data, opts):
+def index_file(es, types, names, data, opts):
     """Index a file."""
     delimiters = {"csv": ",", "tsv": "\t"}
     rows = csv.reader(
@@ -123,7 +126,9 @@ def index_file(es, types, data, opts):
         LOGGER.info("Processing rows")
         for row in tqdm(rows):
             try:
-                processed_data, taxon_data, new_taxon_types = process_row(types, row)
+                processed_data, taxon_data, new_taxon_types = process_row(
+                    types, names, row
+                )
             except Exception as err:
                 print(err)
                 failed_rows["None"].append(row)
@@ -179,7 +184,7 @@ def index_file(es, types, data, opts):
             header=header,
             spellings=spellings,
         )
-        write_spellchecked_taxa(spellings, opts, types=types, header=header)
+        write_spellchecked_taxa(spellings, opts, types=types)
         if with_ids or create_ids:
             write_imported_rows(
                 imported_rows, opts, types=types, header=header, label="imported"
@@ -189,12 +194,14 @@ def index_file(es, types, data, opts):
                 docs = add_names_and_attributes_to_taxa(
                     es, dict(with_ids), opts, template=taxon_template, blanks=blanks
                 )
+                imported_taxa = defaultdict(list)
                 index_stream(
                     es,
                     taxon_template["index_name"],
-                    docs,
+                    summarise_imported_taxa(docs, imported_taxa),
                     _op_type="update",
                 )
+                write_imported_taxa(imported_taxa, opts, types=types)
             elif opts["index"] == "assembly":
                 # TODO: keep track of taxon_id not found exceptions
                 assembly_template = assembly.index_template(taxonomy_name, opts)
@@ -244,22 +251,24 @@ def main(args):
         if data_dir in options["index"]:
             dir_path = options["index"][data_dir]
             for types_file in sorted(Path(dir_path).glob("*.names.yaml")):
-                types, data = validate_types_file(types_file, dir_path)
+                types, data, names = validate_types_file(types_file, dir_path)
                 LOGGER.info("Indexing %s" % types["file"]["name"])
                 index_types(es, index, types, options["index"])
                 index_file(
                     es,
                     types,
+                    names,
                     data,
                     {**options["index"], "index": index, "index_types": index_types},
                 )
             for types_file in sorted(Path(dir_path).glob("*.types.yaml")):
-                types, data = validate_types_file(types_file, dir_path)
+                types, data, names = validate_types_file(types_file, dir_path)
                 LOGGER.info("Indexing %s" % types["file"]["name"])
                 index_types(es, index, types, options["index"])
                 index_file(
                     es,
                     types,
+                    names,
                     data,
                     {**options["index"], "index": index, "index_types": index_types},
                 )
diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index 8cf8ae14..ba93a25f 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -451,10 +451,11 @@ def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, retur
                 if option.get("collate_match", False)
             ]
         except KeyError:
-            return None
+            return None, rank, None
         except ValueError:
-            return None
-    if matches and len(matches) > 1:
+            return None, rank, None
+    taxon_id = None
+    if matches:
         taxon_matches = {}
         scientific_name = None
         for match in matches:
@@ -466,14 +467,15 @@ def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, retur
                 es, body, index, taxonomy_index_template, opts, return_type="taxon"
             )
             if len(taxa) > 1:
-                return matches
+                return None, rank, matches
             for taxon in taxa:
                 source = taxon["_source"]
-                taxon_matches[source["taxon_id"]] = source["scientific_name"]
+                taxon_id = source["taxon_id"]
+                taxon_matches[taxon_id] = source["scientific_name"]
                 scientific_name = source["scientific_name"]
         if len(taxon_matches.keys()) == 1:
-            return [scientific_name]
-    return matches
+            return taxon_id, rank, [scientific_name]
+    return None, rank, matches
 
 
 def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type):
@@ -523,11 +525,13 @@ def lookup_taxon(
     if name_class in {"any", "spellcheck"}:
         body.update({"id": "taxon_by_any_name"})
     if name_class == "spellcheck":
-        matches = spellcheck_taxon(
+        taxon_id, rank, matches = spellcheck_taxon(
             es, name, index, rank, taxonomy_index_template, opts, return_type
         )
         if matches:
-            spellings.update({name: matches})
+            spellings.update(
+                {name: {"matches": matches, "taxon_id": taxon_id, "rank": rank}}
+            )
         return [], name_class
         # Uncomment code blow to use suggestion in current import
         # if matches and len(matches) == 1:

From 073702eec6b19e9b6b8b31e5dec881d42c5da9f7 Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Wed, 7 Apr 2021 16:44:18 +0100
Subject: [PATCH 21/21] add in memory taxon name lookup Fixes #68

---
 src/genomehubs/lib/hub.py                     |  42 ++--
 src/genomehubs/lib/index.py                   | 227 +++++++++---------
 src/genomehubs/lib/taxon.py                   | 192 ++++++++++++---
 .../templates/scripts/taxon_names.json        |  18 ++
 .../scripts/taxon_names_by_root.json          |  26 ++
 5 files changed, 345 insertions(+), 160 deletions(-)
 create mode 100644 src/genomehubs/templates/scripts/taxon_names.json
 create mode 100644 src/genomehubs/templates/scripts/taxon_names_by_root.json

diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index f0c3596f..a068f9a1 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -624,31 +624,31 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"):
 
 def write_spellchecked_taxa(spellings, opts, *, types):
     """Write spellchecked taxa to file."""
-    exceptions = []
-    file_key = "%s-exception" % opts["index"]
     dir_key = "%s-dir" % opts["index"]
     filepath = Path(types["file"]["name"])
     extensions = "".join(filepath.suffixes)
     file_basename = str(filepath).replace(extensions, "")
-    for name, obj in spellings.items():
-        exceptions.append([obj["taxon_id"], name, obj["rank"]] + obj["matches"])
-    if exceptions:
-        label = "exceptions"
-        if file_key in opts and opts[file_key]:
-            outdir = opts[file_key]
-        else:
-            outdir = "%s/%s" % (opts[dir_key], label)
-        os.makedirs(outdir, exist_ok=True)
-        outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename)
-        LOGGER.info(
-            "Writing %d spelling suggestions to %s file '%s'",
-            len(exceptions),
-            label,
-            outfile,
-        )
-        tofile.write_file(
-            outfile, [["taxon_id", "input", "rank", "suggested"]] + exceptions
-        )
+    dirs = {
+        "spellcheck": "exceptions",
+        "synonym": "imported",
+    }
+    for group in dirs.keys():
+        taxa = []
+        for name, obj in spellings[group].items():
+            taxa.append([obj["taxon_id"], name, obj["rank"]] + obj["matches"])
+        if taxa:
+            outdir = "%s/%s" % (opts[dir_key], dirs[group])
+            os.makedirs(outdir, exist_ok=True)
+            outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename)
+            LOGGER.info(
+                "Writing %d %s suggestions to spellcheck file '%s'",
+                len(taxa),
+                group,
+                outfile,
+            )
+            tofile.write_file(
+                outfile, [["taxon_id", "input", "rank", "suggested"]] + taxa
+            )
 
 
 def write_imported_taxa(taxa, opts, *, types):
diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index 75766801..1b18840d 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -9,7 +9,9 @@
                      [--es-host URL...] [--assembly-dir PATH]
                      [--assembly-repo URL] [--assembly-exception PATH]
                      [--taxon-dir PATH] [--taxon-repo URL] [--taxon-exception PATH]
-                     [--taxon-lookup STRING] [--taxon-spellcheck]
+                     [--taxon-lookup STRING] [--taxon-lookup-root STRING]
+                     [--taxon-lookup-in-memory]
+                     [--taxon-spellcheck]
                      [--file PATH...] [file-dir PATH...]
                      [--remote-file URL...] [--remote-file-dir URL...]
                      [--taxon-id STRING] [--assembly-id STRING] [--analysis-id STRING]
@@ -27,7 +29,9 @@
     --assembly-repo URL        Remote git repository containing assembly-level data.
                                Optionally include `~branch-name` suffix.
     --assembly-exception PATH  Path to directory to write assembly data that failed to import.
+    --taxon-lookup-root STRING Root taxon Id for in-memory lookup.
     --taxon-lookup STRING      Taxon name class to lookup (scientific|any). [Default: scientific]
+    --taxon-lookup-in-memory   Flag to use in-memory taxon name lookup.
     --taxon-spellcheck         Flag to use fuzzy matching to match taxon names.
     --taxon-dir PATH           Path to directory containing taxon-level data.
     --taxon-repo URL           Remote git repository containing taxon-level data.
@@ -79,6 +83,7 @@
 from .hub import write_spellchecked_taxa
 from .taxon import add_names_and_attributes_to_taxa
 from .taxon import fix_missing_ids
+from .taxon import load_taxon_table
 from .version import __version__
 
 LOGGER = tolog.logger(__name__)
@@ -101,7 +106,7 @@ def summarise_imported_taxa(docs, imported_taxa):
         yield entry_id, entry
 
 
-def index_file(es, types, names, data, opts):
+def index_file(es, types, names, data, opts, *, taxon_table=None):
     """Index a file."""
     delimiters = {"csv": ",", "tsv": "\t"}
     rows = csv.reader(
@@ -121,118 +126,113 @@ def index_file(es, types, names, data, opts):
     imported_rows = []
     blanks = set(["", "NA", "N/A", "None"])
     taxon_types = {}
-    for taxonomy_name in opts["taxonomy-source"]:
-        taxon_template = taxon.index_template(taxonomy_name, opts)
-        LOGGER.info("Processing rows")
-        for row in tqdm(rows):
-            try:
-                processed_data, taxon_data, new_taxon_types = process_row(
-                    types, names, row
+    taxonomy_name = opts["taxonomy-source"][0]
+    taxon_template = taxon.index_template(taxonomy_name, opts)
+    LOGGER.info("Processing rows")
+    for row in tqdm(rows):
+        try:
+            processed_data, taxon_data, new_taxon_types = process_row(types, names, row)
+        except Exception as err:
+            print(err)
+            failed_rows["None"].append(row)
+            continue
+        taxon_types.update(new_taxon_types)
+        if not_blank("taxon_id", processed_data["taxonomy"], blanks):
+            with_ids[processed_data["taxonomy"]["taxon_id"]].append(processed_data)
+            taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append(taxon_data)
+            imported_rows.append(row)
+        else:
+            if "taxonomy" in types and not_blank(
+                "alt_taxon_id", processed_data["taxonomy"], blanks
+            ):
+                without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append(
+                    processed_data
                 )
-            except Exception as err:
-                print(err)
-                failed_rows["None"].append(row)
-                continue
-            taxon_types.update(new_taxon_types)
-            if not_blank("taxon_id", processed_data["taxonomy"], blanks):
-                with_ids[processed_data["taxonomy"]["taxon_id"]].append(processed_data)
-                taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append(
+                taxon_asm_data[processed_data["taxonomy"]["alt_taxon_id"]].append(
                     taxon_data
                 )
-                imported_rows.append(row)
-            else:
-                if "taxonomy" in types and not_blank(
-                    "alt_taxon_id", processed_data["taxonomy"], blanks
-                ):
-                    without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append(
-                        processed_data
-                    )
-                    taxon_asm_data[processed_data["taxonomy"]["alt_taxon_id"]].append(
-                        taxon_data
-                    )
-                    failed_rows[processed_data["taxonomy"]["alt_taxon_id"]].append(row)
-                elif not_blank("subspecies", processed_data["taxonomy"], blanks):
-                    without_ids[processed_data["taxonomy"]["subspecies"]].append(
-                        processed_data
-                    )
-                    taxon_asm_data[processed_data["taxonomy"]["subspecies"]].append(
-                        taxon_data
-                    )
-                    failed_rows[processed_data["taxonomy"]["subspecies"]].append(row)
-                elif not_blank("species", processed_data["taxonomy"], blanks):
-                    without_ids[processed_data["taxonomy"]["species"]].append(
-                        processed_data
-                    )
-                    taxon_asm_data[processed_data["taxonomy"]["species"]].append(
-                        taxon_data
-                    )
-                    failed_rows[processed_data["taxonomy"]["species"]].append(row)
-                else:
-                    failed_rows["None"].append(row)
-        LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys()))
-        spellings = {}
-        create_ids, without_ids = fix_missing_ids(
-            es,
-            opts,
-            without_ids,
-            types=types,
-            taxon_template=taxon_template,
-            failed_rows=failed_rows,
-            imported_rows=imported_rows,
-            with_ids=with_ids,
-            blanks=blanks,
-            header=header,
-            spellings=spellings,
-        )
-        write_spellchecked_taxa(spellings, opts, types=types)
-        if with_ids or create_ids:
-            write_imported_rows(
-                imported_rows, opts, types=types, header=header, label="imported"
-            )
-            LOGGER.info("Indexing %d entries", len(with_ids.keys()))
-            if opts["index"] == "taxon":
-                docs = add_names_and_attributes_to_taxa(
-                    es, dict(with_ids), opts, template=taxon_template, blanks=blanks
+                failed_rows[processed_data["taxonomy"]["alt_taxon_id"]].append(row)
+            elif not_blank("subspecies", processed_data["taxonomy"], blanks):
+                without_ids[processed_data["taxonomy"]["subspecies"]].append(
+                    processed_data
                 )
-                imported_taxa = defaultdict(list)
-                index_stream(
-                    es,
-                    taxon_template["index_name"],
-                    summarise_imported_taxa(docs, imported_taxa),
-                    _op_type="update",
-                )
-                write_imported_taxa(imported_taxa, opts, types=types)
-            elif opts["index"] == "assembly":
-                # TODO: keep track of taxon_id not found exceptions
-                assembly_template = assembly.index_template(taxonomy_name, opts)
-                docs = add_identifiers_and_attributes_to_assemblies(
-                    es,
-                    with_ids,
-                    opts,
-                    template=assembly_template,
-                    taxon_template=taxon_template,
-                    blanks=blanks,
-                )
-                index_stream(es, assembly_template["index_name"], docs)
-                # index taxon-level attributes
-                index_types(
-                    es,
-                    "taxon",
-                    {"attributes": taxon_types},
-                    opts,
-                )
-                taxon_asm_with_ids = {
-                    taxon_id: taxon_asm_data[taxon_id] for taxon_id in with_ids.keys()
-                }
-                taxon_docs = add_names_and_attributes_to_taxa(
-                    es, taxon_asm_with_ids, opts, template=taxon_template, blanks=blanks
+                taxon_asm_data[processed_data["taxonomy"]["subspecies"]].append(
+                    taxon_data
                 )
-                index_stream(
-                    es,
-                    taxon_template["index_name"],
-                    taxon_docs,
-                    _op_type="update",
+                failed_rows[processed_data["taxonomy"]["subspecies"]].append(row)
+            elif not_blank("species", processed_data["taxonomy"], blanks):
+                without_ids[processed_data["taxonomy"]["species"]].append(
+                    processed_data
                 )
+                taxon_asm_data[processed_data["taxonomy"]["species"]].append(taxon_data)
+                failed_rows[processed_data["taxonomy"]["species"]].append(row)
+            else:
+                failed_rows["None"].append(row)
+    LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys()))
+    spellings = {"spellcheck": {}, "synonym": {}}
+    create_ids, without_ids = fix_missing_ids(
+        es,
+        opts,
+        without_ids,
+        types=types,
+        taxon_template=taxon_template,
+        failed_rows=failed_rows,
+        imported_rows=imported_rows,
+        with_ids=with_ids,
+        blanks=blanks,
+        header=header,
+        spellings=spellings,
+        taxon_table=taxon_table,
+    )
+    write_spellchecked_taxa(spellings, opts, types=types)
+    if with_ids or create_ids:
+        write_imported_rows(
+            imported_rows, opts, types=types, header=header, label="imported"
+        )
+        LOGGER.info("Indexing %d entries", len(with_ids.keys()))
+        if opts["index"] == "taxon":
+            docs = add_names_and_attributes_to_taxa(
+                es, dict(with_ids), opts, template=taxon_template, blanks=blanks
+            )
+            imported_taxa = defaultdict(list)
+            index_stream(
+                es,
+                taxon_template["index_name"],
+                summarise_imported_taxa(docs, imported_taxa),
+                _op_type="update",
+            )
+            write_imported_taxa(imported_taxa, opts, types=types)
+        elif opts["index"] == "assembly":
+            # TODO: keep track of taxon_id not found exceptions
+            assembly_template = assembly.index_template(taxonomy_name, opts)
+            docs = add_identifiers_and_attributes_to_assemblies(
+                es,
+                with_ids,
+                opts,
+                template=assembly_template,
+                taxon_template=taxon_template,
+                blanks=blanks,
+            )
+            index_stream(es, assembly_template["index_name"], docs)
+            # index taxon-level attributes
+            index_types(
+                es,
+                "taxon",
+                {"attributes": taxon_types},
+                opts,
+            )
+            taxon_asm_with_ids = {
+                taxon_id: taxon_asm_data[taxon_id] for taxon_id in with_ids.keys()
+            }
+            taxon_docs = add_names_and_attributes_to_taxa(
+                es, taxon_asm_with_ids, opts, template=taxon_template, blanks=blanks
+            )
+            index_stream(
+                es,
+                taxon_template["index_name"],
+                taxon_docs,
+                _op_type="update",
+            )
 
 
 def main(args):
@@ -246,6 +246,14 @@ def main(args):
     with tolog.DisableLogger():
         hub.post_search_scripts(es)
 
+    taxonomy_name = options["index"]["taxonomy-source"][0]
+    taxon_table = None
+    if taxon_table is None and "taxon-lookup-in-memory" in options["index"]:
+        taxon_table = {
+            "scientific": defaultdict(list),
+            "any": defaultdict(list),
+        }
+        load_taxon_table(es, options["index"], taxonomy_name, taxon_table)
     for index in list(["taxon", "assembly"]):
         data_dir = "%s-dir" % index
         if data_dir in options["index"]:
@@ -260,6 +268,7 @@ def main(args):
                     names,
                     data,
                     {**options["index"], "index": index, "index_types": index_types},
+                    taxon_table=taxon_table,
                 )
             for types_file in sorted(Path(dir_path).glob("*.types.yaml")):
                 types, data, names = validate_types_file(types_file, dir_path)
@@ -271,9 +280,9 @@ def main(args):
                     names,
                     data,
                     {**options["index"], "index": index, "index_types": index_types},
+                    taxon_table=taxon_table,
                 )
     # TODO: #29 Implement alternate backbone taxonomies
-    taxonomy_name = options["index"]["taxonomy-source"][0]
     if "file" in options["index"]:
         index_files(es, options["index"]["file"], taxonomy_name, options["index"])
     elif "file-metadata" in options["index"]:
diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index ba93a25f..abb64846 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -12,6 +12,7 @@
 from .es_functions import document_by_id
 from .es_functions import index_stream
 from .es_functions import query_value_template
+from .es_functions import stream_template_search_results
 from .hub import add_attribute_values
 from .hub import chunks
 from .hub import index_templator
@@ -76,13 +77,20 @@ def lookup_taxa_by_taxon_id(es, values, template, *, return_type="list"):
 
 
 def lookup_missing_taxon_ids(
-    es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]), spellings=None
+    es,
+    without_ids,
+    opts,
+    *,
+    with_ids=None,
+    blanks=set(["NA", "None"]),
+    spellings=None,
+    taxon_table=None,
 ):
     """Lookup taxon ID based on available taxonomic information."""
     if with_ids is None:
         with_ids = {}
     if spellings is None:
-        spellings = {}
+        spellings = {"spellcheck": {}, "synonym": {}}
     # TODO: set this list from types file
     ranks = [
         "subspecies",
@@ -104,21 +112,39 @@ def lookup_missing_taxon_ids(
             for index, rank in enumerate(ranks):
                 if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
                     continue
-                taxon_ids, name_class = lookup_taxon(
-                    es, obj["taxonomy"][rank], opts, rank=rank, spellings=spellings
+                taxa, name_class = lookup_taxon(
+                    es,
+                    obj["taxonomy"][rank],
+                    opts,
+                    rank=rank,
+                    return_type="taxon",
+                    spellings=spellings,
+                    taxon_table=taxon_table,
                 )
-                if index == 1 and not taxon_ids:
+                if index == 1 and not taxa:
                     break
-                if len(taxon_ids) == 1:
-                    if taxon_ids[0] in with_ids:
-                        with_ids[taxon_ids[0]].append(obj)
+                if len(taxa) == 1:
+                    obj.update({"input_name": obj["taxonomy"][rank]})
+                    taxon = taxa[0]["_source"]
+                    if obj["taxonomy"][rank] != taxon["scientific_name"]:
+                        spellings["synonym"].update(
+                            {
+                                obj["taxonomy"][rank]: {
+                                    "matches": [taxon["scientific_name"]],
+                                    "taxon_id": taxon["taxon_id"],
+                                    "rank": rank,
+                                }
+                            }
+                        )
+                    if taxon["taxon_id"] in with_ids:
+                        with_ids[taxon["taxon_id"]].append(obj)
                     else:
                         obj["attributes"] = [obj["attributes"]]
-                        with_ids[taxon_ids[0]] = [obj]
+                        with_ids[taxon["taxon_id"]] = [obj]
                         LOGGER.debug(
                             "Matched %s with taxon_id %s",
                             obj["taxonomy"][rank],
-                            taxon_ids[0],
+                            taxon["taxon_id"],
                         )
                     found_keys.append(key)
                 break
@@ -170,6 +196,58 @@ def lookup_missing_taxon_ids(
     return with_ids, without_ids, found_ids
 
 
+def stream_taxon_names(es, *, index, root=None, size=1000):
+    """Stream taxon name records from a stored search template.
+
+    Uses the "taxon_names_by_root" template (with `root` as its parameter)
+    when `root` is given, otherwise the unfiltered "taxon_names" template.
+    `size` is forwarded to `stream_template_search_results` in both cases.
+    Returns whatever that helper returns (consumed as an iterable of hits).
+    """
+    if root is not None:
+        body = {"id": "taxon_names_by_root", "params": {"root": root}}
+    else:
+        body = {"id": "taxon_names", "params": {}}
+    return stream_template_search_results(es, index=index, body=body, size=size)
+
+
+def load_taxon_table(es, opts, taxonomy_name, taxon_table):
+    """Load all taxa into memory for taxon name lookup and spellcheck.
+
+    Appends one record per taxon to taxon_table["scientific"] under its
+    scientific name and to taxon_table["any"] under every distinct name.
+    Processing of a node stops at its first missing field (logged at debug).
+    """
+    LOGGER.info("Loading taxa into memory for taxon name lookup")
+    taxon_template = index_template(taxonomy_name, opts)
+    root = opts.get("taxon-lookup-root")
+    for node in tqdm(
+        stream_taxon_names(es, index=taxon_template["index_name"], root=root)
+    ):
+        try:
+            source = node["_source"]
+            scientific_name = source["scientific_name"]
+            taxon = {
+                "taxon_id": source["taxon_id"],
+                "taxon_rank": source["taxon_rank"],
+                "scientific_name": scientific_name,
+                "lineage": {
+                    anc["taxon_rank"]: anc["scientific_name"]
+                    for anc in source["lineage"]
+                },
+                "attributes": source.get("attributes", []),
+            }
+            taxon_table["scientific"][scientific_name].append(taxon)
+            taxon_table["any"][scientific_name].append(taxon)
+            node_names = {scientific_name}
+            for obj in source["taxon_names"]:
+                if obj["name"] not in node_names:
+                    node_names.add(obj["name"])
+                    taxon_table["any"][obj["name"]].append(taxon)
+        except KeyError as err:
+            LOGGER.debug("Skipping incomplete taxon record (missing %s)", err)
+
+
 def fix_missing_ids(
     es,
     opts,
@@ -183,17 +261,24 @@ def fix_missing_ids(
     blanks=set(["NA", "None"]),
     header=None,
     spellings=None,
+    taxon_table=None,
 ):
     """Find or create taxon IDs for rows without."""
     if with_ids is None:
         with_ids = {}
     if spellings is None:
-        spellings = {}
+        spellings = {"spellcheck": {}, "synonym": {}}
     if without_ids:
         # TODO: support multiple taxonomies
         LOGGER.info("Looking up %d missing taxon IDs", len(without_ids.keys()))
         with_ids, without_ids, found_ids = lookup_missing_taxon_ids(
-            es, without_ids, opts, with_ids=with_ids, blanks=blanks, spellings=spellings
+            es,
+            without_ids,
+            opts,
+            with_ids=with_ids,
+            blanks=blanks,
+            spellings=spellings,
+            taxon_table=taxon_table,
         )
         # create new taxon IDs
         if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]:
@@ -436,7 +521,7 @@ def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, retur
     """Look up taxon name with fuzzy matching."""
     taxon_suggest = {
         "id": "taxon_suggest",
-        "params": {"searchTerm": name, "max_errors": 3},
+        "params": {"searchTerm": name},
     }
     matches = None
     with tolog.DisableLogger():
@@ -503,19 +588,17 @@ def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type):
     return taxa
 
 
-def lookup_taxon(
+def lookup_taxon_in_index(
     es,
     name,
     opts,
     *,
-    rank=None,
-    name_class="scientific",
-    return_type="taxon_id",
-    spellings=None,
+    rank,
+    name_class,
+    return_type,
+    spellings,
 ):
-    """Lookup taxon ID."""
-    if spellings is None:
-        spellings = {}
+    """Lookup taxon in Elasticsearch index."""
     template = index_template(opts["taxonomy-source"][0], opts)
     index = template["index_name"]
     body = {
@@ -529,16 +612,63 @@ def lookup_taxon(
             es, name, index, rank, taxonomy_index_template, opts, return_type
         )
         if matches:
-            spellings.update(
+            spellings["spellcheck"].update(
                 {name: {"matches": matches, "taxon_id": taxon_id, "rank": rank}}
             )
         return [], name_class
-        # Uncomment code blow to use suggestion in current import
-        # if matches and len(matches) == 1:
-        #     body["params"].update({"taxon": matches[0]})
-        # else:
-        #     return [], name_class
     taxa = taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type)
+    return taxa
+
+
+def lookup_taxon_in_memory(
+    name, opts, *, rank, name_class, return_type, spellings, taxon_table
+):
+    """Return in-memory taxon_table matches for name under name_class.
+
+    Gives taxon IDs if return_type is "taxon_id", otherwise copies of the
+    records wrapped as {"_source": record}; an empty list if nothing matches.
+    """
+    # .get() never inserts keys, even if the inner tables are defaultdicts
+    records = taxon_table.get(name_class, {}).get(name, [])
+    if return_type == "taxon_id":
+        return [record["taxon_id"] for record in records]
+    return [{"_source": {**record}} for record in records]
+
+
+def lookup_taxon(
+    es,
+    name,
+    opts,
+    *,
+    rank=None,
+    name_class="scientific",
+    return_type="taxon_id",
+    spellings=None,
+    taxon_table=None,
+):
+    """Lookup taxon ID."""
+    if spellings is None:
+        spellings = {"spellcheck": {}, "synonym": {}}
+    if taxon_table is None or name_class == "spellcheck":
+        taxa = lookup_taxon_in_index(
+            es,
+            name,
+            opts,
+            rank=rank,
+            name_class=name_class,
+            return_type=return_type,
+            spellings=spellings,
+        )
+    else:
+        taxa = lookup_taxon_in_memory(
+            name,
+            opts,
+            rank=rank,
+            name_class=name_class,
+            return_type=return_type,
+            spellings=spellings,
+            taxon_table=taxon_table,
+        )
     if (
         not taxa
         and opts["taxon-lookup"] == "any"
@@ -552,6 +682,7 @@ def lookup_taxon(
             name_class="any",
             return_type=return_type,
             spellings=spellings,
+            taxon_table=taxon_table,
         )
     if (
         not taxa
@@ -567,6 +698,7 @@ def lookup_taxon(
             name_class="spellcheck",
             return_type=return_type,
             spellings=spellings,
+            taxon_table=taxon_table,
         )
     return taxa, name_class
 
@@ -702,8 +834,7 @@ def create_taxa(
 ):
     """Create new taxa using alternate taxon IDs."""
     if spellings is None:
-        spellings = {}
-
+        spellings = {"spellcheck": {}, "synonym": {}}
     ancestors = {}
     matches = defaultdict(dict)
     pbar = tqdm(total=len(data.keys()))
@@ -759,7 +890,8 @@ def create_taxa(
                     ancestors.update({alt_taxon_id: taxa[0]})
                     break
                 else:
-                    #  find existing ancestral taxa within a lineage
+                    # find existing ancestral taxa within a lineage
+                    # TODO: make an in memory version of this lookup
                     taxa = lookup_taxon_within_lineage(
                         es,
                         obj["taxonomy"][rank],
diff --git a/src/genomehubs/templates/scripts/taxon_names.json b/src/genomehubs/templates/scripts/taxon_names.json
new file mode 100644
index 00000000..19c9c128
--- /dev/null
+++ b/src/genomehubs/templates/scripts/taxon_names.json
@@ -0,0 +1,18 @@
+{
+  "script": {
+    "lang": "mustache",
+    "source": {
+      "query": {
+        "match_all": {}
+      },
+      "_source": [
+        "taxon_id",
+        "taxon_rank",
+        "scientific_name",
+        "lineage.*",
+        "taxon_names.*",
+        "attributes"
+      ]
+    }
+  }
+}
diff --git a/src/genomehubs/templates/scripts/taxon_names_by_root.json b/src/genomehubs/templates/scripts/taxon_names_by_root.json
new file mode 100644
index 00000000..0fc44f7b
--- /dev/null
+++ b/src/genomehubs/templates/scripts/taxon_names_by_root.json
@@ -0,0 +1,26 @@
+{
+  "script": {
+    "lang": "mustache",
+    "source": {
+      "query": {
+        "nested": {
+          "path": "lineage",
+          "query": {
+            "multi_match": {
+              "query": "{{root}}",
+              "fields": ["lineage.taxon_id", "lineage.scientific_name"]
+            }
+          }
+        }
+      },
+      "_source": [
+        "taxon_id",
+        "taxon_rank",
+        "scientific_name",
+        "lineage.*",
+        "taxon_names.*",
+        "attributes"
+      ]
+    }
+  }
+}