From 2025061d04d03e981bf416afd283d74563cf262b Mon Sep 17 00:00:00 2001
From: Richard Challis <rjchallis@gmail.com>
Date: Mon, 22 Mar 2021 14:01:04 +0000
Subject: [PATCH] Check spelling when indexing Fixes #58

---
 src/genomehubs/lib/fill.py  |   2 -
 src/genomehubs/lib/hub.py   |  49 ++++++++++-
 src/genomehubs/lib/index.py |  10 ++-
 src/genomehubs/lib/taxon.py | 161 ++++++++++++++++++++++++++++++------
 4 files changed, 194 insertions(+), 28 deletions(-)

diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py
index c9f9d416..01ce066d 100644
--- a/src/genomehubs/lib/fill.py
+++ b/src/genomehubs/lib/fill.py
@@ -430,8 +430,6 @@ def copy_attribute_summary(source, meta):
     try:
         dest["%s_value" % meta["type"]] = source["%s_value" % meta["type"]]
     except KeyError as err:
-        print(source)
-        print(meta)
         raise (err)
     dest["count"] = source["count"]
     dest["key"] = source["key"]
diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index 64831ed7..22cff7b2 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -562,6 +562,53 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"):
         for row in rows:
             data.append(row)
     LOGGER.info(
-        "Writing %d records to %s file '%s", len(data) - header_len, label, outfile
+        "Writing %d records to %s file '%s'", len(data) - header_len, label, outfile
     )
     tofile.write_file(outfile, data)
+
+
+def write_spellchecked_taxa(spellings, opts, *, types, header=None):
+    """Write spellcheck results for unmatched taxon names to TSV files.
+
+    Args:
+        spellings: dict mapping each input name to its list of matches.
+        opts: options dict; reads "index" plus the "<index>-exception" and
+            "<index>-dir" entries to choose the output directory.
+        types: types dict; types["file"]["name"] seeds the output filename.
+        header: accepted but not used.
+    """
+    exception_key = "%s-exception" % opts["index"]
+    base_dir_key = "%s-dir" % opts["index"]
+    source = Path(types["file"]["name"])
+    stem = str(source).replace("".join(source.suffixes), "")
+    corrections = []
+    # To import spellchecked taxa, send names with exactly one match to
+    # `corrections` as [name, matches[0]] instead of to `suggestions`.
+    suggestions = [[name] + matches for name, matches in spellings.items()]
+    # (label, second column title, rows, log message) for each output file
+    outputs = (
+        (
+            "imported",
+            "corrected",
+            corrections,
+            "Writing %d spelling corrections to %s file '%s'",
+        ),
+        (
+            "exceptions",
+            "suggested",
+            suggestions,
+            "Writing %d spelling suggestions to %s file '%s'",
+        ),
+    )
+    for label, column, rows, message in outputs:
+        if not rows:
+            continue
+        # an explicit exception path overrides the per-label default directory
+        if exception_key in opts and opts[exception_key]:
+            outdir = opts[exception_key]
+        else:
+            outdir = "%s/%s" % (opts[base_dir_key], label)
+        os.makedirs(outdir, exist_ok=True)
+        outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % stem)
+        LOGGER.info(message, len(rows), label, outfile)
+        tofile.write_file(outfile, [["input", column]] + rows)
diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index 5db8e5c9..32ccea37 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -9,7 +9,8 @@
                      [--es-host URL...] [--assembly-dir PATH]
                      [--assembly-repo URL] [--assembly-exception PATH]
                      [--taxon-dir PATH] [--taxon-repo URL] [--taxon-exception PATH]
-                     [--taxon-lookup STRING] [--file PATH...] [file-dir PATH...]
+                     [--taxon-lookup STRING] [--taxon-spellcheck]
+                     [--file PATH...] [file-dir PATH...]
                      [--remote-file URL...] [--remote-file-dir URL...]
                      [--taxon-id STRING] [--assembly-id STRING] [--analysis-id STRING]
                      [--file-title STRING] [--file-description STRING] [--file-metadata PATH]
@@ -26,7 +27,8 @@
     --assembly-repo URL        Remote git repository containing assembly-level data.
                                Optionally include `~branch-name` suffix.
     --assembly-exception PATH  Path to directory to write assembly data that failed to import.
-    --taxon-lookup STRING      Taxon name class to lookup (scientific|all). [Default: scientific]
+    --taxon-lookup STRING      Taxon name class to lookup (scientific|any). [Default: scientific]
+    --taxon-spellcheck         Flag to use fuzzy matching to match taxon names.
     --taxon-dir PATH           Path to directory containing taxon-level data.
     --taxon-repo URL           Remote git repository containing taxon-level data.
                                Optionally include `~branch-name` suffix.
@@ -72,6 +74,7 @@
 from .hub import set_column_indices
 from .hub import validate_types_file
 from .hub import write_imported_rows
+from .hub import write_spellchecked_taxa
 from .taxon import add_names_and_attributes_to_taxa
 from .taxon import fix_missing_ids
 from .version import __version__
@@ -162,6 +165,7 @@ def index_file(es, types, data, opts):
                 else:
                     failed_rows["None"].append(row)
         LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys()))
+        spellings = {}
         create_ids, without_ids = fix_missing_ids(
             es,
             opts,
@@ -173,7 +177,9 @@ def index_file(es, types, data, opts):
             with_ids=with_ids,
             blanks=blanks,
             header=header,
+            spellings=spellings,
         )
+        write_spellchecked_taxa(spellings, opts, types=types, header=header)
         if with_ids or create_ids:
             write_imported_rows(
                 imported_rows, opts, types=types, header=header, label="imported"
diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index 4302c7c7..372eadfe 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -76,11 +76,13 @@ def lookup_taxa_by_taxon_id(es, values, template, *, return_type="list"):
 
 
 def lookup_missing_taxon_ids(
-    es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"])
+    es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]), spellings=None
 ):
     """Lookup taxon ID based on available taxonomic information."""
     if with_ids is None:
         with_ids = {}
+    if spellings is None:
+        spellings = {}
     # TODO: set this list from types file
     ranks = [
         "subspecies",
@@ -103,7 +105,7 @@ def lookup_missing_taxon_ids(
                 if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
                     continue
                 taxon_ids, name_class = lookup_taxon(
-                    es, obj["taxonomy"][rank], opts, rank=rank
+                    es, obj["taxonomy"][rank], opts, rank=rank, spellings=spellings
                 )
                 if index == 1 and not taxon_ids:
                     break
@@ -180,15 +182,18 @@ def fix_missing_ids(
     with_ids=None,
     blanks=set(["NA", "None"]),
     header=None,
+    spellings=None,
 ):
     """Find or create taxon IDs for rows without."""
     if with_ids is None:
         with_ids = {}
+    if spellings is None:
+        spellings = {}
     if without_ids:
         # TODO: support multiple taxonomies
         LOGGER.info("Looking up %d missing taxon IDs", len(without_ids.keys()))
         with_ids, without_ids, found_ids = lookup_missing_taxon_ids(
-            es, without_ids, opts, with_ids=with_ids, blanks=blanks
+            es, without_ids, opts, with_ids=with_ids, blanks=blanks, spellings=spellings
         )
         # create new taxon IDs
         if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]:
@@ -202,6 +207,7 @@ def fix_missing_ids(
                 data=without_ids,
                 blanks=blanks,
                 taxon_template=taxon_template,
+                spellings=spellings,
             )
             for created_id in created_ids:
                 if created_id in without_ids:
@@ -426,19 +432,53 @@ def lookup_taxon_within_lineage(
     return []
 
 
-def lookup_taxon(
-    es, name, opts, *, rank=None, name_class="scientific", return_type="taxon_id"
-):
-    """Lookup taxon ID."""
-    taxa = []
-    template = index_template(opts["taxonomy-source"][0], opts)
-    body = {
-        "id": "taxon_by_name",
-        "params": {"taxon": name, "rank": rank},
+def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, return_type):
+    """Suggest corrected spellings for a taxon name using fuzzy matching.
+
+    Returns the collated suggestions (narrowed to one scientific name when
+    lookups of those suggestions find exactly one distinct taxon), or None if
+    the response has no usable suggestion entry. `return_type` is unused.
+    """
+    taxon_suggest = {
+        "id": "taxon_suggest",
+        "params": {"searchTerm": name, "max_errors": 3},
     }
-    if name_class == "any":
-        body.update({"id": "taxon_by_any_name"})
-    index = template["index_name"]
+    with tolog.DisableLogger():
+        suggestions = es.search_template(
+            body=taxon_suggest, index=index, rest_total_hits_as_int=True
+        )
+        try:
+            options = suggestions["suggest"]["simple_phrase"][0]["options"]
+            matches = [o["text"] for o in options if o.get("collate_match", False)]
+        except (KeyError, IndexError, ValueError):
+            # an empty "simple_phrase" list raises IndexError, not KeyError
+            return None
+    # several suggestions may be alternative names for one and the same taxon
+    if matches and len(matches) > 1:
+        taxon_matches = {}
+        scientific_name = None
+        for match in matches:
+            body = {
+                "id": "taxon_by_any_name",
+                "params": {"taxon": match, "rank": rank},
+            }
+            taxa = taxon_lookup(
+                es, body, index, taxonomy_index_template, opts, return_type="taxon"
+            )
+            if len(taxa) > 1:
+                return matches
+            for taxon in taxa:
+                source = taxon["_source"]
+                taxon_matches[source["taxon_id"]] = source["scientific_name"]
+                scientific_name = source["scientific_name"]
+        if len(taxon_matches) == 1:
+            return [scientific_name]
+    return matches
+
+
+def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type):
+    """Query elasticsearch for a taxon."""
+    taxa = []
     with tolog.DisableLogger():
         res = es.search_template(body=body, index=index, rest_total_hits_as_int=True)
     if "hits" in res and res["hits"]["total"] > 0:
@@ -458,9 +498,71 @@ def lookup_taxon(
                 taxa = [hit["_source"]["taxon_id"] for hit in res["hits"]["hits"]]
             else:
                 taxa = [hit for hit in res["hits"]["hits"]]
-    if not taxa and opts["taxon-lookup"] == "any" and name_class != "any":
+    return taxa
+
+
+def lookup_taxon(
+    es,
+    name,
+    opts,
+    *,
+    rank=None,
+    name_class="scientific",
+    return_type="taxon_id",
+    spellings=None,
+):
+    """Look up a taxon by name, with any-name and spellcheck fallbacks."""
+    if spellings is None:
+        spellings = {}
+    template = index_template(opts["taxonomy-source"][0], opts)
+    index = template["index_name"]
+    body = {
+        "id": "taxon_by_name",
+        "params": {"taxon": name, "rank": rank},
+    }
+    if name_class in {"any", "spellcheck"}:
+        body.update({"id": "taxon_by_any_name"})
+    if name_class == "spellcheck":
+        matches = spellcheck_taxon(
+            es, name, index, rank, taxonomy_index_template, opts, return_type
+        )
+        if matches:
+            spellings.update({name: matches})
+        return [], name_class
+        # Uncomment code below to use suggestion in current import
+        # if matches and len(matches) == 1:
+        #     body["params"].update({"taxon": matches[0]})
+        # else:
+        #     return [], name_class
+    taxa = taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type)
+    if (
+        not taxa
+        and opts["taxon-lookup"] == "any"
+        and name_class not in {"any", "spellcheck"}
+    ):
         taxa, name_class = lookup_taxon(
-            es, name, opts, rank=rank, name_class="any", return_type=return_type
+            es,
+            name,
+            opts,
+            rank=rank,
+            name_class="any",
+            return_type=return_type,
+            spellings=spellings,
+        )
+    if (
+        not taxa
+        and "taxon-spellcheck" in opts
+        and opts["taxon-spellcheck"]
+        and name_class != "spellcheck"
+    ):
+        taxa, name_class = lookup_taxon(
+            es,
+            name,
+            opts,
+            rank=rank,
+            name_class="spellcheck",
+            return_type=return_type,
+            spellings=spellings,
         )
     return taxa, name_class
 
@@ -533,8 +635,8 @@ def add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon, *, blanks={"NA", "
     return new_taxon
 
 
-def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"])):
-    """Create new taxa using alternate taxon IDs."""
+def set_ranks(taxonomy):
+    """Set ranks for species/subspecies creation."""
     default_ranks = [
         "genus",
         "family",
@@ -543,6 +645,20 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"
         "subphylum",
         "phylum",
     ]
+    if "subspecies" in taxonomy:
+        ranks = ["species"] + default_ranks
+    else:
+        ranks = default_ranks
+    return ranks
+
+
+def create_taxa(
+    es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"]), spellings=None
+):
+    """Create new taxa using alternate taxon IDs."""
+    if spellings is None:
+        spellings = {}
+
     ancestors = {}
     matches = defaultdict(dict)
     pbar = tqdm(total=len(data.keys()))
@@ -556,15 +672,14 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"
         lineage = []
         closest_rank = None
         closest_taxon = None
-        if "subspecies" in obj["taxonomy"]:
-            ranks = ["species"] + default_ranks
-        else:
-            ranks = default_ranks
+        ranks = set_ranks(obj["taxonomy"])
         max_index = len(ranks) - 1
         # max_rank = ranks[max_index]
         for index, rank in enumerate(ranks[: (max_index - 1)]):
             if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
                 continue
+            if obj["taxonomy"][rank] in spellings:
+                break
             intermediates = 0
             for anc_rank in ranks[(index + 1) :]:
                 if (