From 2025061d04d03e981bf416afd283d74563cf262b Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 22 Mar 2021 14:01:04 +0000 Subject: [PATCH] Check spelling when indexing Fixes #58 --- src/genomehubs/lib/fill.py | 2 - src/genomehubs/lib/hub.py | 49 ++++++++++- src/genomehubs/lib/index.py | 10 ++- src/genomehubs/lib/taxon.py | 161 ++++++++++++++++++++++++++++++------ 4 files changed, 194 insertions(+), 28 deletions(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index c9f9d416..01ce066d 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -430,8 +430,6 @@ def copy_attribute_summary(source, meta): try: dest["%s_value" % meta["type"]] = source["%s_value" % meta["type"]] except KeyError as err: - print(source) - print(meta) raise (err) dest["count"] = source["count"] dest["key"] = source["key"] diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index 64831ed7..22cff7b2 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -562,6 +562,53 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"): for row in rows: data.append(row) LOGGER.info( - "Writing %d records to %s file '%s", len(data) - header_len, label, outfile + "Writing %d records to %s file '%s'", len(data) - header_len, label, outfile ) tofile.write_file(outfile, data) + + +def write_spellchecked_taxa(spellings, opts, *, types, header=None): + """Write spellchecked taxa to file.""" + imported = [] + exceptions = [] + file_key = "%s-exception" % opts["index"] + dir_key = "%s-dir" % opts["index"] + filepath = Path(types["file"]["name"]) + extensions = "".join(filepath.suffixes) + file_basename = str(filepath).replace(extensions, "") + for name, matches in spellings.items(): + # enable test condition below if importing spellchecked taxa: + # if len(matches) == 1: + # imported.append([name, matches[0]]) + # else: + exceptions.append([name] + matches) + if imported: + label = "imported" + if file_key in 
opts and opts[file_key]: + outdir = opts[file_key] + else: + outdir = "%s/%s" % (opts[dir_key], label) + os.makedirs(outdir, exist_ok=True) + outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename) + LOGGER.info( + "Writing %d spelling corrections to %s file '%s'", + len(imported), + label, + outfile, + ) + tofile.write_file(outfile, [["input", "corrected"]] + imported) + if exceptions: + label = "exceptions" + if file_key in opts and opts[file_key]: + outdir = opts[file_key] + else: + outdir = "%s/%s" % (opts[dir_key], label) + os.makedirs(outdir, exist_ok=True) + outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename) + LOGGER.info( + "Writing %d spelling suggestions to %s file '%s'", + len(exceptions), + label, + outfile, + ) + tofile.write_file(outfile, [["input", "suggested"]] + exceptions) diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py index 5db8e5c9..32ccea37 100644 --- a/src/genomehubs/lib/index.py +++ b/src/genomehubs/lib/index.py @@ -9,7 +9,8 @@ [--es-host URL...] [--assembly-dir PATH] [--assembly-repo URL] [--assembly-exception PATH] [--taxon-dir PATH] [--taxon-repo URL] [--taxon-exception PATH] - [--taxon-lookup STRING] [--file PATH...] [file-dir PATH...] + [--taxon-lookup STRING] [--taxon-spellcheck] + [--file PATH...] [file-dir PATH...] [--remote-file URL...] [--remote-file-dir URL...] [--taxon-id STRING] [--assembly-id STRING] [--analysis-id STRING] [--file-title STRING] [--file-description STRING] [--file-metadata PATH] @@ -26,7 +27,8 @@ --assembly-repo URL Remote git repository containing assembly-level data. Optionally include `~branch-name` suffix. --assembly-exception PATH Path to directory to write assembly data that failed to import. - --taxon-lookup STRING Taxon name class to lookup (scientific|all). [Default: scientific] + --taxon-lookup STRING Taxon name class to lookup (scientific|any). [Default: scientific] + --taxon-spellcheck Flag to use fuzzy matching to match taxon names. 
--taxon-dir PATH Path to directory containing taxon-level data. --taxon-repo URL Remote git repository containing taxon-level data. Optionally include `~branch-name` suffix. @@ -72,6 +74,7 @@ from .hub import set_column_indices from .hub import validate_types_file from .hub import write_imported_rows +from .hub import write_spellchecked_taxa from .taxon import add_names_and_attributes_to_taxa from .taxon import fix_missing_ids from .version import __version__ @@ -162,6 +165,7 @@ def index_file(es, types, data, opts): else: failed_rows["None"].append(row) LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys())) + spellings = {} create_ids, without_ids = fix_missing_ids( es, opts, @@ -173,7 +177,9 @@ def index_file(es, types, data, opts): with_ids=with_ids, blanks=blanks, header=header, + spellings=spellings, ) + write_spellchecked_taxa(spellings, opts, types=types, header=header) if with_ids or create_ids: write_imported_rows( imported_rows, opts, types=types, header=header, label="imported" diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index 4302c7c7..372eadfe 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -76,11 +76,13 @@ def lookup_taxa_by_taxon_id(es, values, template, *, return_type="list"): def lookup_missing_taxon_ids( - es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]) + es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]), spellings=None ): """Lookup taxon ID based on available taxonomic information.""" if with_ids is None: with_ids = {} + if spellings is None: + spellings = {} # TODO: set this list from types file ranks = [ "subspecies", @@ -103,7 +105,7 @@ def lookup_missing_taxon_ids( if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks: continue taxon_ids, name_class = lookup_taxon( - es, obj["taxonomy"][rank], opts, rank=rank + es, obj["taxonomy"][rank], opts, rank=rank, spellings=spellings ) if index == 1 and not taxon_ids: break @@ -180,15 
+182,18 @@ def fix_missing_ids( with_ids=None, blanks=set(["NA", "None"]), header=None, + spellings=None, ): """Find or create taxon IDs for rows without.""" if with_ids is None: with_ids = {} + if spellings is None: + spellings = {} if without_ids: # TODO: support multiple taxonomies LOGGER.info("Looking up %d missing taxon IDs", len(without_ids.keys())) with_ids, without_ids, found_ids = lookup_missing_taxon_ids( - es, without_ids, opts, with_ids=with_ids, blanks=blanks + es, without_ids, opts, with_ids=with_ids, blanks=blanks, spellings=spellings ) # create new taxon IDs if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]: @@ -202,6 +207,7 @@ def fix_missing_ids( data=without_ids, blanks=blanks, taxon_template=taxon_template, + spellings=spellings, ) for created_id in created_ids: if created_id in without_ids: @@ -426,19 +432,53 @@ def lookup_taxon_within_lineage( return [] -def lookup_taxon( - es, name, opts, *, rank=None, name_class="scientific", return_type="taxon_id" -): - """Lookup taxon ID.""" - taxa = [] - template = index_template(opts["taxonomy-source"][0], opts) - body = { - "id": "taxon_by_name", - "params": {"taxon": name, "rank": rank}, +def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, return_type): + """Look up taxon name with fuzzy matching.""" + taxon_suggest = { + "id": "taxon_suggest", + "params": {"searchTerm": name, "max_errors": 3}, } - if name_class == "any": - body.update({"id": "taxon_by_any_name"}) - index = template["index_name"] + matches = None + with tolog.DisableLogger(): + suggestions = es.search_template( + body=taxon_suggest, index=index, rest_total_hits_as_int=True + ) + try: + options = suggestions["suggest"]["simple_phrase"][0]["options"] + matches = [ + option["text"] + for option in options + if option.get("collate_match", False) + ] + except KeyError: + return None + except ValueError: + return None + if matches and len(matches) > 1: + taxon_matches = {} + scientific_name = None + for 
match in matches: + body = { + "id": "taxon_by_any_name", + "params": {"taxon": match, "rank": rank}, + } + taxa = taxon_lookup( + es, body, index, taxonomy_index_template, opts, return_type="taxon" + ) + if len(taxa) > 1: + return matches + for taxon in taxa: + source = taxon["_source"] + taxon_matches[source["taxon_id"]] = source["scientific_name"] + scientific_name = source["scientific_name"] + if len(taxon_matches.keys()) == 1: + return [scientific_name] + return matches + + +def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type): + """Query elasticsearch for a taxon.""" + taxa = [] with tolog.DisableLogger(): res = es.search_template(body=body, index=index, rest_total_hits_as_int=True) if "hits" in res and res["hits"]["total"] > 0: @@ -458,9 +498,71 @@ def lookup_taxon( taxa = [hit["_source"]["taxon_id"] for hit in res["hits"]["hits"]] else: taxa = [hit for hit in res["hits"]["hits"]] - if not taxa and opts["taxon-lookup"] == "any" and name_class != "any": + return taxa + + +def lookup_taxon( + es, + name, + opts, + *, + rank=None, + name_class="scientific", + return_type="taxon_id", + spellings=None, +): + """Lookup taxon ID.""" + if spellings is None: + spellings = {} + template = index_template(opts["taxonomy-source"][0], opts) + index = template["index_name"] + body = { + "id": "taxon_by_name", + "params": {"taxon": name, "rank": rank}, + } + if name_class in {"any", "spellcheck"}: + body.update({"id": "taxon_by_any_name"}) + if name_class == "spellcheck": + matches = spellcheck_taxon( + es, name, index, rank, taxonomy_index_template, opts, return_type + ) + if matches: + spellings.update({name: matches}) + return [], name_class + # Uncomment code below to use suggestion in current import + # if matches and len(matches) == 1: + # body["params"].update({"taxon": matches[0]}) + # else: + # return [], name_class + taxa = taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type) + if ( + not taxa + and opts["taxon-lookup"] 
== "any" + and name_class not in {"any", "spellcheck"} + ): taxa, name_class = lookup_taxon( - es, name, opts, rank=rank, name_class="any", return_type=return_type + es, + name, + opts, + rank=rank, + name_class="any", + return_type=return_type, + spellings=spellings, + ) + if ( + not taxa + and "taxon-spellcheck" in opts + and opts["taxon-spellcheck"] + and name_class != "spellcheck" + ): + taxa, name_class = lookup_taxon( + es, + name, + opts, + rank=rank, + name_class="spellcheck", + return_type=return_type, + spellings=spellings, ) return taxa, name_class @@ -533,8 +635,8 @@ def add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon, *, blanks={"NA", " return new_taxon -def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"])): - """Create new taxa using alternate taxon IDs.""" +def set_ranks(taxonomy): + """Set ranks for species/subspecies creation.""" default_ranks = [ "genus", "family", @@ -543,6 +645,20 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None" "subphylum", "phylum", ] + if "subspecies" in taxonomy: + ranks = ["species"] + default_ranks + else: + ranks = default_ranks + return ranks + + +def create_taxa( + es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"]), spellings=None +): + """Create new taxa using alternate taxon IDs.""" + if spellings is None: + spellings = {} + ancestors = {} matches = defaultdict(dict) pbar = tqdm(total=len(data.keys())) @@ -556,15 +672,14 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None" lineage = [] closest_rank = None closest_taxon = None - if "subspecies" in obj["taxonomy"]: - ranks = ["species"] + default_ranks - else: - ranks = default_ranks + ranks = set_ranks(obj["taxonomy"]) max_index = len(ranks) - 1 # max_rank = ranks[max_index] for index, rank in enumerate(ranks[: (max_index - 1)]): if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks: continue + if obj["taxonomy"][rank] in spellings: + 
break intermediates = 0 for anc_rank in ranks[(index + 1) :]: if (