Skip to content

Commit

Permalink
Check spelling when indexing
Browse files Browse the repository at this point in the history
Fixes #58
  • Loading branch information
rjchallis committed Mar 22, 2021
1 parent d2d91af commit 2025061
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 28 deletions.
2 changes: 0 additions & 2 deletions src/genomehubs/lib/fill.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,6 @@ def copy_attribute_summary(source, meta):
try:
dest["%s_value" % meta["type"]] = source["%s_value" % meta["type"]]
except KeyError as err:
print(source)
print(meta)
raise (err)
dest["count"] = source["count"]
dest["key"] = source["key"]
Expand Down
49 changes: 48 additions & 1 deletion src/genomehubs/lib/hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,53 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"):
for row in rows:
data.append(row)
LOGGER.info(
"Writing %d records to %s file '%s", len(data) - header_len, label, outfile
"Writing %d records to %s file '%s'", len(data) - header_len, label, outfile
)
tofile.write_file(outfile, data)


def _write_spellcheck_file(rows, outdir, file_basename, *, label, kind, column):
    """Write one spellcheck TSV (header row followed by ``rows``) into ``outdir``."""
    os.makedirs(outdir, exist_ok=True)
    outfile = "%s/%s.spellcheck.tsv" % (outdir, file_basename)
    LOGGER.info(
        "Writing %d spelling %s to %s file '%s'", len(rows), kind, label, outfile
    )
    tofile.write_file(outfile, [["input", column]] + rows)


def write_spellchecked_taxa(spellings, opts, *, types, header=None):
    """Write spellchecked taxa to file.

    Args:
        spellings (dict): Maps each input taxon name to the list of names
            suggested for it.
        opts (dict): Options; reads ``opts["index"]`` and the derived
            ``"<index>-exception"`` / ``"<index>-dir"`` keys.
        types (dict): Types definition; ``types["file"]["name"]`` supplies
            the basename of the output file.
        header: Accepted for signature parity with the other writers in
            this module; not used here.

    Nothing is written when there are no rows for a given label.
    """
    file_key = "%s-exception" % opts["index"]
    dir_key = "%s-dir" % opts["index"]
    filepath = Path(types["file"]["name"])
    extensions = "".join(filepath.suffixes)
    file_basename = str(filepath)
    if extensions:
        # Strip only the trailing suffixes; str.replace() would also remove
        # the same text wherever else it occurs in the path.
        file_basename = file_basename[: -len(extensions)]
    imported = []
    # All spellings are currently reported as exceptions. To import
    # unambiguous corrections instead, route entries with exactly one match
    # to ``imported`` as ``[name, matches[0]]``.
    exceptions = [[name] + matches for name, matches in spellings.items()]
    if imported:
        # Corrections always go to the "imported" directory: honouring the
        # exception override here would make both outputs share one path and
        # the exceptions file would overwrite the corrections file.
        _write_spellcheck_file(
            imported,
            "%s/%s" % (opts[dir_key], "imported"),
            file_basename,
            label="imported",
            kind="corrections",
            column="corrected",
        )
    if exceptions:
        outdir = opts.get(file_key) or "%s/%s" % (opts[dir_key], "exceptions")
        _write_spellcheck_file(
            exceptions,
            outdir,
            file_basename,
            label="exceptions",
            kind="suggestions",
            column="suggested",
        )
10 changes: 8 additions & 2 deletions src/genomehubs/lib/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
[--es-host URL...] [--assembly-dir PATH]
[--assembly-repo URL] [--assembly-exception PATH]
[--taxon-dir PATH] [--taxon-repo URL] [--taxon-exception PATH]
[--taxon-lookup STRING] [--file PATH...] [file-dir PATH...]
[--taxon-lookup STRING] [--taxon-spellcheck]
[--file PATH...] [file-dir PATH...]
[--remote-file URL...] [--remote-file-dir URL...]
[--taxon-id STRING] [--assembly-id STRING] [--analysis-id STRING]
[--file-title STRING] [--file-description STRING] [--file-metadata PATH]
Expand All @@ -26,7 +27,8 @@
--assembly-repo URL Remote git repository containing assembly-level data.
Optionally include `~branch-name` suffix.
--assembly-exception PATH Path to directory to write assembly data that failed to import.
--taxon-lookup STRING Taxon name class to lookup (scientific|all). [Default: scientific]
--taxon-lookup STRING Taxon name class to lookup (scientific|any). [Default: scientific]
--taxon-spellcheck Flag to use fuzzy matching to match taxon names.
--taxon-dir PATH Path to directory containing taxon-level data.
--taxon-repo URL Remote git repository containing taxon-level data.
Optionally include `~branch-name` suffix.
Expand Down Expand Up @@ -72,6 +74,7 @@
from .hub import set_column_indices
from .hub import validate_types_file
from .hub import write_imported_rows
from .hub import write_spellchecked_taxa
from .taxon import add_names_and_attributes_to_taxa
from .taxon import fix_missing_ids
from .version import __version__
Expand Down Expand Up @@ -162,6 +165,7 @@ def index_file(es, types, data, opts):
else:
failed_rows["None"].append(row)
LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys()))
spellings = {}
create_ids, without_ids = fix_missing_ids(
es,
opts,
Expand All @@ -173,7 +177,9 @@ def index_file(es, types, data, opts):
with_ids=with_ids,
blanks=blanks,
header=header,
spellings=spellings,
)
write_spellchecked_taxa(spellings, opts, types=types, header=header)
if with_ids or create_ids:
write_imported_rows(
imported_rows, opts, types=types, header=header, label="imported"
Expand Down
161 changes: 138 additions & 23 deletions src/genomehubs/lib/taxon.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,13 @@ def lookup_taxa_by_taxon_id(es, values, template, *, return_type="list"):


def lookup_missing_taxon_ids(
es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"])
es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]), spellings=None
):
"""Lookup taxon ID based on available taxonomic information."""
if with_ids is None:
with_ids = {}
if spellings is None:
spellings = {}
# TODO: set this list from types file
ranks = [
"subspecies",
Expand All @@ -103,7 +105,7 @@ def lookup_missing_taxon_ids(
if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
continue
taxon_ids, name_class = lookup_taxon(
es, obj["taxonomy"][rank], opts, rank=rank
es, obj["taxonomy"][rank], opts, rank=rank, spellings=spellings
)
if index == 1 and not taxon_ids:
break
Expand Down Expand Up @@ -180,15 +182,18 @@ def fix_missing_ids(
with_ids=None,
blanks=set(["NA", "None"]),
header=None,
spellings=None,
):
"""Find or create taxon IDs for rows without."""
if with_ids is None:
with_ids = {}
if spellings is None:
spellings = {}
if without_ids:
# TODO: support multiple taxonomies
LOGGER.info("Looking up %d missing taxon IDs", len(without_ids.keys()))
with_ids, without_ids, found_ids = lookup_missing_taxon_ids(
es, without_ids, opts, with_ids=with_ids, blanks=blanks
es, without_ids, opts, with_ids=with_ids, blanks=blanks, spellings=spellings
)
# create new taxon IDs
if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]:
Expand All @@ -202,6 +207,7 @@ def fix_missing_ids(
data=without_ids,
blanks=blanks,
taxon_template=taxon_template,
spellings=spellings,
)
for created_id in created_ids:
if created_id in without_ids:
Expand Down Expand Up @@ -426,19 +432,53 @@ def lookup_taxon_within_lineage(
return []


def lookup_taxon(
es, name, opts, *, rank=None, name_class="scientific", return_type="taxon_id"
):
"""Lookup taxon ID."""
taxa = []
template = index_template(opts["taxonomy-source"][0], opts)
body = {
"id": "taxon_by_name",
"params": {"taxon": name, "rank": rank},
def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, return_type):
    """Look up taxon name with fuzzy matching.

    Runs the ``taxon_suggest`` search template for ``name`` and keeps the
    suggested texts whose option has a truthy ``collate_match``. When more
    than one suggestion survives, each is looked up with the
    ``taxon_by_any_name`` template to see whether they all resolve to the
    same taxon.

    Args:
        es: Client object providing ``search_template``.
        name (str): Taxon name to check.
        index (str): Index to search.
        rank: Rank passed through to the ``taxon_by_any_name`` lookup.
        taxonomy_index_template: Passed through to ``taxon_lookup``.
        opts (dict): Passed through to ``taxon_lookup``.
        return_type: Not used; lookups here always request ``"taxon"``.

    Returns:
        ``None`` if the response does not have the expected
        ``suggest -> simple_phrase -> [0] -> options`` structure;
        a one-item list holding the scientific name if several suggestions
        all resolve to a single taxon ID; otherwise the list of suggested
        names (possibly empty).
    """
    taxon_suggest = {
        "id": "taxon_suggest",
        "params": {"searchTerm": name, "max_errors": 3},
    }
    with tolog.DisableLogger():
        suggestions = es.search_template(
            body=taxon_suggest, index=index, rest_total_hits_as_int=True
        )
    try:
        options = suggestions["suggest"]["simple_phrase"][0]["options"]
        matches = [
            option["text"]
            for option in options
            if option.get("collate_match", False)
        ]
    except (KeyError, IndexError, ValueError):
        # IndexError covers an empty "simple_phrase" list, which the [0]
        # index would otherwise let escape as an unhandled exception.
        return None
    if len(matches) <= 1:
        # Nothing to disambiguate.
        return matches
    taxon_matches = {}
    for match in matches:
        body = {
            "id": "taxon_by_any_name",
            "params": {"taxon": match, "rank": rank},
        }
        taxa = taxon_lookup(
            es, body, index, taxonomy_index_template, opts, return_type="taxon"
        )
        if len(taxa) > 1:
            # A single suggestion hitting several taxa is ambiguous; report
            # all suggestions rather than picking one.
            return matches
        for taxon in taxa:
            source = taxon["_source"]
            taxon_matches[source["taxon_id"]] = source["scientific_name"]
    if len(taxon_matches) == 1:
        # Every suggestion resolved to the same taxon ID.
        return list(taxon_matches.values())
    return matches


def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type):
"""Query elasticsearch for a taxon."""
taxa = []
with tolog.DisableLogger():
res = es.search_template(body=body, index=index, rest_total_hits_as_int=True)
if "hits" in res and res["hits"]["total"] > 0:
Expand All @@ -458,9 +498,71 @@ def lookup_taxon(
taxa = [hit["_source"]["taxon_id"] for hit in res["hits"]["hits"]]
else:
taxa = [hit for hit in res["hits"]["hits"]]
if not taxa and opts["taxon-lookup"] == "any" and name_class != "any":
return taxa


def lookup_taxon(
es,
name,
opts,
*,
rank=None,
name_class="scientific",
return_type="taxon_id",
spellings=None,
):
"""Lookup taxon ID."""
if spellings is None:
spellings = {}
template = index_template(opts["taxonomy-source"][0], opts)
index = template["index_name"]
body = {
"id": "taxon_by_name",
"params": {"taxon": name, "rank": rank},
}
if name_class in {"any", "spellcheck"}:
body.update({"id": "taxon_by_any_name"})
if name_class == "spellcheck":
matches = spellcheck_taxon(
es, name, index, rank, taxonomy_index_template, opts, return_type
)
if matches:
spellings.update({name: matches})
return [], name_class
# Uncomment code below to use suggestion in current import
# if matches and len(matches) == 1:
# body["params"].update({"taxon": matches[0]})
# else:
# return [], name_class
taxa = taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type)
if (
not taxa
and opts["taxon-lookup"] == "any"
and name_class not in {"any", "spellcheck"}
):
taxa, name_class = lookup_taxon(
es, name, opts, rank=rank, name_class="any", return_type=return_type
es,
name,
opts,
rank=rank,
name_class="any",
return_type=return_type,
spellings=spellings,
)
if (
not taxa
and "taxon-spellcheck" in opts
and opts["taxon-spellcheck"]
and name_class != "spellcheck"
):
taxa, name_class = lookup_taxon(
es,
name,
opts,
rank=rank,
name_class="spellcheck",
return_type=return_type,
spellings=spellings,
)
return taxa, name_class

Expand Down Expand Up @@ -533,8 +635,8 @@ def add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon, *, blanks={"NA", "
return new_taxon


def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"])):
"""Create new taxa using alternate taxon IDs."""
def set_ranks(taxonomy):
"""Set ranks for species/subspecies creation."""
default_ranks = [
"genus",
"family",
Expand All @@ -543,6 +645,20 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"
"subphylum",
"phylum",
]
if "subspecies" in taxonomy:
ranks = ["species"] + default_ranks
else:
ranks = default_ranks
return ranks


def create_taxa(
es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"]), spellings=None
):
"""Create new taxa using alternate taxon IDs."""
if spellings is None:
spellings = {}

ancestors = {}
matches = defaultdict(dict)
pbar = tqdm(total=len(data.keys()))
Expand All @@ -556,15 +672,14 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"
lineage = []
closest_rank = None
closest_taxon = None
if "subspecies" in obj["taxonomy"]:
ranks = ["species"] + default_ranks
else:
ranks = default_ranks
ranks = set_ranks(obj["taxonomy"])
max_index = len(ranks) - 1
# max_rank = ranks[max_index]
for index, rank in enumerate(ranks[: (max_index - 1)]):
if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks:
continue
if obj["taxonomy"][rank] in spellings:
break
intermediates = 0
for anc_rank in ranks[(index + 1) :]:
if (
Expand Down

0 comments on commit 2025061

Please sign in to comment.