From 4576702bf101a88d78a2f67cfcaf79dfd12935fb Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 3 Mar 2021 09:13:47 +0000 Subject: [PATCH 01/21] Write successful imports to file Fixes #45 --- src/genomehubs/lib/hub.py | 28 ++++++++++++++++++++++++++++ src/genomehubs/lib/index.py | 13 +++++++++++-- src/genomehubs/lib/taxon.py | 24 +++++------------------- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index 6da6d4d4..e3f1315a 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -503,3 +503,31 @@ def set_column_indices(types, header): index = headers.get(value["header"], None) if index is not None: value.update({"index": index}) + + +def write_imported_rows(rows, opts, *, types, header=None, label="imported"): + """Write imported rows to processed file.""" + file_key = "%s-exception" % opts["index"] + dir_key = "%s-dir" % opts["index"] + if file_key in opts and opts[file_key]: + outdir = opts[file_key] + else: + outdir = "%s/%s" % (opts[dir_key], label) + os.makedirs(outdir, exist_ok=True) + outfile = "%s/%s" % (outdir, types["file"]["name"]) + data = [] + header_len = 0 + if header is not None: + data.append(header) + header_len = 1 + if isinstance(rows, dict): + for row_set in rows.values(): + for row in row_set: + data.append(row) + else: + for row in rows: + data.append(row) + LOGGER.info( + "Writing %d records to %s file '%s", len(data) - header_len, label, outfile + ) + tofile.write_file(outfile, data) diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py index 5a1c0445..344c5e0b 100644 --- a/src/genomehubs/lib/index.py +++ b/src/genomehubs/lib/index.py @@ -71,6 +71,7 @@ from .hub import process_row from .hub import set_column_indices from .hub import validate_types_file +from .hub import write_imported_rows from .taxon import add_names_and_attributes_to_taxa from .taxon import fix_missing_ids from .version import __version__ @@ -84,14 +85,16 @@ def index_file(es, types, data, opts): rows = csv.reader( data, delimiter=delimiters[types["file"]["format"]], quotechar='"' ) - header = None - if types["file"].get("header", False): + if "header" in types["file"] and types["file"]["header"]: header = next(rows) set_column_indices(types, header) + else: + header = None with_ids = defaultdict(list) taxon_asm_data = defaultdict(list) without_ids = defaultdict(list) failed_rows = defaultdict(list) + imported_rows = [] blanks = set(["", "NA", "N/A", "None"]) taxon_types = {} for taxonomy_name in opts["taxonomy-source"]: @@ -112,6 +115,7 @@ def index_file(es, types, data, opts): taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append( taxon_data ) + imported_rows.append(row) else: if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]: without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append( @@ -147,11 +151,15 @@ def index_file(es, types, data, opts): types=types, taxon_template=taxon_template, failed_rows=failed_rows, + imported_rows=imported_rows, with_ids=with_ids, blanks=blanks, header=header, ) if with_ids or create_ids: + write_imported_rows( + imported_rows, opts, types=types, header=header, label="imported" + ) LOGGER.info("Indexing %d entries", len(with_ids.keys())) if opts["index"] == "taxon": docs = add_names_and_attributes_to_taxa( @@ -164,6 +172,7 @@ def index_file(es, types, data, opts): _op_type="update", ) elif opts["index"] == "assembly": + # TODO: keep track of taxon_id not found exceptions assembly_template = assembly.index_template(taxonomy_name, opts) docs = add_identifiers_and_attributes_to_assemblies( es, diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index 19187de3..c60e7dd6 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -2,11 +2,9 @@ """Taxon methods.""" -import os import sys from collections import defaultdict -from tolkein import tofile from tolkein import tolog from tqdm import tqdm @@ -17,6 +15,7 @@ from .hub import add_attribute_values from .hub import chunks from .hub import index_templator +from .hub import write_imported_rows from .taxonomy import index_template as taxonomy_index_template LOGGER = tolog.logger(__name__) @@ -177,6 +176,7 @@ def fix_missing_ids( types, taxon_template, failed_rows, + imported_rows, with_ids=None, blanks=set(["NA", "None"]), header=None, @@ -211,29 +211,15 @@ def fix_missing_ids( if without_ids and failed_rows: for key, value in found_ids.items(): if key in failed_rows: + imported_rows += failed_rows[key] del failed_rows[key] if failed_rows: LOGGER.info( "Unable to associate %d records with taxon IDs", len(failed_rows) ) - data = [] - exception_key = "%s-exception" % opts["index"] - dir_key = "%s-dir" % opts["index"] - if exception_key in opts and opts[exception_key]: - outdir = opts[exception_key] - else: - outdir = "%s/exceptions" % opts[dir_key] - os.makedirs(outdir, exist_ok=True) - outfile = "%s/%s" % (outdir, types["file"]["name"]) - if header: - data.append(header) - for rows in failed_rows.values(): - for row in rows: - data.append(row) - LOGGER.info( - "Writing %d records to exceptions file '%s", len(data) - 1, outfile + write_imported_rows( + failed_rows, opts, types=types, header=header, label="exceptions" ) - tofile.write_file(outfile, data) return with_ids, without_ids From 06011697fbc165dff25cb125233b52d44a7bac15 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 3 Mar 2021 09:16:04 +0000 Subject: [PATCH 02/21] remove v from version number --- .bumpversion.cfg | 7 ++++--- README.rst | 4 ++-- conda-recipe/meta.yaml | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 4505724e..e23c4429 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -2,7 +2,8 @@ current_version = 2.0.5 commit = True tag = True -message = 'Bump version: {current_version} → {new_version}' +tag_name = {new_version} +message = "Bump version: {current_version} → {new_version}" [bumpversion:file:setup.py] search = version="{current_version}" @@ -13,8 +14,8 @@ search = version = "{current_version}" replace = version = "{new_version}" [bumpversion:file:README.rst] -search = v{current_version}. -replace = v{new_version}. +search = {current_version}. +replace = {new_version}. [bumpversion:file:docs/conf.py] search = version = release = "{current_version}" diff --git a/README.rst b/README.rst index e2e31325..6ad93b1b 100644 --- a/README.rst +++ b/README.rst @@ -42,9 +42,9 @@ GenomeHubs :alt: Conda platforms :target: https://anaconda.org/tolkit/genomehubs -.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/v2.0.5.svg +.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.5.svg :alt: Commits since latest release - :target: https://github.com/genomehubs/genomehubs/compare/v2.0.5...main + :target: https://github.com/genomehubs/genomehubs/compare/2.0.5...main .. |license| image:: https://anaconda.org/tolkit/genomehubs/badges/license.svg :alt: MIT License diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index c5427238..ca587a7a 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -6,7 +6,7 @@ package: version: {{ version }} source: - git_rev: v{{ version }} + git_rev: {{ version }} git_url: https://github.com/genomehubs/genomehubs.git build: From c6c9c04be1b2feeec94a0620397547478b8c76d5 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 3 Mar 2021 09:17:52 +0000 Subject: [PATCH 03/21] =?UTF-8?q?"Bump=20version:=202.0.5=20=E2=86=92=202.?= =?UTF-8?q?0.6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- README.rst | 4 ++-- conda-recipe/meta.yaml | 2 +- docs/conf.py | 2 +- scripts/conda_build.sh | 2 +- setup.py | 2 +- src/genomehubs/lib/version.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e23c4429..6b7aaee3 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.0.5 +current_version = 2.0.6 commit = True tag = True tag_name = {new_version} diff --git a/README.rst b/README.rst index 6ad93b1b..c8796d7e 100644 --- a/README.rst +++ b/README.rst @@ -42,9 +42,9 @@ GenomeHubs :alt: Conda platforms :target: https://anaconda.org/tolkit/genomehubs -.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.5.svg +.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.6.svg :alt: Commits since latest release - :target: https://github.com/genomehubs/genomehubs/compare/2.0.5...main + :target: https://github.com/genomehubs/genomehubs/compare/2.0.6...main .. |license| image:: https://anaconda.org/tolkit/genomehubs/badges/license.svg :alt: MIT License diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index ca587a7a..06d22bc5 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "genomehubs" %} -{% set version = "2.0.5" %} +{% set version = "2.0.6" %} package: name: {{ name }} diff --git a/docs/conf.py b/docs/conf.py index 014df512..b6dac8a1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,7 @@ version = release = get_distribution("genomehubs").version except Exception: traceback.print_exc() - version = release = "2.0.5" + version = release = "2.0.6" pygments_style = "trac" templates_path = ["."] diff --git a/scripts/conda_build.sh b/scripts/conda_build.sh index 59173100..79064de9 100755 --- a/scripts/conda_build.sh +++ b/scripts/conda_build.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -VERSION=2.0.5 +VERSION=2.0.6 case $(uname | tr '[:upper:]' '[:lower:]') in linux*) diff --git a/setup.py b/setup.py index f0952bee..ab311ee9 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def read(*names, **kwargs): setup( name="genomehubs", # Required - version="2.0.5", + version="2.0.6", description="GenomeHubs", # Optional long_description="%s\n%s" % ( diff --git a/src/genomehubs/lib/version.py b/src/genomehubs/lib/version.py index 5edc0d68..bab7ca25 100644 --- a/src/genomehubs/lib/version.py +++ b/src/genomehubs/lib/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python3 """genomehubs version.""" -__version__ = "2.0.5" +__version__ = "2.0.6" From 0bd67ac114a74452e408aa8b4332313c342e7a77 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 3 Mar 2021 09:20:16 +0000 Subject: [PATCH 04/21] =?UTF-8?q?"Bump=20version:=202.0.6=20=E2=86=92=202.?= =?UTF-8?q?0.7"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- README.rst | 4 ++-- conda-recipe/meta.yaml | 2 +- docs/conf.py | 2 +- scripts/conda_build.sh | 2 +- setup.py | 2 +- src/genomehubs/lib/version.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 6b7aaee3..fcd1fde8 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.0.6 +current_version = 2.0.7 commit = True tag = True tag_name = {new_version} diff --git a/README.rst b/README.rst index c8796d7e..e48cb478 100644 --- a/README.rst +++ b/README.rst @@ -42,9 +42,9 @@ GenomeHubs :alt: Conda platforms :target: https://anaconda.org/tolkit/genomehubs -.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.6.svg +.. |commits-since| image:: https://img.shields.io/github/commits-since/genomehubs/genomehubs/2.0.7.svg :alt: Commits since latest release - :target: https://github.com/genomehubs/genomehubs/compare/2.0.6...main + :target: https://github.com/genomehubs/genomehubs/compare/2.0.7...main .. |license| image:: https://anaconda.org/tolkit/genomehubs/badges/license.svg :alt: MIT License diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 06d22bc5..f8add197 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "genomehubs" %} -{% set version = "2.0.6" %} +{% set version = "2.0.7" %} package: name: {{ name }} diff --git a/docs/conf.py b/docs/conf.py index b6dac8a1..1a6b55b3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,7 @@ version = release = get_distribution("genomehubs").version except Exception: traceback.print_exc() - version = release = "2.0.6" + version = release = "2.0.7" pygments_style = "trac" templates_path = ["."] diff --git a/scripts/conda_build.sh b/scripts/conda_build.sh index 79064de9..45afbb26 100755 --- a/scripts/conda_build.sh +++ b/scripts/conda_build.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -VERSION=2.0.6 +VERSION=2.0.7 case $(uname | tr '[:upper:]' '[:lower:]') in linux*) diff --git a/setup.py b/setup.py index ab311ee9..cd1829ed 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def read(*names, **kwargs): setup( name="genomehubs", # Required - version="2.0.6", + version="2.0.7", description="GenomeHubs", # Optional long_description="%s\n%s" % ( diff --git a/src/genomehubs/lib/version.py b/src/genomehubs/lib/version.py index bab7ca25..0e87ce51 100644 --- a/src/genomehubs/lib/version.py +++ b/src/genomehubs/lib/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python3 """genomehubs version.""" -__version__ = "2.0.6" +__version__ = "2.0.7" From c83d203e9c1c2a1165d01ccfb82b9065d5433a06 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Thu, 4 Mar 2021 09:23:54 +0000 Subject: [PATCH 05/21] check for blanks when grouping taxa without IDs --- src/genomehubs/lib/hub.py | 2 ++ src/genomehubs/lib/index.py | 18 +++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index e3f1315a..80a33e4e 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -454,6 +454,8 @@ def process_row(types, row): for group in data.keys(): if group in types: for key, meta in types[group].items(): + if "index" not in meta: + continue try: if isinstance(meta["index"], list): char = meta.get("join", "") diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py index 344c5e0b..75e32426 100644 --- a/src/genomehubs/lib/index.py +++ b/src/genomehubs/lib/index.py @@ -79,6 +79,11 @@ LOGGER = tolog.logger(__name__) +def not_blank(key, obj, blanks): + """Test value is not blank.""" + return key in obj and obj[key] and obj[key] not in blanks + + def index_file(es, types, data, opts): """Index a file.""" delimiters = {"csv": ",", "tsv": "\t"} @@ -107,17 +112,16 @@ def index_file(es, types, data, opts): failed_rows["None"].append(row) continue taxon_types.update(new_taxon_types) - if ( - "taxon_id" in processed_data["taxonomy"] - and processed_data["taxonomy"]["taxon_id"] not in blanks - ): + if not_blank("taxon_id", processed_data["taxonomy"], blanks): with_ids[processed_data["taxonomy"]["taxon_id"]].append(processed_data) taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append( taxon_data ) imported_rows.append(row) else: - if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]: + if "taxonomy" in types and not_blank( + "alt_taxon_id", processed_data["taxonomy"], blanks + ): without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append( processed_data ) @@ -125,7 +129,7 @@ def index_file(es, types, data, opts): taxon_data ) failed_rows[processed_data["taxonomy"]["alt_taxon_id"]].append(row) - elif "subspecies" in processed_data["taxonomy"]: + elif not_blank("subspecies", processed_data["taxonomy"], blanks): without_ids[processed_data["taxonomy"]["subspecies"]].append( processed_data ) @@ -133,7 +137,7 @@ def index_file(es, types, data, opts): taxon_data ) failed_rows[processed_data["taxonomy"]["subspecies"]].append(row) - elif "species" in processed_data["taxonomy"]: + elif not_blank("species", processed_data["taxonomy"], blanks): without_ids[processed_data["taxonomy"]["species"]].append( processed_data ) From 638dd3226d62638fc8adbc2c3a6a82e485eb4a65 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Tue, 9 Mar 2021 09:35:51 +0000 Subject: [PATCH 06/21] keep descendant nodes in memory during fill Fixes #50 --- src/genomehubs/lib/fill.py | 122 +++++++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 24 deletions(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index 023c154e..4c25bc4d 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -59,11 +59,6 @@ from .es_functions import stream_template_search_results from .version import __version__ -# if platform.system() != "Linux": -# from multiprocessing import set_start_method - -# set_start_method("fork") - LOGGER = tolog.logger(__name__) @@ -201,8 +196,10 @@ def summarise_attribute_values( def summarise_attributes(*, attributes, attrs, meta, parent, parents): """Set attribute summary values.""" changed = False + attr_dict = {} for node_attribute in attributes: if node_attribute["key"] in attrs: + attr_dict[node_attribute["key"]] = node_attribute summary_value, max_value, min_value = summarise_attribute_values( node_attribute, meta[node_attribute["key"]] ) @@ -225,14 +222,25 @@ def summarise_attributes(*, attributes, attrs, meta, parent, parents): parents[parent][node_attribute["key"]]["min"] = min( parents[parent][node_attribute["key"]]["min"], min_value ) - return changed + return changed, attr_dict def set_values_from_descendants( - *, attributes, descendant_values, meta, parent, taxon_rank, parents + *, + attributes, + descendant_values, + meta, + taxon_id, + parent, + taxon_rank, + parents, + attr_dict=None, + limits=None ): """Set attribute summary values from descendant values.""" changed = False + if attr_dict is None: + attr_dict = {} for key, obj in descendant_values.items(): traverseable = meta[key].get("traverse", False) if ( @@ -243,9 +251,11 @@ def set_values_from_descendants( traverseable = False if not traverseable: continue + if taxon_id in limits[key]: + continue traverse_limit = meta[key].get("traverse_limit", None) if traverse_limit and taxon_rank == traverse_limit: - continue + limits[key].add(parent) try: attribute = next(entry for entry in attributes if entry["key"] == key) except StopIteration: @@ -261,8 +271,7 @@ def set_values_from_descendants( if summary_value is not None: attribute["aggregation_source"] = "descendant" changed = True - if traverse_limit and taxon_rank == traverse_limit: - continue + attr_dict.update({key: attribute}) if parent is not None: if isinstance(summary_value, list): parents[parent][key]["values"] = list( @@ -278,7 +287,60 @@ def set_values_from_descendants( parents[parent][key]["min"] = min( parents[parent][key]["min"], min_value ) - return changed + return changed, attr_dict + + +def set_attributes_to_descend(meta): + """Set which attributes should have values inferred from ancestral taxa.""" + desc_attrs = set() + desc_attr_limits = {} + for key, value in meta.items(): + if "traverse" in value and value["traverse"]: + if "traverse_direction" not in value or value["traverse_direction"] in ( + "down", + "both", + ): + desc_attrs.add(key) + if "traverse_limit" in value: + desc_attr_limits.update({key: value["traverse_limit"]}) + return desc_attrs, desc_attr_limits + + +def track_missing_attribute_values( + node, missing_attributes, attr_dict, desc_attrs, desc_attr_limits +): + """Keep track of missing attribute values for in memory traversal.""" + missing_from_descendants = {} + if ( + node["_source"]["taxon_id"] in missing_attributes + and missing_attributes[node["_source"]["taxon_id"]] + ): + for child_id, obj in missing_attributes[node["_source"]["taxon_id"]].items(): + for key, attribute in attr_dict.items(): + if key in obj["keys"]: + # update aggregation source here + # TODO: #51 include ancestral rank in aggregation source + obj["attributes"].append( + {**attribute, "aggregation_source": "ancestor"} + ) + obj["keys"].remove(key) + if obj["keys"]: + missing_from_descendants.update({child_id: obj}) + else: + # yield when all values filled or removed + yield obj["node"]["_id"], obj["node"]["_source"] + del missing_attributes[node["_source"]["taxon_id"]] + if "parent" in node["_source"]: + missing_attributes[node["_source"]["parent"]].update(missing_from_descendants) + missing_attributes[node["_source"]["parent"]].update( + { + node["_source"]["taxon_id"]: { + "keys": set({key for key in desc_attrs if key not in attr_dict}), + "attributes": node["_source"]["attributes"], + "node": node, + } + } + ) def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): @@ -299,6 +361,12 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): lambda: {"max": float("-inf"), "min": float("inf"), "values": []} ) ) + limits = defaultdict(set) + if "traverse-infer-both" in opts and opts["traverse-infer-both"]: + desc_attrs, desc_attr_limits = set_attributes_to_descend(meta) + missing_attributes = defaultdict(dict) + else: + desc_attrs = {} while root_depth >= 0: nodes = stream_nodes_by_root_depth( es, @@ -309,32 +377,45 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): ) ctr = 0 for node in nodes: + # TODO: break into sub functions ctr += 1 changed = False + attr_dict = {} if "attributes" in node["_source"] and node["_source"]["attributes"]: - changed = summarise_attributes( + changed, attr_dict = summarise_attributes( attributes=node["_source"]["attributes"], attrs=attrs, meta=meta, parent=node["_source"].get("parent", None), parents=parents, ) + else: + node["_source"]["attributes"] = [] if node["_source"]["taxon_id"] in parents: - if "attributes" not in node["_source"]: - node["_source"]["attributes"] = [] - modified = set_values_from_descendants( + modified, attr_dict = set_values_from_descendants( attributes=node["_source"]["attributes"], descendant_values=parents[node["_source"]["taxon_id"]], meta=meta, + taxon_id=node["_source"]["taxon_id"], parent=node["_source"].get("parent", None), parents=parents, taxon_rank=node["_source"]["taxon_rank"], + attr_dict=attr_dict, + limits=limits, ) if not changed: changed = modified + if desc_attrs: + yield from track_missing_attribute_values( + node, missing_attributes, attr_dict, desc_attrs, desc_attr_limits + ) if changed: yield node["_id"], node["_source"] root_depth -= 1 + if desc_attrs: + for incomplete in missing_attributes.values(): + for obj in incomplete.values(): + yield obj["node"]["_id"], obj["node"]["_source"] def copy_attribute_summary(source, meta): @@ -399,9 +480,7 @@ def traverse_from_root(es, opts, *, template, root=None, max_depth=None, log=Tru root = opts["traverse-root"] if max_depth is None: max_depth = get_max_depth_by_lineage( - es, - index=template["index_name"], - root=root, + es, index=template["index_name"], root=root ) root_depth = max_depth - 1 meta = template["types"]["attributes"] @@ -414,11 +493,7 @@ def traverse_from_root(es, opts, *, template, root=None, max_depth=None, log=Tru if log: LOGGER.info("Filling values at root depth %d" % root_depth) nodes = stream_nodes_by_root_depth( - es, - index=template["index_name"], - root=root, - depth=root_depth, - size=50, + es, index=template["index_name"], root=root, depth=root_depth, size=50 ) desc_nodes = stream_missing_attributes_at_level( es, nodes=nodes, attrs=attrs, template=template @@ -494,7 +569,6 @@ def main(args): options = config("fill", **args) if "traverse-infer-both" in options["fill"]: options["fill"]["traverse-infer-ancestors"] = True - options["fill"]["traverse-infer-descendants"] = True # Start Elasticsearch es = launch_es(options["fill"]) From 2250481b96769245b47b90edc9778a4ae98bfbf3 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 19 Mar 2021 11:52:36 +0000 Subject: [PATCH 07/21] Load taxon names as xrefs (#55) --- src/genomehubs/lib/attributes.py | 50 +++++++++++++++-------- src/genomehubs/lib/fill.py | 1 + src/genomehubs/lib/hub.py | 22 +++++++++- src/genomehubs/lib/index.py | 3 +- src/genomehubs/lib/taxon.py | 18 ++++---- src/genomehubs/templates/attributes.json | 5 +++ src/genomehubs/templates/identifiers.json | 38 +++++++++++++++++ src/genomehubs/templates/taxon.json | 11 +++++ 8 files changed, 119 insertions(+), 29 deletions(-) create mode 100644 src/genomehubs/templates/identifiers.json diff --git a/src/genomehubs/lib/attributes.py b/src/genomehubs/lib/attributes.py index 831c49bb..aefc5e61 100644 --- a/src/genomehubs/lib/attributes.py +++ b/src/genomehubs/lib/attributes.py @@ -12,42 +12,56 @@ LOGGER = tolog.logger(__name__) -def index_template(opts): +def index_template(opts, *, index_type="attribute"): """Index template (includes name, mapping and types).""" - parts = ["attributes", opts["hub-name"], opts["hub-version"]] + parts = ["%ss" % index_type, opts["hub-name"], opts["hub-version"]] template = index_templator(parts, opts) return template -def stream_attributes(group, attributes): +def stream_attributes(group, attributes, *, index_type="attribute"): """Stream attributes for indexing.""" for name, obj in attributes.items(): ret = {"group": group, "name": name} for prop, value in obj.items(): if not prop.startswith("taxon_"): ret.update({prop: value}) - yield "attribute-%s-%s" % (group, name), ret + yield "%s-%s-%s" % (index_type, group, name), ret -def index(es, group, attributes, opts): - """Index a set of attributes.""" - LOGGER.info("Indexing attributes") - template = index_template(opts) - stream = stream_attributes(group, attributes) +def index(es, group, attributes, opts, *, index_type="attribute"): + """Index a set of attributes or names.""" + LOGGER.info("Indexing %s" % index_type) + template = index_template(opts, index_type=index_type) + stream = stream_attributes(group, attributes, index_type=index_type) return template, stream def index_types(es, types_name, types, opts): """Index types into Elasticsearch.""" - if "attributes" not in types: - return - if "defaults" in types: - for key, value in types["attributes"].items(): - value = {**types["defaults"]["attributes"], **value} - types["attributes"][key] = value - template, stream = index(es, types_name, types["attributes"], opts) - load_mapping(es, template["name"], template["mapping"]) - index_stream(es, template["index_name"], stream) + if "attributes" in types: + if "defaults" in types and "attributes" in types["defaults"]: + for key, value in types["attributes"].items(): + value = {**types["defaults"]["attributes"], **value} + types["attributes"][key] = value + template, stream = index( + es, types_name, types["attributes"], opts, index_type="attribute" + ) + load_mapping(es, template["name"], template["mapping"]) + index_stream(es, template["index_name"], stream) + if "taxon_names" in types: + if "defaults" in types and "taxon_names" in types["defaults"]: + for key, value in types["names"].items(): + value = { + **types["defaults"]["taxon_names"], + **value, + } + types["taxon_names"][key] = value + template, stream = index( + es, types_name, types["taxon_names"], opts, index_type="identifier" + ) + load_mapping(es, template["name"], template["mapping"]) + index_stream(es, template["index_name"], stream) def fetch_types(es, types_name, opts): diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index 4c25bc4d..c9f9d416 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -254,6 +254,7 @@ def set_values_from_descendants( if taxon_id in limits[key]: continue traverse_limit = meta[key].get("traverse_limit", None) + # TODO: #53 catch traverse limits when limit rank is missing if traverse_limit and taxon_rank == traverse_limit: limits[key].add(parent) try: diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index 80a33e4e..2c842285 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -432,6 +432,22 @@ def validate_types_file(types_file, dir_path): return types, data +def set_xrefs(taxon_names, types, row, *, meta=None): + """Set xrefs for taxon_names.""" + if meta is None: + meta = {} + names = [] + for name_class, value in taxon_names.items(): + taxon = {"name": value, "class": name_class} + if "xref" in types[name_class] and types[name_class]["xref"]: + if "source" in meta: + taxon.update({"source": meta["source"]}) + if "source_stub" in meta: + taxon.update({"source_stub": meta["source_stub"]}) + names.append(taxon) + return names + + def process_row(types, row): """Process a row of data.""" data = { @@ -479,7 +495,7 @@ def process_row(types, row): taxon_data = {} taxon_types = {} for attr_type in list(["attributes", "identifiers"]): - if data[attr_type]: + if attr_type in data and data[attr_type]: ( data[attr_type], taxon_data[attr_type], @@ -492,6 +508,10 @@ def process_row(types, row): ) else: data[attr_type] = [] + if "taxon_names" in data and data["taxon_names"]: + data["taxon_names"] = set_xrefs( + data["taxon_names"], types["taxon_names"], row, meta=data["metadata"] + ) return data, taxon_data, taxon_types.get("attributes", {}) diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py index 75e32426..c10266b2 100644 --- a/src/genomehubs/lib/index.py +++ b/src/genomehubs/lib/index.py @@ -108,7 +108,8 @@ def index_file(es, types, data, opts): for row in tqdm(rows): try: processed_data, taxon_data, new_taxon_types = process_row(types, row) - except Exception: + except Exception as err: + print(err) failed_rows["None"].append(row) continue taxon_types.update(new_taxon_types) diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index c60e7dd6..4302c7c7 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -333,15 +333,15 @@ def add_names_to_list(existing, new, *, blanks=set({"NA", "None"})): names = defaultdict(dict) for entry in existing: names[entry["class"]][entry["name"]] = True - for name_class, name in new.items(): - name_class = name_class.replace("_", " ") + for entry in new: + entry["class"] = entry["class"].lower() # .replace("_", " ") if ( - name not in blanks - and name_class not in names - and name not in names[name_class] + entry["name"] not in blanks + and entry["class"] not in names + and entry["name"] not in names[entry["class"]] ): - existing.append({"name": name, "class": name_class}) - names[name_class][name] = True + existing.append(entry) + names[entry["class"]][entry["name"]] = True def add_names_and_attributes_to_taxa( @@ -363,13 +363,13 @@ def add_names_and_attributes_to_taxa( for doc in taxa: if doc is not None: taxon_data = data[doc["_source"]["taxon_id"]] - taxon_names = {} + taxon_names = [] attributes = [] for entry in taxon_data: if "attributes" in entry: attributes = attributes + entry["attributes"] if "taxon_names" in entry: - taxon_names.update(entry["taxon_names"]) + taxon_names += entry["taxon_names"] if "taxon_names" not in doc["_source"]: doc["_source"]["taxon_names"] = [] add_names_to_list( diff --git a/src/genomehubs/templates/attributes.json b/src/genomehubs/templates/attributes.json index 8f0239e4..9bf5ba34 100644 --- a/src/genomehubs/templates/attributes.json +++ b/src/genomehubs/templates/attributes.json @@ -12,6 +12,11 @@ "ignore_above": 32, "meta": { "description": "Attribute name" } }, + "display_name": { + "type": "keyword", + "index": false, + "meta": { "description": "Attribute display name" } + }, "constraint": { "type": "object" }, diff --git a/src/genomehubs/templates/identifiers.json b/src/genomehubs/templates/identifiers.json new file mode 100644 index 00000000..392d01e0 --- /dev/null +++ b/src/genomehubs/templates/identifiers.json @@ -0,0 +1,38 @@ +{ + "index_patterns": ["identifiers-*"], + "mappings": { + "properties": { + "group": { + "type": "keyword", + "ignore_above": 16, + "meta": { "description": "Index group (e.g. assembly or taxon)" } + }, + "name_class": { + "type": "keyword", + "ignore_above": 32, + "meta": { "description": "Name class" } + }, + "display_name": { + "type": "keyword", + "index": false, + "meta": { "description": "Display name" } + }, + "display_group": { + "type": "keyword", + "ignore_above": 32, + "null_value": "names", + "meta": { "description": "Display name" } + }, + "source": { + "type": "keyword", + "ignore_above": 32, + "meta": { "description": "Source name" } + }, + "source_url_stub": { + "type": "keyword", + "index": false, + "meta": { "description": "URL stub for xref" } + } + } + } +} diff --git a/src/genomehubs/templates/taxon.json b/src/genomehubs/templates/taxon.json index dd606808..103d5a9d 100644 --- a/src/genomehubs/templates/taxon.json +++ b/src/genomehubs/templates/taxon.json @@ -80,6 +80,17 @@ "meta": { "description": "Name class (e.g. common name, synonym, etc.)" } + }, + "source": { + "type": "keyword", + "ignore_above": 64, + "normalizer": "lowercase", + "meta": { "description": "Source DB for taxon name" } + }, + "source_url_stub": { + "type": "keyword", + "index": false, + "meta": { "description": "URL slug for taxon name xref" } } } }, From 3219c70eb4105a86d4e4018d412d1ae3f37092ba Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 19 Mar 2021 15:06:30 +0000 Subject: [PATCH 08/21] parse wikidata xrefs Fixes #55 --- src/genomehubs/lib/btk.py | 2 +- src/genomehubs/lib/gbif.py | 2 +- src/genomehubs/lib/hub.py | 26 +++++-- src/genomehubs/lib/ncbi.py | 4 +- src/genomehubs/lib/parse.py | 17 ++--- src/genomehubs/lib/wikidata.py | 68 +++++++++++-------- .../{xref.names.yaml => wikidata.names.yaml} | 8 ++- src/genomehubs/templates/xref.types.yaml | 38 ----------- 8 files changed, 76 insertions(+), 89 deletions(-) rename src/genomehubs/templates/{xref.names.yaml => wikidata.names.yaml} (78%) delete mode 100644 src/genomehubs/templates/xref.types.yaml diff --git a/src/genomehubs/lib/btk.py b/src/genomehubs/lib/btk.py index c6ad1828..53a4d0eb 100644 --- a/src/genomehubs/lib/btk.py +++ b/src/genomehubs/lib/btk.py @@ -87,7 +87,7 @@ def describe_btk_files(meta): return files -def btk_parser(_params, opts): +def btk_parser(_params, opts, *args, **kwargs): """Parse BlobToolKit assemblies.""" parsed = [] analyses = [] diff --git a/src/genomehubs/lib/gbif.py b/src/genomehubs/lib/gbif.py index c379e386..b31b6652 100644 --- a/src/genomehubs/lib/gbif.py +++ b/src/genomehubs/lib/gbif.py @@ -87,7 +87,7 @@ def fetch_gbif_identifiers(taxon, *, xrefs=None): return identifiers -def gbif_parser(_params, opts): +def gbif_parser(_params, opts, *args, **kwargs): """Parse GBIF taxa and identifiers.""" parsed = [] for root in opts["gbif-root"]: diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index 2c842285..64831ed7 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -63,13 +63,16 @@ def index_templator(parts, opts): return template -def order_parsed_fields(parsed, types, names=None): - """Order parsed fields using a template file.""" - columns = {} - fields = {} - ctr = 0 - types = deepcopy(types) - if names is not None: +def add_names_to_types(names, types): + """Add names field meta to type field meta.""" + sources = 0 + if types is not None: + types = deepcopy(types) + sources += 1 + elif names is not None: + types = deepcopy(names) + sources += 1 + if sources == 2: for group, entries in names.items(): if group not in types: types[group] = deepcopy(entries) @@ -80,6 +83,15 @@ def order_parsed_fields(parsed, types, names=None): types[group][field] = deepcopy(attrs) elif types[group][field]["header"] != attrs["header"]: types[group]["names_%s" % field] = deepcopy(attrs) + return types + + +def order_parsed_fields(parsed, types, names=None): + """Order parsed fields using a template file.""" + columns = {} + fields = {} + ctr = 0 + types = add_names_to_types(names, types) for group, entries in types.items(): for field, attrs in entries.items(): header = False diff --git a/src/genomehubs/lib/ncbi.py b/src/genomehubs/lib/ncbi.py index 8ab16b1d..404409ae 100644 --- a/src/genomehubs/lib/ncbi.py +++ b/src/genomehubs/lib/ncbi.py @@ -168,7 +168,7 @@ def parse_listing(listing, collection, opts): return parsed -def refseq_organelle_parser(collections, opts): +def refseq_organelle_parser(collections, opts, *args, **kwargs): """Fetch and parse RefSeq organelle collections.""" parsed = [] if isinstance(collections, tuple): @@ -233,7 +233,7 @@ def parse_ncbi_datasets_record(record, parsed): parsed[obj["genbankAssmAccession"]] = obj -def ncbi_genome_parser(directory, opts): +def ncbi_genome_parser(directory, opts, *args, **kwargs): """Parse NCBI Datasets genome report.""" parsed = {} with tofile.open_file_handle( diff --git a/src/genomehubs/lib/parse.py b/src/genomehubs/lib/parse.py index cbef1219..758c2495 100644 --- a/src/genomehubs/lib/parse.py +++ b/src/genomehubs/lib/parse.py @@ -44,7 +44,6 @@ from .btk import btk_parser from .config import config -from .gbif import gbif_parser from .hub import load_types from .hub import order_parsed_fields from .ncbi import ncbi_genome_parser @@ -56,7 +55,6 @@ PARSERS = { "btk": {"func": btk_parser, "params": None, "types": "btk"}, - "gbif": {"func": gbif_parser, "params": None, "types": "xref"}, "ncbi-datasets-genome": { "func": ncbi_genome_parser, "params": None, @@ -77,7 +75,7 @@ "params": ("plastid"), "types": "organelle", }, - "wikidata": {"func": wikidata_parser, "params": None, "types": "xref"}, + "wikidata": {"func": wikidata_parser, "params": None, "types": "wikidata"}, } @@ -91,23 +89,26 @@ def main(args): if params is None: params = options["parse"][option] LOGGER.info("Parsing %s" % option) - parsed = PARSERS[option]["func"](params, options["parse"]) + types = load_types(PARSERS[option]["types"]) + names = load_types(PARSERS[option]["types"], part="names") + parsed = PARSERS[option]["func"]( + params, options["parse"], types=types, names=names + ) files = [] if isinstance(parsed, tuple): parsed, files = parsed - types = load_types(PARSERS[option]["types"]) - names = load_types(PARSERS[option]["types"], part="names") data = order_parsed_fields(parsed, types, names) tofile.write_file(options["parse"]["outfile"], data) filepath = Path(options["parse"]["outfile"]) - types["file"]["name"] = filepath.name outdir = filepath.parent suff = re.compile(r"\.[^\.]+$") if filepath.name.endswith(".gz"): stem = re.sub(suff, "", filepath.stem) else: stem = filepath.stem - tofile.write_file("%s/%s.types.yaml" % (outdir, stem), types) + if types: + types["file"]["name"] = filepath.name + tofile.write_file("%s/%s.types.yaml" % (outdir, stem), types) if names: names["file"]["name"] = filepath.name tofile.write_file("%s/%s.names.yaml" % (outdir, stem), names) diff --git a/src/genomehubs/lib/wikidata.py b/src/genomehubs/lib/wikidata.py index 79a632f8..d3c81d07 100644 --- a/src/genomehubs/lib/wikidata.py +++ b/src/genomehubs/lib/wikidata.py @@ -46,28 +46,34 @@ # } SOURCES = { - "BOLD": { + "bold": { "property": "P3606", - "source": "BOLD Systems taxon ID", + "source": "BOLD", + "display_name": "BOLD Systems taxon ID", "stub": "http://www.boldsystems.org/index.php/TaxBrowser_TaxonPage?taxid=", }, - "GBIF": { + "gbif": { "property": "P846", - "source": "GBIF taxonKey", + "source": "GBIF", + "display_name": "GBIF taxonKey", "stub": "https://www.gbif.org/species/", }, - "NBN": { + "nbn": { "property": "P3240", - "source": "NBN System Key", + "source": "NBN", + "display_name": "NBN System Key", "stub": "https://data.nbn.org.uk/Taxa/", }, - "NCBI": { + "ncbi": { "property": "P685", - "source": "NCBI taxonomy ID", + "source": "NCBI", + "display_name": "NCBI taxonomy ID", "stub": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=", }, - "WIKIDATA": { - "source": "Wikidata entity", + "wikidata": { + "property": None, + "source": "Wikidata", + "display_name": "Wikidata entity", "stub": "https://www.wikidata.org/wiki/", }, } @@ -189,17 +195,18 @@ def prepare_xref_rows(key, meta, entities): "subphylum", "phylum", ] - dbs = ["NCBI", "GBIF", "BOLD", "NBN"] lineage = meta["lineage"] rows = [] common = {} + for db in SOURCES.keys(): + common.update({db: "None"}) entity = key.replace(WD, "") for rank in ranks: if rank in lineage: common.update({rank: lineage[rank]}) - if SOURCES["NCBI"]["property"] in meta: - common.update({"ncbiTaxonId": meta[SOURCES["NCBI"]["property"]]}) - common.update({"taxonId": meta[SOURCES["NCBI"]["property"]]}) + if SOURCES["ncbi"]["property"] in meta: + common.update({"ncbiTaxonId": meta[SOURCES["ncbi"]["property"]]}) + common.update({"taxonId": meta[SOURCES["ncbi"]["property"]]}) else: common.update({"taxonId": entity}) if "P225" in meta: @@ -209,33 +216,36 @@ def prepare_xref_rows(key, meta, entities): common.update({rank: name}) common.update({"wikidataTaxonId": entity}) row = {**common} - row.update( - { - "xref": "%s:%s" % ("WIKIDATA", entity), - "source": SOURCES["WIKIDATA"]["source"], - "sourceStub": SOURCES["WIKIDATA"]["stub"], - "sourceSlug": entity, - } - ) - rows.append(row) - for db in dbs: - if SOURCES[db]["property"] in meta: + for db in SOURCES.keys(): + if SOURCES[db]["property"] is None or SOURCES[db]["property"] in meta: row = {**common} - slug = str(meta[SOURCES[db]["property"]]) + if SOURCES[db]["property"] is None: + slug = entity + else: + slug = str(meta[SOURCES[db]["property"]]) row.update( { - "xref": "%s:%s" % (db, slug), + db: slug, "source": SOURCES[db]["source"], "sourceStub": SOURCES[db]["stub"], - "sourceSlug": slug, } ) rows.append(row) return rows -def wikidata_parser(_params, opts): +def wikidata_parser(_params, opts, *, types=None, names=None): """Parse WikiData taxa and identifiers.""" + if names is None: + names = {} + if "taxon_names" not in names: + names["taxon_names"] = {} + for db, values in SOURCES.items(): + names["taxon_names"][db] = { + "display_name": values["display_name"], + "header": db, + "xref": True, + } parsed = [] entities, ranks = fetch_wikidata_rank_entities() roots = opts.get("wikidata-root", None) diff --git a/src/genomehubs/templates/xref.names.yaml b/src/genomehubs/templates/wikidata.names.yaml similarity index 78% rename from src/genomehubs/templates/xref.names.yaml rename to src/genomehubs/templates/wikidata.names.yaml index e2e2ddeb..a53bcc4f 100644 --- a/src/genomehubs/templates/xref.names.yaml +++ b/src/genomehubs/templates/wikidata.names.yaml @@ -1,9 +1,11 @@ file: format: tsv header: true -taxon_names: - wikidata_id: - header: wikidataTaxonId +metadata: + source: + header: source + source_stub: + header: sourceStub taxonomy: taxon_id: header: ncbiTaxonId diff --git a/src/genomehubs/templates/xref.types.yaml b/src/genomehubs/templates/xref.types.yaml deleted file mode 100644 index 4c0ead06..00000000 --- a/src/genomehubs/templates/xref.types.yaml +++ /dev/null @@ -1,38 +0,0 @@ -file: - display_group: xref - format: tsv - header: true -attributes: - xref: - description: External database references - display_group: xrefs - display_level: 1 - display_name: DB xrefs - header: xref - summary: list - traverse: false - type: keyword -metadata: - source: - header: source - source_slug: - header: sourceSlug - source_url_stub: - header: sourceStub -taxonomy: - taxon_id: - header: taxonId - phylum: - header: phylum - class: - header: class - order: - header: order - family: - header: family - genus: - header: genus - species: - header: species - subspecies: - header: subspecies From 48862b978c6f402a3534906d85e2cbbdc2dcc9d1 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 19 Mar 2021 15:47:25 +0000 Subject: [PATCH 09/21] add comment character(s) to ignore when indexing Fixes #54 --- src/genomehubs/lib/index.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py index c10266b2..5db8e5c9 100644 --- a/src/genomehubs/lib/index.py +++ b/src/genomehubs/lib/index.py @@ -84,11 +84,24 @@ def not_blank(key, obj, blanks): return key in obj and obj[key] and obj[key] not in blanks +def strip_comments(data, types): + """Strip comment lines from a file stream.""" + comment_chars = {"#"} + if "file" in types and "comment" in types["file"]: + comment_chars.update(set(types["file"]["comment"])) + for row in data: + if row[0] in comment_chars: + continue + yield row + + def index_file(es, types, data, opts): """Index a file.""" delimiters = {"csv": ",", "tsv": "\t"} rows = csv.reader( - data, delimiter=delimiters[types["file"]["format"]], quotechar='"' + strip_comments(data, types), + delimiter=delimiters[types["file"]["format"]], + quotechar='"', ) if "header" in types["file"] and types["file"]["header"]: header = next(rows) From 2025061d04d03e981bf416afd283d74563cf262b Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 22 Mar 2021 14:01:04 +0000 Subject: [PATCH 10/21] Check spelling when indexing Fixes #58 --- src/genomehubs/lib/fill.py | 2 - src/genomehubs/lib/hub.py | 49 ++++++++++- src/genomehubs/lib/index.py | 10 ++- src/genomehubs/lib/taxon.py | 161 ++++++++++++++++++++++++++++++------ 4 files changed, 194 insertions(+), 28 deletions(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index c9f9d416..01ce066d 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -430,8 +430,6 @@ def copy_attribute_summary(source, meta): try: dest["%s_value" % meta["type"]] = source["%s_value" % meta["type"]] except KeyError as err: - print(source) - print(meta) raise (err) dest["count"] = source["count"] dest["key"] = source["key"] diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index 64831ed7..22cff7b2 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -562,6 +562,53 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"): for row in rows: data.append(row) LOGGER.info( - "Writing %d records to %s file '%s", len(data) - header_len, label, outfile + "Writing %d records to %s file '%s'", len(data) - header_len, label, outfile ) tofile.write_file(outfile, data) + + +def write_spellchecked_taxa(spellings, opts, *, types, header=None): + """Write spellchecked taxa to file.""" + imported = [] + exceptions = [] + file_key = "%s-exception" % opts["index"] + dir_key = "%s-dir" % opts["index"] + filepath = Path(types["file"]["name"]) + extensions = "".join(filepath.suffixes) + file_basename = str(filepath).replace(extensions, "") + for name, matches in spellings.items(): + # enable test condition below if importing spellchecked taxa: + # if len(matches) == 1: + # imported.append([name, matches[0]]) + # else: + exceptions.append([name] + matches) + if imported: + label = "imported" + if file_key in opts and opts[file_key]: + outdir = opts[file_key] + else: + outdir = "%s/%s" % (opts[dir_key], label) + os.makedirs(outdir, exist_ok=True) + outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename) + LOGGER.info( + "Writing %d spelling corrections to %s file '%s'", + len(imported), + label, + outfile, + ) + tofile.write_file(outfile, [["input", "corrected"]] + imported) + if exceptions: + label = "exceptions" + if file_key in opts and opts[file_key]: + outdir = opts[file_key] + else: + outdir = "%s/%s" % (opts[dir_key], label) + os.makedirs(outdir, exist_ok=True) + outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename) + LOGGER.info( + "Writing %d spelling suggestions to %s file '%s'", + len(exceptions), + label, + outfile, + ) + tofile.write_file(outfile, [["input", "suggested"]] + exceptions) diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py index 5db8e5c9..32ccea37 100644 --- a/src/genomehubs/lib/index.py +++ b/src/genomehubs/lib/index.py @@ -9,7 +9,8 @@ [--es-host URL...] [--assembly-dir PATH] [--assembly-repo URL] [--assembly-exception PATH] [--taxon-dir PATH] [--taxon-repo URL] [--taxon-exception PATH] - [--taxon-lookup STRING] [--file PATH...] [file-dir PATH...] + [--taxon-lookup STRING] [--taxon-spellcheck] + [--file PATH...] [file-dir PATH...] [--remote-file URL...] [--remote-file-dir URL...] [--taxon-id STRING] [--assembly-id STRING] [--analysis-id STRING] [--file-title STRING] [--file-description STRING] [--file-metadata PATH] @@ -26,7 +27,8 @@ --assembly-repo URL Remote git repository containing assembly-level data. Optionally include `~branch-name` suffix. --assembly-exception PATH Path to directory to write assembly data that failed to import. - --taxon-lookup STRING Taxon name class to lookup (scientific|all). [Default: scientific] + --taxon-lookup STRING Taxon name class to lookup (scientific|any). [Default: scientific] + --taxon-spellcheck Flag to use fuzzy matching to match taxon names. --taxon-dir PATH Path to directory containing taxon-level data. --taxon-repo URL Remote git repository containing taxon-level data. Optionally include `~branch-name` suffix. @@ -72,6 +74,7 @@ from .hub import set_column_indices from .hub import validate_types_file from .hub import write_imported_rows +from .hub import write_spellchecked_taxa from .taxon import add_names_and_attributes_to_taxa from .taxon import fix_missing_ids from .version import __version__ @@ -162,6 +165,7 @@ def index_file(es, types, data, opts): else: failed_rows["None"].append(row) LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys())) + spellings = {} create_ids, without_ids = fix_missing_ids( es, opts, @@ -173,7 +177,9 @@ def index_file(es, types, data, opts): with_ids=with_ids, blanks=blanks, header=header, + spellings=spellings, ) + write_spellchecked_taxa(spellings, opts, types=types, header=header) if with_ids or create_ids: write_imported_rows( imported_rows, opts, types=types, header=header, label="imported" diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index 4302c7c7..372eadfe 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -76,11 +76,13 @@ def lookup_taxa_by_taxon_id(es, values, template, *, return_type="list"): def lookup_missing_taxon_ids( - es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]) + es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]), spellings=None ): """Lookup taxon ID based on available taxonomic information.""" if with_ids is None: with_ids = {} + if spellings is None: + spellings = {} # TODO: set this list from types file ranks = [ "subspecies", @@ -103,7 +105,7 @@ def lookup_missing_taxon_ids( if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks: continue taxon_ids, name_class = lookup_taxon( - es, obj["taxonomy"][rank], opts, rank=rank + es, obj["taxonomy"][rank], opts, rank=rank, spellings=spellings ) if index == 1 and not taxon_ids: break @@ -180,15 +182,18 @@ def fix_missing_ids( with_ids=None, blanks=set(["NA", "None"]), header=None, + spellings=None, ): """Find or create taxon IDs for rows without.""" if with_ids is None: with_ids = {} + if spellings is None: + spellings = {} if without_ids: # TODO: support multiple taxonomies LOGGER.info("Looking up %d missing taxon IDs", len(without_ids.keys())) with_ids, without_ids, found_ids = lookup_missing_taxon_ids( - es, without_ids, opts, with_ids=with_ids, blanks=blanks + es, without_ids, opts, with_ids=with_ids, blanks=blanks, spellings=spellings ) # create new taxon IDs if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]: @@ -202,6 +207,7 @@ def fix_missing_ids( data=without_ids, blanks=blanks, taxon_template=taxon_template, + spellings=spellings, ) for created_id in created_ids: if created_id in without_ids: @@ -426,19 +432,53 @@ def lookup_taxon_within_lineage( return [] -def lookup_taxon( - es, name, opts, *, rank=None, name_class="scientific", return_type="taxon_id" -): - """Lookup taxon ID.""" - taxa = [] - template = index_template(opts["taxonomy-source"][0], opts) - body = { - "id": "taxon_by_name", - "params": {"taxon": name, "rank": rank}, +def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, return_type): + """Look up taxon name with fuzzy matching.""" + taxon_suggest = { + "id": "taxon_suggest", + "params": {"searchTerm": name, "max_errors": 3}, } - if name_class == "any": - body.update({"id": "taxon_by_any_name"}) - index = template["index_name"] + matches = None + with tolog.DisableLogger(): + suggestions = es.search_template( + body=taxon_suggest, index=index, rest_total_hits_as_int=True + ) + try: + options = suggestions["suggest"]["simple_phrase"][0]["options"] + matches = [ + option["text"] + for option in options + if option.get("collate_match", False) + ] + except KeyError: + return None + except ValueError: + return None + if matches and len(matches) > 1: + taxon_matches = {} + scientific_name = None + for match in matches: + body = { + "id": "taxon_by_any_name", + "params": {"taxon": match, "rank": rank}, + } + taxa = taxon_lookup( + es, body, index, taxonomy_index_template, opts, return_type="taxon" + ) + if len(taxa) > 1: + return matches + for taxon in taxa: + source = taxon["_source"] + taxon_matches[source["taxon_id"]] = source["scientific_name"] + scientific_name = source["scientific_name"] + if len(taxon_matches.keys()) == 1: + return [scientific_name] + return matches + + +def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type): + """Query elasticsearch for a taxon.""" + taxa = [] with tolog.DisableLogger(): res = es.search_template(body=body, index=index, rest_total_hits_as_int=True) if "hits" in res and res["hits"]["total"] > 0: @@ -458,9 +498,71 @@ def lookup_taxon( taxa = [hit["_source"]["taxon_id"] for hit in res["hits"]["hits"]] else: taxa = [hit for hit in res["hits"]["hits"]] - if not taxa and opts["taxon-lookup"] == "any" and name_class != "any": + return taxa + + +def lookup_taxon( + es, + name, + opts, + *, + rank=None, + name_class="scientific", + return_type="taxon_id", + spellings=None, +): + """Lookup taxon ID.""" + if spellings is None: + spellings = {} + template = index_template(opts["taxonomy-source"][0], opts) + index = template["index_name"] + body = { + "id": "taxon_by_name", + "params": {"taxon": name, "rank": rank}, + } + if name_class in {"any", "spellcheck"}: + body.update({"id": "taxon_by_any_name"}) + if name_class == "spellcheck": + matches = spellcheck_taxon( + es, name, index, rank, taxonomy_index_template, opts, return_type + ) + if matches: + spellings.update({name: matches}) + return [], name_class + # Uncomment code blow to use suggestion in current import + # if matches and len(matches) == 1: + # body["params"].update({"taxon": matches[0]}) + # else: + # return [], name_class + taxa = taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type) + if ( + not taxa + and opts["taxon-lookup"] == "any" + and name_class not in {"any", "spellcheck"} + ): taxa, name_class = lookup_taxon( - es, name, opts, rank=rank, name_class="any", return_type=return_type + es, + name, + opts, + rank=rank, + name_class="any", + return_type=return_type, + spellings=spellings, + ) + if ( + not taxa + and "taxon-spellcheck" in opts + and opts["taxon-spellcheck"] + and name_class != "spellcheck" + ): + taxa, name_class = lookup_taxon( + es, + name, + opts, + rank=rank, + name_class="spellcheck", + return_type=return_type, + spellings=spellings, ) return taxa, name_class @@ -533,8 +635,8 @@ def add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon, *, blanks={"NA", " return new_taxon -def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"])): - """Create new taxa using alternate taxon IDs.""" +def set_ranks(taxonomy): + """Set ranks for species/subspecies creation.""" default_ranks = [ "genus", "family", @@ -543,6 +645,20 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None" "subphylum", "phylum", ] + if "subspecies" in taxonomy: + ranks = ["species"] + default_ranks + else: + ranks = default_ranks + return ranks + + +def create_taxa( + es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"]), spellings=None +): + """Create new taxa using alternate taxon IDs.""" + if spellings is None: + spellings = {} + ancestors = {} matches = defaultdict(dict) pbar = tqdm(total=len(data.keys())) @@ -556,15 +672,14 @@ def create_taxa(es, opts, *, taxon_template, data=None, blanks=set(["NA", "None" lineage = [] closest_rank = None closest_taxon = None - if "subspecies" in obj["taxonomy"]: - ranks = ["species"] + default_ranks - else: - ranks = default_ranks + ranks = set_ranks(obj["taxonomy"]) max_index = len(ranks) - 1 # max_rank = ranks[max_index] for index, rank in enumerate(ranks[: (max_index - 1)]): if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks: continue + if obj["taxonomy"][rank] in spellings: + break intermediates = 0 for anc_rank in ranks[(index + 1) :]: if ( From fcadf1523dff58e0d3166db85ea6f6ac8da8e31b Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 22 Mar 2021 15:08:55 +0000 Subject: [PATCH 11/21] ensure spellchecked taxon names are not indexed --- src/genomehubs/lib/taxon.py | 59 ++++++------------------------------- 1 file changed, 9 insertions(+), 50 deletions(-) diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index 372eadfe..be30670f 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -645,11 +645,17 @@ def set_ranks(taxonomy): "subphylum", "phylum", ] + taxon_rank = None if "subspecies" in taxonomy: ranks = ["species"] + default_ranks + taxon_rank = "subspecies" else: ranks = default_ranks - return ranks + for rank in ["species"] + default_ranks: + if rank in taxonomy: + taxon_rank = rank + break + return ranks, taxon_rank def create_taxa( @@ -672,13 +678,13 @@ def create_taxa( lineage = [] closest_rank = None closest_taxon = None - ranks = set_ranks(obj["taxonomy"]) + ranks, taxon_rank = set_ranks(obj["taxonomy"]) max_index = len(ranks) - 1 # max_rank = ranks[max_index] for index, rank in enumerate(ranks[: (max_index - 1)]): if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks: continue - if obj["taxonomy"][rank] in spellings: + if obj["taxonomy"][taxon_rank] in spellings: break intermediates = 0 for anc_rank in ranks[(index + 1) :]: @@ -765,50 +771,3 @@ def create_taxa( stream_taxa(new_taxa), ) return new_taxa.keys() - - -# def parse_taxa(es, types, taxonomy_template): -# """Test method to parse taxa.""" -# taxa = [ -# { -# "taxon_id": 110368, -# "assembly_span": 12344567, -# "c_value": 2.5, -# "sex_determination_system": "N/A", -# }, -# { -# "taxon_id": 13037, -# "assembly_span": 2345678, -# "c_value": 2.3, -# "sex_determination_system": "XO", -# }, -# { -# "taxon_id": 113334, -# "assembly_span": 45678912, -# "c_value": 4.6, -# "sex_determination_system": "XY", -# }, -# ] -# for entry in taxa: -# # attributes = {} -# taxon_id = str(entry["taxon_id"]) -# doc = lookup_taxon_by_taxid(es, taxon_id, taxonomy_template) -# if doc is None: -# LOGGER.warning( -# "No %s taxonomy record for %s", -# taxonomy_template["index_name"], -# taxon_id, -# ) -# attributes = add_attributes(entry, types, attributes=[])[0] -# doc.update({"taxon_id": taxon_id, "attributes": attributes}) -# doc_id = "taxon_id-%s" % taxon_id -# yield doc_id, doc - - -# def index(es, opts, *, taxonomy_name="ncbi"): -# """Index a set of taxa.""" -# LOGGER.info("Indexing taxa using %s taxonomy", taxonomy_name) -# template = index_template(taxonomy_name, opts) -# taxonomy_template = taxonomy_index_template(taxonomy_name, opts) -# stream = parse_taxa(es, template["types"], taxonomy_template) -# return template, stream From cbbe66ab9491b9158b2103c311a88573edc65b9f Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 22 Mar 2021 16:57:30 +0000 Subject: [PATCH 12/21] allow multiple limit ranks (#53) --- src/genomehubs/lib/fill.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index c9f9d416..3aff541a 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -255,7 +255,7 @@ def set_values_from_descendants( continue traverse_limit = meta[key].get("traverse_limit", None) # TODO: #53 catch traverse limits when limit rank is missing - if traverse_limit and taxon_rank == traverse_limit: + if traverse_limit and taxon_rank in traverse_limit: limits[key].add(parent) try: attribute = next(entry for entry in attributes if entry["key"] == key) @@ -356,6 +356,11 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): ) root_depth = max_depth meta = template["types"]["attributes"] + for key, value in meta.items(): + if "traverse_limit" in value: + if not isinstance(value["traverse_limit"], list): + value["traverse_limit"] = [value["traverse_limit"]] + value["traverse_limit"] = set(value["traverse_limit"]) attrs = set(meta.keys()) parents = defaultdict( lambda: defaultdict( From c0c71aaec48130285ee96c56d31c2b6390370d0e Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Tue, 23 Mar 2021 09:57:41 +0000 Subject: [PATCH 13/21] track descendant ranks to fix leaky attributes Fixes #53 --- src/genomehubs/lib/fill.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index 3aff541a..e333cf9f 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -234,6 +234,7 @@ def set_values_from_descendants( parent, taxon_rank, parents, + descendant_ranks=None, attr_dict=None, limits=None ): @@ -249,14 +250,17 @@ def set_values_from_descendants( and meta[key]["traverse_direction"] == "down" ): traverseable = False - if not traverseable: - continue - if taxon_id in limits[key]: + if not traverseable or taxon_id in limits[key]: continue traverse_limit = meta[key].get("traverse_limit", None) - # TODO: #53 catch traverse limits when limit rank is missing - if traverse_limit and taxon_rank in traverse_limit: - limits[key].add(parent) + if traverse_limit: + if ( + descendant_ranks is not None + and traverse_limit in descendant_ranks[taxon_id] + ): + continue + if taxon_rank == traverse_limit: + limits[key].add(parent) try: attribute = next(entry for entry in attributes if entry["key"] == key) except StopIteration: @@ -344,6 +348,12 @@ def track_missing_attribute_values( ) +def track_descendant_ranks(node, descendant_ranks): + """Keep track of descendant ranks.""" + if "parent" in node["_source"]: + descendant_ranks[node["_source"]["parent"]].add(node["_source"]["taxon_rank"]) + + def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): """Traverse a tree, filling in values.""" if root is None: @@ -356,11 +366,11 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): ) root_depth = max_depth meta = template["types"]["attributes"] - for key, value in meta.items(): - if "traverse_limit" in value: - if not isinstance(value["traverse_limit"], list): - value["traverse_limit"] = [value["traverse_limit"]] - value["traverse_limit"] = set(value["traverse_limit"]) + # for key, value in meta.items(): + # if "traverse_limit" in value: + # if not isinstance(value["traverse_limit"], list): + # value["traverse_limit"] = [value["traverse_limit"]] + # value["traverse_limit"] = set(value["traverse_limit"]) attrs = set(meta.keys()) parents = defaultdict( lambda: defaultdict( @@ -371,6 +381,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): if "traverse-infer-both" in opts and opts["traverse-infer-both"]: desc_attrs, desc_attr_limits = set_attributes_to_descend(meta) missing_attributes = defaultdict(dict) + descendant_ranks = defaultdict(set) else: desc_attrs = {} while root_depth >= 0: @@ -384,6 +395,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): ctr = 0 for node in nodes: # TODO: break into sub functions + track_descendant_ranks(node, descendant_ranks) ctr += 1 changed = False attr_dict = {} @@ -405,6 +417,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): taxon_id=node["_source"]["taxon_id"], parent=node["_source"].get("parent", None), parents=parents, + descendant_ranks=descendant_ranks, taxon_rank=node["_source"]["taxon_rank"], attr_dict=attr_dict, limits=limits, From 05a968dc3ab002ead08a2cc0d86300191e97f720 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Tue, 23 Mar 2021 12:17:55 +0000 Subject: [PATCH 14/21] include ancestral rank in aggregation source Fixes #51 --- src/genomehubs/lib/fill.py | 10 +++++++--- src/genomehubs/templates/taxon.json | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index a25ed1aa..17b7c1b4 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -323,10 +323,14 @@ def track_missing_attribute_values( for child_id, obj in missing_attributes[node["_source"]["taxon_id"]].items(): for key, attribute in attr_dict.items(): if key in obj["keys"]: - # update aggregation source here - # TODO: #51 include ancestral rank in aggregation source + # update aggregation source to include ancestral rank obj["attributes"].append( - {**attribute, "aggregation_source": "ancestor"} + { + **attribute, + "aggregation_source": "ancestor", + "aggregation_rank": node["_source"]["taxon_rank"], + "aggregation_taxon_id": node["_source"]["taxon_id"], + } ) obj["keys"].remove(key) if obj["keys"]: diff --git a/src/genomehubs/templates/taxon.json b/src/genomehubs/templates/taxon.json index 103d5a9d..27f28cf5 100644 --- a/src/genomehubs/templates/taxon.json +++ b/src/genomehubs/templates/taxon.json @@ -280,6 +280,22 @@ "description": "Summary source (direct, ancestor, descendant)" } }, + "aggregation_rank": { + "type": "keyword", + "ignore_above": 16, + "normalizer": "lowercase", + "meta": { + "description": "Source rank for ancestor derived values" + } + }, + "aggregation_taxon_id": { + "type": "keyword", + "ignore_above": 16, + "normalizer": "lowercase", + "meta": { + "description": "Source taxon_id for ancestor derived values" + } + }, "comment": { "type": "text", "index": false From d7b3f49094ede40e12e51a12fcbb8e919567af36 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Tue, 23 Mar 2021 15:09:16 +0000 Subject: [PATCH 15/21] Use preferred values in aggregation Fixes #62 --- src/genomehubs/lib/fill.py | 43 +++++++++++++++++++++++++++++--------- src/genomehubs/lib/hub.py | 4 ++++ 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index 17b7c1b4..2231ef52 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -119,7 +119,15 @@ def stream_descendant_nodes_missing_attributes(es, *, index, attributes, root, s yield result -def apply_summary(summary, values, *, max_value=None, min_value=None): +def apply_summary( + summary, + values, + *, + primary_values=None, + summary_types=None, + max_value=None, + min_value=None +): """Apply summary statistic functions.""" summaries = { "count": len, @@ -134,6 +142,10 @@ def apply_summary(summary, values, *, max_value=None, min_value=None): "list": list, } flattened = [] + if summary == "primary": + if primary_values: + values = primary_values + summary = summary_types[0] for v in values: if isinstance(v, list): flattened += v @@ -159,9 +171,14 @@ def summarise_attribute_values( return None, None, None if "summary" in meta: value_type = "%s_value" % meta["type"] + primary_values = [] if "values" in attribute: if values is None: - values = [value[value_type] for value in attribute["values"]] + values = [] + for value in attribute["values"]: + values.append(value[value_type]) + if "is_primary_value" in value and value["is_primary_value"]: + primary_values.append(value[value_type]) else: values += [value[value_type] for value in attribute["values"]] if not values: @@ -171,16 +188,23 @@ def summarise_attribute_values( traverse_value = None if not isinstance(meta["summary"], list): meta["summary"] = [meta["summary"]] - for summary in meta["summary"]: + for index, summary in enumerate(meta["summary"]): value, max_value, min_value = apply_summary( - summary, values, max_value=max_value, min_value=min_value + summary, + values, + primary_values=primary_values, + summary_types=meta["summary"][index + 1 :] + ["median"], + max_value=max_value, + min_value=min_value, ) if idx == 0: - attribute[value_type] = value - attribute["count"] = len(values) - attribute["aggregation_method"] = summary - attribute["aggregation_source"] = "direct" - traverse_value = value + if value is not None: + attribute[value_type] = value + attribute["count"] = len(values) + attribute["aggregation_method"] = summary + attribute["aggregation_source"] = "direct" + traverse_value = value + idx += 1 elif traverse and summary == traverse: traverse_value = value if summary != "list": @@ -188,7 +212,6 @@ def summarise_attribute_values( summary = "median" else: traverse_value = list(set(traverse_value)) - idx += 1 return traverse_value, max_value, min_value return None, None, None diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index 22cff7b2..bd1d5cc0 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -506,6 +506,10 @@ def process_row(types, row): raise err taxon_data = {} taxon_types = {} + if "is_primary_value" in data["metadata"]: + data["metadata"]["is_primary_value"] = bool( + int(data["metadata"]["is_primary_value"]) + ) for attr_type in list(["attributes", "identifiers"]): if attr_type in data and data[attr_type]: ( From bfddbc4aedaabfbfca588ff941c6b53955b6a5ac Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 24 Mar 2021 09:11:37 +0000 Subject: [PATCH 16/21] include max and min in attributes --- src/genomehubs/lib/fill.py | 43 +++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index 2231ef52..e3d77dc5 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -212,6 +212,9 @@ def summarise_attribute_values( summary = "median" else: traverse_value = list(set(traverse_value)) + if isinstance(max_value, float) or isinstance(max_value, int): + attribute["max"] = max_value + attribute["min"] = min_value return traverse_value, max_value, min_value return None, None, None @@ -238,13 +241,19 @@ def summarise_attributes(*, attributes, attrs, meta, parent, parents): summary_value ) if max_value is not None: - parents[parent][node_attribute["key"]]["max"] = max( - parents[parent][node_attribute["key"]]["max"], max_value - ) + if parents[parent][node_attribute["key"]]["max"] is not None: + parents[parent][node_attribute["key"]]["max"] = max( + parents[parent][node_attribute["key"]]["max"], max_value + ) + else: + parents[parent][node_attribute["key"]]["max"] = max_value if min_value is not None: - parents[parent][node_attribute["key"]]["min"] = min( - parents[parent][node_attribute["key"]]["min"], min_value - ) + if parents[parent][node_attribute["key"]]["min"] is not None: + parents[parent][node_attribute["key"]]["min"] = min( + parents[parent][node_attribute["key"]]["min"], min_value + ) + else: + parents[parent][node_attribute["key"]]["min"] = min_value return changed, attr_dict @@ -308,13 +317,19 @@ def set_values_from_descendants( else: parents[parent][key]["values"].append(summary_value) if max_value is not None: - parents[parent][key]["max"] = max( - parents[parent][key]["max"], max_value - ) + if parents[parent][key]["max"] is not None: + parents[parent][key]["max"] = max( + parents[parent][key]["max"], max_value + ) + else: + parents[parent][key]["max"] = max_value if min_value is not None: - parents[parent][key]["min"] = min( - parents[parent][key]["min"], min_value - ) + if parents[parent][key]["min"] is not None: + parents[parent][key]["min"] = min( + parents[parent][key]["min"], min_value + ) + else: + parents[parent][key]["min"] = min_value return changed, attr_dict @@ -400,9 +415,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): # value["traverse_limit"] = set(value["traverse_limit"]) attrs = set(meta.keys()) parents = defaultdict( - lambda: defaultdict( - lambda: {"max": float("-inf"), "min": float("inf"), "values": []} - ) + lambda: defaultdict(lambda: {"max": None, "min": None, "values": []}) ) limits = defaultdict(set) if "traverse-infer-both" in opts and opts["traverse-infer-both"]: From c13edd5b9e25369d5a3c5ab1148038b1c77a6c31 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 24 Mar 2021 15:48:07 +0000 Subject: [PATCH 17/21] fix rows to imported and exceptions files --- src/genomehubs/lib/taxon.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index be30670f..b33b0a72 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -214,7 +214,7 @@ def fix_missing_ids( with_ids[created_id] = without_ids[created_id] found_ids[created_id] = True del without_ids[created_id] - if without_ids and failed_rows: + if failed_rows: for key, value in found_ids.items(): if key in failed_rows: imported_rows += failed_rows[key] @@ -670,11 +670,16 @@ def create_taxa( pbar = tqdm(total=len(data.keys())) taxon_ids = set({}) new_taxa = {} - for alt_taxon_id, rows in data.items(): + for rows in data.values(): obj = rows[0] pbar.update(1) - if "taxonomy" not in obj: + if ( + "taxonomy" not in obj + or "alt_taxon_id" not in obj["taxonomy"] + or obj["taxonomy"]["alt_taxon_id"] in blanks + ): continue + alt_taxon_id = obj["taxonomy"]["alt_taxon_id"] lineage = [] closest_rank = None closest_taxon = None @@ -684,7 +689,7 @@ def create_taxa( for index, rank in enumerate(ranks[: (max_index - 1)]): if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks: continue - if obj["taxonomy"][taxon_rank] in spellings: + if obj["taxonomy"][rank] in spellings: break intermediates = 0 for anc_rank in ranks[(index + 1) :]: From 05cad18fd5db9ac8cde0e236dc3c2eecac9b1a4b Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Thu, 25 Mar 2021 10:37:48 +0000 Subject: [PATCH 18/21] fix alt_taxon_id spellcheck import/exceptions --- src/genomehubs/lib/taxon.py | 91 +++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 24 deletions(-) diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index b33b0a72..8cf8ae14 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -658,6 +658,41 @@ def set_ranks(taxonomy): return ranks, taxon_rank +def create_new_taxon( + alt_taxon_id, + closest_taxon, + closest_rank, + lineage, + new_taxa, + taxon_ids, + obj, + matches, + taxa, + ancestors, +): + """Create a new taxon with new ancestral taxa as required.""" + if closest_taxon is not None: + for intermediate in reversed(lineage): + taxon_id = generate_ancestral_taxon_id( + intermediate["name"], + intermediate["rank"], + alt_taxon_id=alt_taxon_id, + taxon_ids=taxon_ids, + ) + new_taxon = create_descendant_taxon( + taxon_id, intermediate["rank"], intermediate["name"], closest_taxon + ) + new_taxa.update({new_taxon["_source"]["taxon_id"]: new_taxon["_source"]}) + matches[intermediate["name"]][obj["taxonomy"][closest_rank]] = taxa + closest_rank = intermediate["rank"] + closest_taxon = new_taxon + ancestors[alt_taxon_id] = closest_taxon + added_taxon = add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon) + matches[added_taxon["_source"]["scientific_name"]][ + closest_taxon["_source"]["scientific_name"] + ] = [added_taxon] + + def create_taxa( es, opts, *, taxon_template, data=None, blanks=set(["NA", "None"]), spellings=None ): @@ -678,34 +713,49 @@ def create_taxa( or "alt_taxon_id" not in obj["taxonomy"] or obj["taxonomy"]["alt_taxon_id"] in blanks ): + # row has no alt_taxon_id continue alt_taxon_id = obj["taxonomy"]["alt_taxon_id"] lineage = [] closest_rank = None closest_taxon = None + # fetch ancestral ranks and current taxon rank ranks, taxon_rank = set_ranks(obj["taxonomy"]) + if ( + taxon_rank not in obj["taxonomy"] + or obj["taxonomy"][taxon_rank] in blanks + or obj["taxonomy"][taxon_rank] in spellings + ): + # taxon name is missing or may be mis-spelled + continue max_index = len(ranks) - 1 - # max_rank = ranks[max_index] + # loop through lineage to find existing ancestral taxa for index, rank in enumerate(ranks[: (max_index - 1)]): if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks: + # row has no name at this rank continue if obj["taxonomy"][rank] in spellings: + # ancestral taxon name is missing or may be mis-spelled break intermediates = 0 + # loop through higher ranks to disambiguate name clashes for anc_rank in ranks[(index + 1) :]: if ( anc_rank not in obj["taxonomy"] or obj["taxonomy"][anc_rank] in blanks ): + # row has no name at this rank continue if ( obj["taxonomy"][rank] in matches and obj["taxonomy"][anc_rank] in matches[obj["taxonomy"][rank]] ): + # this taxon has been seen before taxa = matches[obj["taxonomy"][rank]][obj["taxonomy"][anc_rank]] ancestors.update({alt_taxon_id: taxa[0]}) break else: + #  find existing ancestral taxa within a lineage taxa = lookup_taxon_within_lineage( es, obj["taxonomy"][rank], @@ -717,10 +767,10 @@ def create_taxa( ) if taxa: if len(taxa) == 1: + #  unambiguous match to a single existing taxon ancestors.update({alt_taxon_id: taxa[0]}) matches[obj["taxonomy"][rank]][obj["taxonomy"][anc_rank]] = taxa break - # elif anc_rank == max_rank and intermediates == 0: elif intermediates == 0: taxa, name_class = lookup_taxon( es, @@ -747,32 +797,25 @@ def create_taxa( closest_taxon = matches[obj["taxonomy"][anc_rank]]["all"][0] break lineage.append({"rank": rank, "name": obj["taxonomy"][rank]}) - if closest_taxon is not None: - for intermediate in reversed(lineage): - taxon_id = generate_ancestral_taxon_id( - intermediate["name"], - intermediate["rank"], - alt_taxon_id=alt_taxon_id, - taxon_ids=taxon_ids, - ) - new_taxon = create_descendant_taxon( - taxon_id, intermediate["rank"], intermediate["name"], closest_taxon - ) - new_taxa.update( - {new_taxon["_source"]["taxon_id"]: new_taxon["_source"]} - ) - matches[intermediate["name"]][obj["taxonomy"][closest_rank]] = taxa - closest_rank = intermediate["rank"] - closest_taxon = new_taxon - ancestors[alt_taxon_id] = closest_taxon - added_taxon = add_new_taxon(alt_taxon_id, new_taxa, obj, closest_taxon) - matches[added_taxon["_source"]["scientific_name"]][ - closest_taxon["_source"]["scientific_name"] - ] = [added_taxon] + # create a new taxon if a closest ancestral taxon could be found + create_new_taxon( + alt_taxon_id, + closest_taxon, + closest_rank, + lineage, + new_taxa, + taxon_ids, + obj, + matches, + taxa, + ancestors, + ) pbar.close() + # add new taxa to the index index_stream( es, taxon_template["index_name"], stream_taxa(new_taxa), ) + # return a list of alt_taxon_ids for the created taxa return new_taxa.keys() From bde7c56aef9c35c7efae9dcea78c83224c2b97bc Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Tue, 6 Apr 2021 13:43:11 +0100 Subject: [PATCH 19/21] Set default traverse limit to class Fixes #64 --- src/genomehubs/lib/fill.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/genomehubs/lib/fill.py b/src/genomehubs/lib/fill.py index e3d77dc5..d042c554 100644 --- a/src/genomehubs/lib/fill.py +++ b/src/genomehubs/lib/fill.py @@ -6,7 +6,7 @@ Usage: genomehubs fill [--hub-name STRING] [--hub-path PATH] [--hub-version PATH] [--config-file PATH...] [--config-save PATH] - [--es-host URL...] + [--es-host URL...] [--traverse-limit STRING] [--traverse-infer-ancestors] [--traverse-infer-descendants] [--traverse-infer-both] [--traverse-threads INT] [--traverse-depth INT] [--traverse-root STRING] @@ -25,6 +25,7 @@ --traverse-infer-descendants Flag to enable tree traversal from root to tips. --traverse-infer-both Flag to enable tree traversal from tips to root and back to tips. + --traverse-limit STRING Maximum rank to ascend to during traversal. [Default: class] --traverse-root ID Root taxon id for tree traversal. --traverse-threads INT Number of threads to use for tree traversal. [Default: 1] --traverse-weight STRING Weighting scheme for setting values during tree @@ -265,6 +266,7 @@ def set_values_from_descendants( taxon_id, parent, taxon_rank, + traverse_limit, parents, descendant_ranks=None, attr_dict=None, @@ -284,7 +286,7 @@ def set_values_from_descendants( traverseable = False if not traverseable or taxon_id in limits[key]: continue - traverse_limit = meta[key].get("traverse_limit", None) + traverse_limit = meta[key].get("traverse_limit", traverse_limit) if traverse_limit: if ( descendant_ranks is not None @@ -333,7 +335,7 @@ def set_values_from_descendants( return changed, attr_dict -def set_attributes_to_descend(meta): +def set_attributes_to_descend(meta, traverse_limit): """Set which attributes should have values inferred from ancestral taxa.""" desc_attrs = set() desc_attr_limits = {} @@ -346,6 +348,8 @@ def set_attributes_to_descend(meta): desc_attrs.add(key) if "traverse_limit" in value: desc_attr_limits.update({key: value["traverse_limit"]}) + else: + desc_attr_limits.update({key: traverse_limit}) return desc_attrs, desc_attr_limits @@ -408,18 +412,15 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): ) root_depth = max_depth meta = template["types"]["attributes"] - # for key, value in meta.items(): - # if "traverse_limit" in value: - # if not isinstance(value["traverse_limit"], list): - # value["traverse_limit"] = [value["traverse_limit"]] - # value["traverse_limit"] = set(value["traverse_limit"]) attrs = set(meta.keys()) parents = defaultdict( lambda: defaultdict(lambda: {"max": None, "min": None, "values": []}) ) limits = defaultdict(set) if "traverse-infer-both" in opts and opts["traverse-infer-both"]: - desc_attrs, desc_attr_limits = set_attributes_to_descend(meta) + desc_attrs, desc_attr_limits = set_attributes_to_descend( + meta, opts["traverse-limit"] + ) missing_attributes = defaultdict(dict) descendant_ranks = defaultdict(set) else: @@ -459,6 +460,7 @@ def traverse_from_tips(es, opts, *, template, root=None, max_depth=None): parents=parents, descendant_ranks=descendant_ranks, taxon_rank=node["_source"]["taxon_rank"], + traverse_limit=opts["traverse-limit"], attr_dict=attr_dict, limits=limits, ) From 32f55ab793c66fc3ee84b2c8c50203d4ba91ad2e Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 7 Apr 2021 09:21:24 +0100 Subject: [PATCH 20/21] Write taxon IDs to imported/exceptions files Fixes #66 --- src/genomehubs/lib/hub.py | 128 ++++++++++++++++++++++++++---------- src/genomehubs/lib/index.py | 39 ++++++----- src/genomehubs/lib/taxon.py | 22 ++++--- 3 files changed, 132 insertions(+), 57 deletions(-) diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index bd1d5cc0..f0c3596f 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 """Hub functions.""" +import csv import os import re import sys +from collections import defaultdict from copy import deepcopy from pathlib import Path @@ -422,6 +424,36 @@ def add_attribute_values(existing, new, *, raw=True): ) +def strip_comments(data, types): + """Strip comment lines from a file stream.""" + comment_chars = {"#"} + if "file" in types and "comment" in types["file"]: + comment_chars.update(set(types["file"]["comment"])) + for row in data: + if row[0] in comment_chars: + continue + yield row + + +def process_names_file(types, names_file): + """Process a taxon names file.""" + data = tofile.open_file_handle(names_file) + names = defaultdict(dict) + if data is None: + return names + delimiters = {"csv": ",", "tsv": "\t"} + rows = csv.reader( + strip_comments(data, types), + delimiter=delimiters[types["file"]["format"]], + quotechar='"', + ) + next(rows) + for row in rows: + name = row[3] if len(row) > 3 else row[1] + names[row[2]][row[1]] = {"name": name, "taxon_id": row[0]} + return names + + def validate_types_file(types_file, dir_path): """Validate types file.""" try: @@ -441,7 +473,8 @@ def validate_types_file(types_file, dir_path): defaults["metadata"].update({key: value}) types.update({"defaults": defaults}) data = tofile.open_file_handle(Path(dir_path) / types["file"]["name"]) - return types, data + names = process_names_file(types, Path(dir_path) / "names" / types["file"]["name"]) + return types, data, names def set_xrefs(taxon_names, types, row, *, meta=None): @@ -460,16 +493,8 @@ def set_xrefs(taxon_names, types, row, *, meta=None): return names -def process_row(types, row): - """Process a row of data.""" - data = { - "attributes": {}, - "identifiers": {}, - "metadata": {}, - "taxon_names": {}, - "taxonomy": {}, - "taxon_attributes": {}, - } +def set_row_defaults(types, data): + """Set default values for a row.""" for key in types["defaults"].keys(): if key in types: for entry in types[key].values(): @@ -479,6 +504,10 @@ def process_row(types, row): } elif key == "metadata": data["metadata"] = {**types["defaults"]["metadata"]} + + +def process_row_values(row, types, data): + """Process row values.""" for group in data.keys(): if group in types: for key, meta in types[group].items(): @@ -504,6 +533,20 @@ def process_row(types, row): except Exception as err: LOGGER.warning("Cannot parse row '%s'" % str(row)) raise err + + +def process_row(types, names, row): + """Process a row of data.""" + data = { + "attributes": {}, + "identifiers": {}, + "metadata": {}, + "taxon_names": {}, + "taxonomy": {}, + "taxon_attributes": {}, + } + set_row_defaults(types, data) + process_row_values(row, types, data) taxon_data = {} taxon_types = {} if "is_primary_value" in data["metadata"]: @@ -524,10 +567,18 @@ def process_row(types, row): ) else: data[attr_type] = [] - if "taxon_names" in data and data["taxon_names"]: + if data["taxon_names"]: data["taxon_names"] = set_xrefs( data["taxon_names"], types["taxon_names"], row, meta=data["metadata"] ) + if data["taxonomy"] and names: + for key in names.keys(): + if key in data["taxonomy"]: + if data["taxonomy"][key] in names[key]: + data["taxonomy"]["taxon_id"] = names[key][data["taxonomy"][key]][ + "taxon_id" + ] + data["taxonomy"][key] = names[key][data["taxonomy"][key]]["name"] return data, taxon_data, taxon_types.get("attributes", {}) @@ -571,23 +622,18 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"): tofile.write_file(outfile, data) -def write_spellchecked_taxa(spellings, opts, *, types, header=None): +def write_spellchecked_taxa(spellings, opts, *, types): """Write spellchecked taxa to file.""" - imported = [] exceptions = [] file_key = "%s-exception" % opts["index"] dir_key = "%s-dir" % opts["index"] filepath = Path(types["file"]["name"]) extensions = "".join(filepath.suffixes) file_basename = str(filepath).replace(extensions, "") - for name, matches in spellings.items(): - # enable test condition below if importing spellchecked taxa: - # if len(matches) == 1: - # imported.append([name, matches[0]]) - # else: - exceptions.append([name] + matches) - if imported: - label = "imported" + for name, obj in spellings.items(): + exceptions.append([obj["taxon_id"], name, obj["rank"]] + obj["matches"]) + if exceptions: + label = "exceptions" if file_key in opts and opts[file_key]: outdir = opts[file_key] else: @@ -595,24 +641,40 @@ def write_spellchecked_taxa(spellings, opts, *, types, header=None): os.makedirs(outdir, exist_ok=True) outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename) LOGGER.info( - "Writing %d spelling corrections to %s file '%s'", - len(imported), + "Writing %d spelling suggestions to %s file '%s'", + len(exceptions), label, outfile, ) - tofile.write_file(outfile, [["input", "corrected"]] + imported) - if exceptions: - label = "exceptions" + tofile.write_file( + outfile, [["taxon_id", "input", "rank", "suggested"]] + exceptions + ) + + +def write_imported_taxa(taxa, opts, *, types): + """Write imported taxa to file.""" + imported = [] + file_key = "%s-exception" % opts["index"] + dir_key = "%s-dir" % opts["index"] + filepath = Path(types["file"]["name"]) + extensions = "".join(filepath.suffixes) + file_basename = str(filepath).replace(extensions, "") + for name, arr in taxa.items(): + prefix = "#" if len(arr) > 1 else "" + for obj in arr: + imported.append( + ["%s%s" % (prefix, str(obj["taxon_id"])), name, obj["rank"]] + ) + if imported: if file_key in opts and opts[file_key]: outdir = opts[file_key] else: - outdir = "%s/%s" % (opts[dir_key], label) + outdir = "%s/imported" % opts[dir_key] os.makedirs(outdir, exist_ok=True) - outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename) + outfile = "%s/%s" % (outdir, "%s.taxon_ids.tsv" % file_basename) LOGGER.info( - "Writing %d spelling suggestions to %s file '%s'", - len(exceptions), - label, + "Writing %d taxon_ids to imported file '%s'", + len(imported), outfile, ) - tofile.write_file(outfile, [["input", "suggested"]] + exceptions) + tofile.write_file(outfile, [["taxon_id", "input", "rank"]] + imported) diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py index 32ccea37..75766801 100644 --- a/src/genomehubs/lib/index.py +++ b/src/genomehubs/lib/index.py @@ -72,8 +72,10 @@ from .files import index_metadata from .hub import process_row from .hub import set_column_indices +from .hub import strip_comments from .hub import validate_types_file from .hub import write_imported_rows +from .hub import write_imported_taxa from .hub import write_spellchecked_taxa from .taxon import add_names_and_attributes_to_taxa from .taxon import fix_missing_ids @@ -87,18 +89,19 @@ def not_blank(key, obj, blanks): return key in obj and obj[key] and obj[key] not in blanks -def strip_comments(data, types): - """Strip comment lines from a file stream.""" - comment_chars = {"#"} - if "file" in types and "comment" in types["file"]: - comment_chars.update(set(types["file"]["comment"])) - for row in data: - if row[0] in comment_chars: - continue - yield row +def summarise_imported_taxa(docs, imported_taxa): + """Summarise taxon imformation from a stram of taxon docs.""" + for entry_id, entry in docs: + imported_taxa[entry["scientific_name"]].append( + { + "taxon_id": entry["taxon_id"], + "rank": entry["taxon_rank"], + } + ) + yield entry_id, entry -def index_file(es, types, data, opts): +def index_file(es, types, names, data, opts): """Index a file.""" delimiters = {"csv": ",", "tsv": "\t"} rows = csv.reader( @@ -123,7 +126,9 @@ def index_file(es, types, data, opts): LOGGER.info("Processing rows") for row in tqdm(rows): try: - processed_data, taxon_data, new_taxon_types = process_row(types, row) + processed_data, taxon_data, new_taxon_types = process_row( + types, names, row + ) except Exception as err: print(err) failed_rows["None"].append(row) @@ -179,7 +184,7 @@ def index_file(es, types, data, opts): header=header, spellings=spellings, ) - write_spellchecked_taxa(spellings, opts, types=types, header=header) + write_spellchecked_taxa(spellings, opts, types=types) if with_ids or create_ids: write_imported_rows( imported_rows, opts, types=types, header=header, label="imported" @@ -189,12 +194,14 @@ def index_file(es, types, data, opts): docs = add_names_and_attributes_to_taxa( es, dict(with_ids), opts, template=taxon_template, blanks=blanks ) + imported_taxa = defaultdict(list) index_stream( es, taxon_template["index_name"], - docs, + summarise_imported_taxa(docs, imported_taxa), _op_type="update", ) + write_imported_taxa(imported_taxa, opts, types=types) elif opts["index"] == "assembly": # TODO: keep track of taxon_id not found exceptions assembly_template = assembly.index_template(taxonomy_name, opts) @@ -244,22 +251,24 @@ def main(args): if data_dir in options["index"]: dir_path = options["index"][data_dir] for types_file in sorted(Path(dir_path).glob("*.names.yaml")): - types, data = validate_types_file(types_file, dir_path) + types, data, names = validate_types_file(types_file, dir_path) LOGGER.info("Indexing %s" % types["file"]["name"]) index_types(es, index, types, options["index"]) index_file( es, types, + names, data, {**options["index"], "index": index, "index_types": index_types}, ) for types_file in sorted(Path(dir_path).glob("*.types.yaml")): - types, data = validate_types_file(types_file, dir_path) + types, data, names = validate_types_file(types_file, dir_path) LOGGER.info("Indexing %s" % types["file"]["name"]) index_types(es, index, types, options["index"]) index_file( es, types, + names, data, {**options["index"], "index": index, "index_types": index_types}, ) diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index 8cf8ae14..ba93a25f 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -451,10 +451,11 @@ def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, retur if option.get("collate_match", False) ] except KeyError: - return None + return None, rank, None except ValueError: - return None - if matches and len(matches) > 1: + return None, rank, None + taxon_id = None + if matches: taxon_matches = {} scientific_name = None for match in matches: @@ -466,14 +467,15 @@ def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, retur es, body, index, taxonomy_index_template, opts, return_type="taxon" ) if len(taxa) > 1: - return matches + return None, rank, matches for taxon in taxa: source = taxon["_source"] - taxon_matches[source["taxon_id"]] = source["scientific_name"] + taxon_id = source["taxon_id"] + taxon_matches[taxon_id] = source["scientific_name"] scientific_name = source["scientific_name"] if len(taxon_matches.keys()) == 1: - return [scientific_name] - return matches + return taxon_id, rank, [scientific_name] + return None, rank, matches def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type): @@ -523,11 +525,13 @@ def lookup_taxon( if name_class in {"any", "spellcheck"}: body.update({"id": "taxon_by_any_name"}) if name_class == "spellcheck": - matches = spellcheck_taxon( + taxon_id, rank, matches = spellcheck_taxon( es, name, index, rank, taxonomy_index_template, opts, return_type ) if matches: - spellings.update({name: matches}) + spellings.update( + {name: {"matches": matches, "taxon_id": taxon_id, "rank": rank}} + ) return [], name_class # Uncomment code blow to use suggestion in current import # if matches and len(matches) == 1: From 073702eec6b19e9b6b8b31e5dec881d42c5da9f7 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 7 Apr 2021 16:44:18 +0100 Subject: [PATCH 21/21] add in memory taxon name lookup Fixes #68 --- src/genomehubs/lib/hub.py | 42 ++-- src/genomehubs/lib/index.py | 227 +++++++++--------- src/genomehubs/lib/taxon.py | 192 ++++++++++++--- .../templates/scripts/taxon_names.json | 18 ++ .../scripts/taxon_names_by_root.json | 26 ++ 5 files changed, 345 insertions(+), 160 deletions(-) create mode 100644 src/genomehubs/templates/scripts/taxon_names.json create mode 100644 src/genomehubs/templates/scripts/taxon_names_by_root.json diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py index f0c3596f..a068f9a1 100644 --- a/src/genomehubs/lib/hub.py +++ b/src/genomehubs/lib/hub.py @@ -624,31 +624,31 @@ def write_imported_rows(rows, opts, *, types, header=None, label="imported"): def write_spellchecked_taxa(spellings, opts, *, types): """Write spellchecked taxa to file.""" - exceptions = [] - file_key = "%s-exception" % opts["index"] dir_key = "%s-dir" % opts["index"] filepath = Path(types["file"]["name"]) extensions = "".join(filepath.suffixes) file_basename = str(filepath).replace(extensions, "") - for name, obj in spellings.items(): - exceptions.append([obj["taxon_id"], name, obj["rank"]] + obj["matches"]) - if exceptions: - label = "exceptions" - if file_key in opts and opts[file_key]: - outdir = opts[file_key] - else: - outdir = "%s/%s" % (opts[dir_key], label) - os.makedirs(outdir, exist_ok=True) - outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename) - LOGGER.info( - "Writing %d spelling suggestions to %s file '%s'", - len(exceptions), - label, - outfile, - ) - tofile.write_file( - outfile, [["taxon_id", "input", "rank", "suggested"]] + exceptions - ) + dirs = { + "spellcheck": "exceptions", + "synonym": "imported", + } + for group in dirs.keys(): + taxa = [] + for name, obj in spellings[group].items(): + taxa.append([obj["taxon_id"], name, obj["rank"]] + obj["matches"]) + if taxa: + outdir = "%s/%s" % (opts[dir_key], dirs[group]) + os.makedirs(outdir, exist_ok=True) + outfile = "%s/%s" % (outdir, "%s.spellcheck.tsv" % file_basename) + LOGGER.info( + "Writing %d %s suggestions to spellcheck file '%s'", + len(taxa), + group, + outfile, + ) + tofile.write_file( + outfile, [["taxon_id", "input", "rank", "suggested"]] + taxa + ) def write_imported_taxa(taxa, opts, *, types): diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py index 75766801..1b18840d 100644 --- a/src/genomehubs/lib/index.py +++ b/src/genomehubs/lib/index.py @@ -9,7 +9,9 @@ [--es-host URL...] [--assembly-dir PATH] [--assembly-repo URL] [--assembly-exception PATH] [--taxon-dir PATH] [--taxon-repo URL] [--taxon-exception PATH] - [--taxon-lookup STRING] [--taxon-spellcheck] + [--taxon-lookup STRING] [--taxon-lookup-root STRING] + [--taxon-lookup-in-memory] + [--taxon-spellcheck] [--file PATH...] [file-dir PATH...] [--remote-file URL...] [--remote-file-dir URL...] [--taxon-id STRING] [--assembly-id STRING] [--analysis-id STRING] @@ -27,7 +29,9 @@ --assembly-repo URL Remote git repository containing assembly-level data. Optionally include `~branch-name` suffix. --assembly-exception PATH Path to directory to write assembly data that failed to import. + --taxon-lookup-root STRING Root taxon Id for in-memory lookup. --taxon-lookup STRING Taxon name class to lookup (scientific|any). [Default: scientific] + --taxon-lookup-in-memory Flag to use in-memory taxon name lookup. --taxon-spellcheck Flag to use fuzzy matching to match taxon names. --taxon-dir PATH Path to directory containing taxon-level data. --taxon-repo URL Remote git repository containing taxon-level data. @@ -79,6 +83,7 @@ from .hub import write_spellchecked_taxa from .taxon import add_names_and_attributes_to_taxa from .taxon import fix_missing_ids +from .taxon import load_taxon_table from .version import __version__ LOGGER = tolog.logger(__name__) @@ -101,7 +106,7 @@ def summarise_imported_taxa(docs, imported_taxa): yield entry_id, entry -def index_file(es, types, names, data, opts): +def index_file(es, types, names, data, opts, *, taxon_table=None): """Index a file.""" delimiters = {"csv": ",", "tsv": "\t"} rows = csv.reader( @@ -121,118 +126,113 @@ def index_file(es, types, names, data, opts): imported_rows = [] blanks = set(["", "NA", "N/A", "None"]) taxon_types = {} - for taxonomy_name in opts["taxonomy-source"]: - taxon_template = taxon.index_template(taxonomy_name, opts) - LOGGER.info("Processing rows") - for row in tqdm(rows): - try: - processed_data, taxon_data, new_taxon_types = process_row( - types, names, row + taxonomy_name = opts["taxonomy-source"][0] + taxon_template = taxon.index_template(taxonomy_name, opts) + LOGGER.info("Processing rows") + for row in tqdm(rows): + try: + processed_data, taxon_data, new_taxon_types = process_row(types, names, row) + except Exception as err: + print(err) + failed_rows["None"].append(row) + continue + taxon_types.update(new_taxon_types) + if not_blank("taxon_id", processed_data["taxonomy"], blanks): + with_ids[processed_data["taxonomy"]["taxon_id"]].append(processed_data) + taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append(taxon_data) + imported_rows.append(row) + else: + if "taxonomy" in types and not_blank( + "alt_taxon_id", processed_data["taxonomy"], blanks + ): + without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append( + processed_data ) - except Exception as err: - print(err) - failed_rows["None"].append(row) - continue - taxon_types.update(new_taxon_types) - if not_blank("taxon_id", processed_data["taxonomy"], blanks): - with_ids[processed_data["taxonomy"]["taxon_id"]].append(processed_data) - taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append( + taxon_asm_data[processed_data["taxonomy"]["alt_taxon_id"]].append( taxon_data ) - imported_rows.append(row) - else: - if "taxonomy" in types and not_blank( - "alt_taxon_id", processed_data["taxonomy"], blanks - ): - without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append( - processed_data - ) - taxon_asm_data[processed_data["taxonomy"]["alt_taxon_id"]].append( - taxon_data - ) - failed_rows[processed_data["taxonomy"]["alt_taxon_id"]].append(row) - elif not_blank("subspecies", processed_data["taxonomy"], blanks): - without_ids[processed_data["taxonomy"]["subspecies"]].append( - processed_data - ) - taxon_asm_data[processed_data["taxonomy"]["subspecies"]].append( - taxon_data - ) - failed_rows[processed_data["taxonomy"]["subspecies"]].append(row) - elif not_blank("species", processed_data["taxonomy"], blanks): - without_ids[processed_data["taxonomy"]["species"]].append( - processed_data - ) - taxon_asm_data[processed_data["taxonomy"]["species"]].append( - taxon_data - ) - failed_rows[processed_data["taxonomy"]["species"]].append(row) - else: - failed_rows["None"].append(row) - LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys())) - spellings = {} - create_ids, without_ids = fix_missing_ids( - es, - opts, - without_ids, - types=types, - taxon_template=taxon_template, - failed_rows=failed_rows, - imported_rows=imported_rows, - with_ids=with_ids, - blanks=blanks, - header=header, - spellings=spellings, - ) - write_spellchecked_taxa(spellings, opts, types=types) - if with_ids or create_ids: - write_imported_rows( - imported_rows, opts, types=types, header=header, label="imported" - ) - LOGGER.info("Indexing %d entries", len(with_ids.keys())) - if opts["index"] == "taxon": - docs = add_names_and_attributes_to_taxa( - es, dict(with_ids), opts, template=taxon_template, blanks=blanks + failed_rows[processed_data["taxonomy"]["alt_taxon_id"]].append(row) + elif not_blank("subspecies", processed_data["taxonomy"], blanks): + without_ids[processed_data["taxonomy"]["subspecies"]].append( + processed_data ) - imported_taxa = defaultdict(list) - index_stream( - es, - taxon_template["index_name"], - summarise_imported_taxa(docs, imported_taxa), - _op_type="update", - ) - write_imported_taxa(imported_taxa, opts, types=types) - elif opts["index"] == "assembly": - # TODO: keep track of taxon_id not found exceptions - assembly_template = assembly.index_template(taxonomy_name, opts) - docs = add_identifiers_and_attributes_to_assemblies( - es, - with_ids, - opts, - template=assembly_template, - taxon_template=taxon_template, - blanks=blanks, - ) - index_stream(es, assembly_template["index_name"], docs) - # index taxon-level attributes - index_types( - es, - "taxon", - {"attributes": taxon_types}, - opts, - ) - taxon_asm_with_ids = { - taxon_id: taxon_asm_data[taxon_id] for taxon_id in with_ids.keys() - } - taxon_docs = add_names_and_attributes_to_taxa( - es, taxon_asm_with_ids, opts, template=taxon_template, blanks=blanks + taxon_asm_data[processed_data["taxonomy"]["subspecies"]].append( + taxon_data ) - index_stream( - es, - taxon_template["index_name"], - taxon_docs, - _op_type="update", + failed_rows[processed_data["taxonomy"]["subspecies"]].append(row) + elif not_blank("species", processed_data["taxonomy"], blanks): + without_ids[processed_data["taxonomy"]["species"]].append( + processed_data ) + taxon_asm_data[processed_data["taxonomy"]["species"]].append(taxon_data) + failed_rows[processed_data["taxonomy"]["species"]].append(row) + else: + failed_rows["None"].append(row) + LOGGER.info("Found taxon IDs in %d entries", len(with_ids.keys())) + spellings = {"spellcheck": {}, "synonym": {}} + create_ids, without_ids = fix_missing_ids( + es, + opts, + without_ids, + types=types, + taxon_template=taxon_template, + failed_rows=failed_rows, + imported_rows=imported_rows, + with_ids=with_ids, + blanks=blanks, + header=header, + spellings=spellings, + taxon_table=taxon_table, + ) + write_spellchecked_taxa(spellings, opts, types=types) + if with_ids or create_ids: + write_imported_rows( + imported_rows, opts, types=types, header=header, label="imported" + ) + LOGGER.info("Indexing %d entries", len(with_ids.keys())) + if opts["index"] == "taxon": + docs = add_names_and_attributes_to_taxa( + es, dict(with_ids), opts, template=taxon_template, blanks=blanks + ) + imported_taxa = defaultdict(list) + index_stream( + es, + taxon_template["index_name"], + summarise_imported_taxa(docs, imported_taxa), + _op_type="update", + ) + write_imported_taxa(imported_taxa, opts, types=types) + elif opts["index"] == "assembly": + # TODO: keep track of taxon_id not found exceptions + assembly_template = assembly.index_template(taxonomy_name, opts) + docs = add_identifiers_and_attributes_to_assemblies( + es, + with_ids, + opts, + template=assembly_template, + taxon_template=taxon_template, + blanks=blanks, + ) + index_stream(es, assembly_template["index_name"], docs) + # index taxon-level attributes + index_types( + es, + "taxon", + {"attributes": taxon_types}, + opts, + ) + taxon_asm_with_ids = { + taxon_id: taxon_asm_data[taxon_id] for taxon_id in with_ids.keys() + } + taxon_docs = add_names_and_attributes_to_taxa( + es, taxon_asm_with_ids, opts, template=taxon_template, blanks=blanks + ) + index_stream( + es, + taxon_template["index_name"], + taxon_docs, + _op_type="update", + ) def main(args): @@ -246,6 +246,14 @@ def main(args): with tolog.DisableLogger(): hub.post_search_scripts(es) + taxonomy_name = options["index"]["taxonomy-source"][0] + taxon_table = None + if taxon_table is None and "taxon-lookup-in-memory" in options["index"]: + taxon_table = { + "scientific": defaultdict(list), + "any": defaultdict(list), + } + load_taxon_table(es, options["index"], taxonomy_name, taxon_table) for index in list(["taxon", "assembly"]): data_dir = "%s-dir" % index if data_dir in options["index"]: @@ -260,6 +268,7 @@ def main(args): names, data, {**options["index"], "index": index, "index_types": index_types}, + taxon_table=taxon_table, ) for types_file in sorted(Path(dir_path).glob("*.types.yaml")): types, data, names = validate_types_file(types_file, dir_path) @@ -271,9 +280,9 @@ def main(args): names, data, {**options["index"], "index": index, "index_types": index_types}, + taxon_table=taxon_table, ) # TODO: #29 Implement alternate backbone taxonomies - taxonomy_name = options["index"]["taxonomy-source"][0] if "file" in options["index"]: index_files(es, options["index"]["file"], taxonomy_name, options["index"]) elif "file-metadata" in options["index"]: diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py index ba93a25f..abb64846 100644 --- a/src/genomehubs/lib/taxon.py +++ b/src/genomehubs/lib/taxon.py @@ -12,6 +12,7 @@ from .es_functions import document_by_id from .es_functions import index_stream from .es_functions import query_value_template +from .es_functions import stream_template_search_results from .hub import add_attribute_values from .hub import chunks from .hub import index_templator @@ -76,13 +77,20 @@ def lookup_taxa_by_taxon_id(es, values, template, *, return_type="list"): def lookup_missing_taxon_ids( - es, without_ids, opts, *, with_ids=None, blanks=set(["NA", "None"]), spellings=None + es, + without_ids, + opts, + *, + with_ids=None, + blanks=set(["NA", "None"]), + spellings=None, + taxon_table=None, ): """Lookup taxon ID based on available taxonomic information.""" if with_ids is None: with_ids = {} if spellings is None: - spellings = {} + spellings = {"spellcheck": {}, "synonym": {}} # TODO: set this list from types file ranks = [ "subspecies", @@ -104,21 +112,39 @@ def lookup_missing_taxon_ids( for index, rank in enumerate(ranks): if rank not in obj["taxonomy"] or obj["taxonomy"][rank] in blanks: continue - taxon_ids, name_class = lookup_taxon( - es, obj["taxonomy"][rank], opts, rank=rank, spellings=spellings + taxa, name_class = lookup_taxon( + es, + obj["taxonomy"][rank], + opts, + rank=rank, + return_type="taxon", + spellings=spellings, + taxon_table=taxon_table, ) - if index == 1 and not taxon_ids: + if index == 1 and not taxa: break - if len(taxon_ids) == 1: - if taxon_ids[0] in with_ids: - with_ids[taxon_ids[0]].append(obj) + if len(taxa) == 1: + obj.update({"input_name": obj["taxonomy"][rank]}) + taxon = taxa[0]["_source"] + if obj["taxonomy"][rank] != taxon["scientific_name"]: + spellings["synonym"].update( + { + obj["taxonomy"][rank]: { + "matches": [taxon["scientific_name"]], + "taxon_id": taxon["taxon_id"], + "rank": rank, + } + } + ) + if taxon["taxon_id"] in with_ids: + with_ids[taxon["taxon_id"]].append(obj) else: obj["attributes"] = [obj["attributes"]] - with_ids[taxon_ids[0]] = [obj] + with_ids[taxon["taxon_id"]] = [obj] LOGGER.debug( "Matched %s with taxon_id %s", obj["taxonomy"][rank], - taxon_ids[0], + taxon["taxon_id"], ) found_keys.append(key) break @@ -170,6 +196,58 @@ def lookup_missing_taxon_ids( return with_ids, without_ids, found_ids +def stream_taxon_names(es, *, index, root=None, size=1000): + """Get entries by depth of root taxon.""" + if root is not None: + body = { + "id": "taxon_names_by_root", + "params": {"root": root}, + } + return stream_template_search_results(es, index=index, body=body, size=size) + body = { + "id": "taxon_names", + "params": {}, + } + return stream_template_search_results(es, index=index, body=body) + + +def load_taxon_table(es, opts, taxonomy_name, taxon_table): + """Load all taxa into memory for taxon name lookup and spellcheck.""" + LOGGER.info("Loading taxa into memory for taxon name lookup") + taxon_template = index_template(taxonomy_name, opts) + root = None + if "taxon-lookup-root" in opts: + root = opts["taxon-lookup-root"] + for node in tqdm( + stream_taxon_names(es, index=taxon_template["index_name"], root=root) + ): + lineage = {} + node_names = set() + try: + if "attributes" in node["_source"]: + attributes = node["_source"]["attributes"] + else: + attributes = [] + for anc in node["_source"]["lineage"]: + lineage.update({anc["taxon_rank"]: anc["scientific_name"]}) + taxon = { + "taxon_id": node["_source"]["taxon_id"], + "taxon_rank": node["_source"]["taxon_rank"], + "scientific_name": node["_source"]["scientific_name"], + "lineage": lineage, + "attributes": attributes, + } + taxon_table["scientific"][node["_source"]["scientific_name"]].append(taxon) + taxon_table["any"][node["_source"]["scientific_name"]].append(taxon) + node_names.add(node["_source"]["scientific_name"]) + for obj in node["_source"]["taxon_names"]: + if obj["name"] not in node_names: + node_names.add(obj["name"]) + taxon_table["any"][obj["name"]].append(taxon) + except KeyError: + pass + + def fix_missing_ids( es, opts, @@ -183,17 +261,24 @@ def fix_missing_ids( blanks=set(["NA", "None"]), header=None, spellings=None, + taxon_table=None, ): """Find or create taxon IDs for rows without.""" if with_ids is None: with_ids = {} if spellings is None: - spellings = {} + spellings = {"spellcheck": {}, "synonym": {}} if without_ids: # TODO: support multiple taxonomies LOGGER.info("Looking up %d missing taxon IDs", len(without_ids.keys())) with_ids, without_ids, found_ids = lookup_missing_taxon_ids( - es, without_ids, opts, with_ids=with_ids, blanks=blanks, spellings=spellings + es, + without_ids, + opts, + with_ids=with_ids, + blanks=blanks, + spellings=spellings, + taxon_table=taxon_table, ) # create new taxon IDs if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]: @@ -436,7 +521,7 @@ def spellcheck_taxon(es, name, index, rank, taxonomy_index_template, opts, retur """Look up taxon name with fuzzy matching.""" taxon_suggest = { "id": "taxon_suggest", - "params": {"searchTerm": name, "max_errors": 3}, + "params": {"searchTerm": name}, } matches = None with tolog.DisableLogger(): @@ -503,19 +588,17 @@ def taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type): return taxa -def lookup_taxon( +def lookup_taxon_in_index( es, name, opts, *, - rank=None, - name_class="scientific", - return_type="taxon_id", - spellings=None, + rank, + name_class, + return_type, + spellings, ): - """Lookup taxon ID.""" - if spellings is None: - spellings = {} + """Lookup taxon in Elasticsearch index.""" template = index_template(opts["taxonomy-source"][0], opts) index = template["index_name"] body = { @@ -529,16 +612,63 @@ def lookup_taxon( es, name, index, rank, taxonomy_index_template, opts, return_type ) if matches: - spellings.update( + spellings["spellcheck"].update( {name: {"matches": matches, "taxon_id": taxon_id, "rank": rank}} ) return [], name_class - # Uncomment code blow to use suggestion in current import - # if matches and len(matches) == 1: - # body["params"].update({"taxon": matches[0]}) - # else: - # return [], name_class taxa = taxon_lookup(es, body, index, taxonomy_index_template, opts, return_type) + return taxa + + +def lookup_taxon_in_memory( + name, opts, *, rank, name_class, return_type, spellings, taxon_table +): + """Lookup taxon in memory.""" + taxa = [] + if name_class in taxon_table: + if name in taxon_table[name_class]: + for obj in taxon_table[name_class][name]: + if return_type == "taxon_id": + taxa.append(obj["taxon_id"]) + else: + taxa.append({"_source": {**obj}}) + return taxa + + +def lookup_taxon( + es, + name, + opts, + *, + rank=None, + name_class="scientific", + return_type="taxon_id", + spellings=None, + taxon_table=None, +): + """Lookup taxon ID.""" + if spellings is None: + spellings = {"spellcheck": {}, "synonym": {}} + if taxon_table is None or name_class == "spellcheck": + taxa = lookup_taxon_in_index( + es, + name, + opts, + rank=rank, + name_class=name_class, + return_type=return_type, + spellings=spellings, + ) + else: + taxa = lookup_taxon_in_memory( + name, + opts, + rank=rank, + name_class=name_class, + return_type=return_type, + spellings=spellings, + taxon_table=taxon_table, + ) if ( not taxa and opts["taxon-lookup"] == "any" @@ -552,6 +682,7 @@ def lookup_taxon( name_class="any", return_type=return_type, spellings=spellings, + taxon_table=taxon_table, ) if ( not taxa @@ -567,6 +698,7 @@ def lookup_taxon( name_class="spellcheck", return_type=return_type, spellings=spellings, + taxon_table=taxon_table, ) return taxa, name_class @@ -702,8 +834,7 @@ def create_taxa( ): """Create new taxa using alternate taxon IDs.""" if spellings is None: - spellings = {} - + spellings = {"spellcheck": {}, "synonym": {}} ancestors = {} matches = defaultdict(dict) pbar = tqdm(total=len(data.keys())) @@ -759,7 +890,8 @@ def create_taxa( ancestors.update({alt_taxon_id: taxa[0]}) break else: - #  find existing ancestral taxa within a lineage + # find existing ancestral taxa within a lineage + # TODO: make an in memory version of this lookup taxa = lookup_taxon_within_lineage( es, obj["taxonomy"][rank], diff --git a/src/genomehubs/templates/scripts/taxon_names.json b/src/genomehubs/templates/scripts/taxon_names.json new file mode 100644 index 00000000..19c9c128 --- /dev/null +++ b/src/genomehubs/templates/scripts/taxon_names.json @@ -0,0 +1,18 @@ +{ + "script": { + "lang": "mustache", + "source": { + "query": { + "match_all": {} + }, + "_source": [ + "taxon_id", + "taxon_rank", + "scientific_name", + "lineage.*", + "taxon_names.*", + "attributes" + ] + } + } +} diff --git a/src/genomehubs/templates/scripts/taxon_names_by_root.json b/src/genomehubs/templates/scripts/taxon_names_by_root.json new file mode 100644 index 00000000..0fc44f7b --- /dev/null +++ b/src/genomehubs/templates/scripts/taxon_names_by_root.json @@ -0,0 +1,26 @@ +{ + "script": { + "lang": "mustache", + "source": { + "query": { + "nested": { + "path": "lineage", + "query": { + "multi_match": { + "query": "{{root}}", + "fields": ["lineage.taxon_id", "lineage.scientific_name"] + } + } + } + }, + "_source": [ + "taxon_id", + "taxon_rank", + "scientific_name", + "lineage.*", + "taxon_names.*", + "attributes" + ] + } + } +}