diff --git a/src/genomehubs/lib/attributes.py b/src/genomehubs/lib/attributes.py
index aefc5e61..0d191e91 100644
--- a/src/genomehubs/lib/attributes.py
+++ b/src/genomehubs/lib/attributes.py
@@ -37,18 +37,63 @@ def index(es, group, attributes, opts, *, index_type="attribute"):
     return template, stream
 
 
+# def fetch_types(es, opts):
+#     """Fetch all existing types."""
+#     template = index_template(opts, index_type="attribute")
+#     body = {
+#         "id": "attribute_types",
+#         "params": {},
+#     }
+#     entries = stream_template_search_results(
+#         es, index=template["index_name"], body=body
+#     )
+#     return {entry["key"]: entry for entry in entries}
+
+
+def add_attribute_sources(name, obj, attributes):
+    """Generate a list of attribute sources."""
+    for key, value in attributes[name].items():
+        if key.startswith("source"):
+            if key in obj:
+                if not isinstance(obj[key], list):
+                    obj[key] = [obj[key]]
+                obj[key].append(value)
+            else:
+                obj[key] = value
+
+
 def index_types(es, types_name, types, opts):
     """Index types into Elasticsearch."""
+    # TODO: fetch existing types to allow new sources to add, not overwrite
+    try:
+        attributes = fetch_types(es, types_name, opts)
+    except Exception:
+        attributes = {}
     if "attributes" in types:
-        if "defaults" in types and "attributes" in types["defaults"]:
-            for key, value in types["attributes"].items():
+        new_attributes = {}
+        existing_attributes = {}
+        for key, value in types["attributes"].items():
+            if "defaults" in types and "attributes" in types["defaults"]:
                 value = {**types["defaults"]["attributes"], **value}
-                types["attributes"][key] = value
+            # types["attributes"][key] = value
+            if key in attributes:
+                existing_attributes[key] = value
+                add_attribute_sources(key, value, attributes)
+            else:
+                new_attributes[key] = value
         template, stream = index(
-            es, types_name, types["attributes"], opts, index_type="attribute"
+            es, types_name, new_attributes, opts, index_type="attribute"
+        )
+        template, update_stream = index(
+            es,
+            types_name,
+            existing_attributes,
+            opts,
+            index_type="attribute",
         )
         load_mapping(es, template["name"], template["mapping"])
         index_stream(es, template["index_name"], stream)
+        index_stream(es, template["index_name"], update_stream, _op_type="update")
     if "taxon_names" in types:
         if "defaults" in types and "taxon_names" in types["defaults"]:
             for key, value in types["names"].items():
diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index a068f9a1..6950ddd8 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -356,7 +356,9 @@ def add_attributes(
                     else:
                         attribute = {"identifier": validated, "class": key}
                     attribute.update(meta)
-                    if source is not None:
+                    if "source" in types[key]:
+                        attribute.update({"source": types[key]["source"]})
+                    elif source is not None:
                         attribute.update({"source": source})
                     attributes.append(attribute)
     if attribute_values:
@@ -470,6 +472,7 @@ def validate_types_file(types_file, dir_path):
         if key.startswith("display") or key.startswith("taxon"):
             defaults["attributes"].update({key: value})
         elif key.startswith("source"):
+            defaults["attributes"].update({key: value})
             defaults["metadata"].update({key: value})
     types.update({"defaults": defaults})
     data = tofile.open_file_handle(Path(dir_path) / types["file"]["name"])
@@ -550,9 +553,12 @@ def process_row(types, names, row):
     taxon_data = {}
     taxon_types = {}
     if "is_primary_value" in data["metadata"]:
-        data["metadata"]["is_primary_value"] = bool(
-            int(data["metadata"]["is_primary_value"])
-        )
+        try:
+            data["metadata"]["is_primary_value"] = bool(
+                int(data["metadata"]["is_primary_value"])
+            )
+        except ValueError:
+            data["metadata"]["is_primary_value"] = False
     for attr_type in list(["attributes", "identifiers"]):
         if attr_type in data and data[attr_type]:
             (
diff --git a/src/genomehubs/lib/ncbi.py b/src/genomehubs/lib/ncbi.py
index 404409ae..32b4c5ec 100644
--- a/src/genomehubs/lib/ncbi.py
+++ b/src/genomehubs/lib/ncbi.py
@@ -168,7 +168,7 @@ def parse_listing(listing, collection, opts):
     return parsed
 
 
-def refseq_organelle_parser(collections, opts, *args, **kwargs):
+def refseq_organelle_parser(collections, opts):
     """Fetch and parse RefSeq organelle collections."""
     parsed = []
     if isinstance(collections, tuple):
@@ -189,8 +189,8 @@ def refseq_organelle_parser(collections, opts, *args, **kwargs):
 def parse_ncbi_datasets_record(record, parsed):
     """Parse a single NCBI datasets record."""
     obj = {}
-    for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
-        obj[key] = record.get(key, None)
+    for key in ("taxId", "organismName", "commonName", "isolate", "sex"):
+        obj[key] = record.get(key, "None")
     assemblyInfo = record.get("assemblyInfo", {})
     for key in (
         "assemblyLevel",
@@ -204,9 +204,15 @@ def parse_ncbi_datasets_record(record, parsed):
         "submitter",
     ):
         obj[key] = assemblyInfo.get(key, None)
+        if key == "refseqCategory":
+            if obj[key] == "representative genome":
+                obj["primaryValue"] = 1
+            else:
+                obj["primaryValue"] = None
     if obj["refseqAssmAccession"] == "na":
         obj["refseqAssmAccession"] = None
         obj["refseqCategory"] = None
+        obj["primaryValue"] = None
     annotationInfo = record.get("annotationInfo", {})
     if annotationInfo:
         annot = {}
@@ -233,13 +239,77 @@ def parse_ncbi_datasets_record(record, parsed):
     parsed[obj["genbankAssmAccession"]] = obj
 
 
-def ncbi_genome_parser(directory, opts, *args, **kwargs):
+def ncbi_genome_parser(_params, opts, *, types=None, names=None):
     """Parse NCBI Datasets genome report."""
     parsed = {}
     with tofile.open_file_handle(
-        "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
+        "%s/ncbi_dataset/data/assembly_data_report.jsonl" % opts["ncbi-datasets-genome"]
     ) as report:
         for line in report:
             record = ujson.loads(line)
             parse_ncbi_datasets_record(record, parsed)
     return [value for value in parsed.values()]
+
+
+# def parse_ncbi_datasets_summary(record, parsed):
+#     """Parse a single NCBI datasets summary."""
+#     obj = {}
+#     return
+#     for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
+#         obj[key] = record.get(key, None)
+#     assemblyInfo = record.get("assemblyInfo", {})
+#     for key in ("assembly_category", "assembly_level"):
+#         obj[key] = assemblyInfo.get(key, None)
+#     # "assembly_accession": "GCF_900239965.1",
+#     if obj["refseqAssmAccession"] == "na":
+#         obj["refseqAssmAccession"] = None
+#         obj["refseqCategory"] = None
+#     annotationInfo = record.get("annotationInfo", {})
+#     if annotationInfo:
+#         annot = {}
+#         for key in ("name", "releaseDate", "reportUrl", "source"):
+#             annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
+#         if annot and "stats" in annotationInfo:
+#             geneCounts = annotationInfo["stats"].get("geneCounts", None)
+#             for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
+#                 annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
+#         if obj["genbankAssmAccession"] in parsed:
+#             parsed[obj["genbankAssmAccession"]].update(annot)
+#             return
+#         obj.update(annot)
+#     bioprojects = []
+#     for lineage in assemblyInfo.get("bioprojectLineage", []):
+#         for bioproject in lineage["bioprojects"]:
+#             bioprojects.append(bioproject["accession"])
+#     obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
+#     assemblyStats = record.get("assemblyStats", {})
+#     obj.update(assemblyStats)
+#     wgsInfo = record.get("wgsInfo", {})
+#     for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
+#         obj[key] = wgsInfo.get(key, None)
+#     parsed[obj["genbankAssmAccession"]] = obj
+
+
+# def ncbi_datasets_summary_parser(_params, opts):
+#     """Fetch and parse NCBI Datasets summary."""
+#     parsed = {}
+#     datasets = check_output(
+#         ["datasets", "summary", "genome", "taxon", opts["ncbi-datasets-summary"]]
+#     )
+#     data = ujson.loads(datasets)
+#     if "assemblies" not in data:
+#         LOGGER.error("unable to fetch assemblies for %s", opts["ncbi-datasets-summary"])
+#         print(data)
+#         sys.exit(1)
+#     for record in data["assemblies"]:
+#         parse_ncbi_datasets_summary(record, parsed)
+#     print(parsed)
+#     quit()
+#     # parsed = {}
+#     # with tofile.open_file_handle(
+#     #     "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
+#     # ) as report:
+#     #     for line in report:
+#     #         record = ujson.loads(line)
+#     #         parse_ncbi_datasets_record(record, parsed)
+#     return [value for value in parsed.values()]
diff --git a/src/genomehubs/lib/parse.py b/src/genomehubs/lib/parse.py
index 758c2495..dc0b34a8 100644
--- a/src/genomehubs/lib/parse.py
+++ b/src/genomehubs/lib/parse.py
@@ -46,6 +46,8 @@
 from .config import config
 from .hub import load_types
 from .hub import order_parsed_fields
+
+# from .ncbi import ncbi_datasets_summary_parser
 from .ncbi import ncbi_genome_parser
 from .ncbi import refseq_organelle_parser
 from .version import __version__
@@ -60,6 +62,11 @@
         "params": None,
         "types": "assembly",
     },
+    # "ncbi-datasets-summary": {
+    #     "func": ncbi_datasets_summary_parser,
+    #     "params": None,
+    #     "types": "assembly",
+    # },
     "refseq-mitochondria": {
         "func": refseq_organelle_parser,
         "params": ("mitochondrion"),
diff --git a/src/genomehubs/templates/assembly.types.yaml b/src/genomehubs/templates/assembly.types.yaml
index cb716fb8..22228938 100644
--- a/src/genomehubs/templates/assembly.types.yaml
+++ b/src/genomehubs/templates/assembly.types.yaml
@@ -8,8 +8,8 @@ file:
 taxonomy:
   taxon_id:
     header: taxId
-  species:
-    header: speciesName
+  taxon:
+    header: organismName
 names:
   common_name:
     header: commonName
@@ -63,7 +63,11 @@ attributes:
     taxon_display_group: assembly
     taxon_name: gene_count
     taxon_key: gene_count
-    taxon_summary: median
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
     taxon_display_level: 2
     taxon_bins:
       min: 0
@@ -103,7 +107,9 @@ attributes:
     taxon_name: sample_sex
     taxon_key: sample_sex
     taxon_display_name: Sample sex
-    taxon_summary: list
+    taxon_summary:
+      - primary
+      - list
     taxon_display_level: 2
   isolate:
     display_level: 2
@@ -115,6 +121,14 @@ attributes:
     header: assemblyLevel
     display_name: Assembly level
     type: keyword
+    taxon_display_group: assembly
+    taxon_name: assembly_level
+    taxon_key: assembly_level
+    taxon_display_name: Assembly level
+    taxon_summary:
+      - primary
+      - list
+    taxon_display_level: 2
   assembly_type:
     display_level: 2
     header: assemblyType
@@ -133,10 +147,13 @@ attributes:
     taxon_name: assembly_span
     taxon_key: assembly_span
     taxon_display_name: Assembly span
-    taxon_summary: median
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
     taxon_traverse: median
     taxon_traverse_direction: both
-    taxon_traverse_limit: superkingdom
     taxon_display_level: 1
     taxon_bins:
       min: 6
@@ -161,6 +178,22 @@ attributes:
     header: contigN50
     type: long
     units: bases
+    taxon_display_group: assembly
+    taxon_name: contig_n50
+    taxon_key: contig_n50
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
+    taxon_traverse: median
+    taxon_traverse_direction: both
+    taxon_display_level: 2
+    taxon_bins:
+      min: 4
+      max: 9
+      count: 10
+      scale: log10
   contig_l50:
     display_group: metrics
     display_level: 2
@@ -179,6 +212,22 @@ attributes:
     header: scaffoldN50
     type: long
     units: bases
+    taxon_display_group: assembly
+    taxon_name: scaffold_n50
+    taxon_key: scaffold_n50
+    taxon_summary:
+      - primary
+      - median
+      - min
+      - max
+    taxon_traverse: median
+    taxon_traverse_direction: both
+    taxon_display_level: 2
+    taxon_bins:
+      min: 4
+      max: 9
+      count: 10
+      scale: log10
   scaffold_l50:
     display_group: metrics
     display_level: 2
@@ -242,3 +291,5 @@ attributes:
 metadata:
   source_slug:
     header: genbankAssmAccession
+  is_primary_value:
+    header: primaryValue