Skip to content

Commit

Permalink
Merge pull request #71 from genomehubs:rjchallis/issue44
Browse files Browse the repository at this point in the history
rjchallis/issue44
  • Loading branch information
rjchallis committed Apr 9, 2021
2 parents f8897a0 + 4f36507 commit 7eed9ff
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 19 deletions.
53 changes: 49 additions & 4 deletions src/genomehubs/lib/attributes.py
Expand Up @@ -37,18 +37,63 @@ def index(es, group, attributes, opts, *, index_type="attribute"):
return template, stream


# def fetch_types(es, opts):
# """Fetch all existing types."""
# template = index_template(opts, index_type="attribute")
# body = {
# "id": "attribute_types",
# "params": {},
# }
# entries = stream_template_search_results(
# es, index=template["index_name"], body=body
# )
# return {entry["key"]: entry for entry in entries}


def add_attribute_sources(name, obj, attributes):
    """Merge the ``source*`` fields of an existing attribute into ``obj``.

    Every key of ``attributes[name]`` that starts with ``"source"`` is copied
    onto ``obj``.  When ``obj`` already has that key, the result is a flat
    list holding the value(s) already on ``obj`` followed by any value(s)
    from ``attributes[name]`` that are not yet present.

    Args:
        name: Key of the entry to read from ``attributes``.
        obj: Attribute definition to update; the dict is modified in place.
        attributes: Mapping of attribute name to a dict of its fields.

    Raises:
        KeyError: If ``name`` is not a key of ``attributes``.
    """
    for key, value in attributes[name].items():
        if not key.startswith("source"):
            continue
        if key not in obj:
            obj[key] = value
            continue
        current = obj[key]
        # Build a new list instead of appending in place: ``obj`` may share
        # its list with other dicts (e.g. after a shallow ``{**a, **b}``
        # merge), and mutating it would leak sources into those dicts.
        merged = list(current) if isinstance(current, list) else [current]
        # The incoming value may itself be a list produced by an earlier
        # merge; extend rather than append so sources never end up nested.
        incoming = value if isinstance(value, list) else [value]
        for source in incoming:
            # Skip sources that are already recorded so repeated indexing of
            # the same file does not accumulate duplicates.
            if source not in merged:
                merged.append(source)
        obj[key] = merged


def index_types(es, types_name, types, opts):
"""Index types into Elasticsearch."""
# TODO: fetch existing types to allow new sources to add, not overwrite
try:
attributes = fetch_types(es, types_name, opts)
except Exception:
attributes = {}
if "attributes" in types:
if "defaults" in types and "attributes" in types["defaults"]:
for key, value in types["attributes"].items():
new_attributes = {}
existing_attributes = {}
for key, value in types["attributes"].items():
if "defaults" in types and "attributes" in types["defaults"]:
value = {**types["defaults"]["attributes"], **value}
types["attributes"][key] = value
# types["attributes"][key] = value
if key in attributes:
existing_attributes[key] = value
add_attribute_sources(key, value, attributes)
else:
new_attributes[key] = value
template, stream = index(
es, types_name, types["attributes"], opts, index_type="attribute"
es, types_name, new_attributes, opts, index_type="attribute"
)
template, update_stream = index(
es,
types_name,
existing_attributes,
opts,
index_type="attribute",
)
load_mapping(es, template["name"], template["mapping"])
index_stream(es, template["index_name"], stream)
index_stream(es, template["index_name"], update_stream, _op_type="update")
if "taxon_names" in types:
if "defaults" in types and "taxon_names" in types["defaults"]:
for key, value in types["names"].items():
Expand Down
14 changes: 10 additions & 4 deletions src/genomehubs/lib/hub.py
Expand Up @@ -356,7 +356,9 @@ def add_attributes(
else:
attribute = {"identifier": validated, "class": key}
attribute.update(meta)
if source is not None:
if "source" in types[key]:
attribute.update({"source": types[key]["source"]})
elif source is not None:
attribute.update({"source": source})
attributes.append(attribute)
if attribute_values:
Expand Down Expand Up @@ -470,6 +472,7 @@ def validate_types_file(types_file, dir_path):
if key.startswith("display") or key.startswith("taxon"):
defaults["attributes"].update({key: value})
elif key.startswith("source"):
defaults["attributes"].update({key: value})
defaults["metadata"].update({key: value})
types.update({"defaults": defaults})
data = tofile.open_file_handle(Path(dir_path) / types["file"]["name"])
Expand Down Expand Up @@ -550,9 +553,12 @@ def process_row(types, names, row):
taxon_data = {}
taxon_types = {}
if "is_primary_value" in data["metadata"]:
data["metadata"]["is_primary_value"] = bool(
int(data["metadata"]["is_primary_value"])
)
try:
data["metadata"]["is_primary_value"] = bool(
int(data["metadata"]["is_primary_value"])
)
except ValueError:
data["metadata"]["is_primary_value"] = False
for attr_type in list(["attributes", "identifiers"]):
if attr_type in data and data[attr_type]:
(
Expand Down
80 changes: 75 additions & 5 deletions src/genomehubs/lib/ncbi.py
Expand Up @@ -168,7 +168,7 @@ def parse_listing(listing, collection, opts):
return parsed


def refseq_organelle_parser(collections, opts, *args, **kwargs):
def refseq_organelle_parser(collections, opts):
"""Fetch and parse RefSeq organelle collections."""
parsed = []
if isinstance(collections, tuple):
Expand All @@ -189,8 +189,8 @@ def refseq_organelle_parser(collections, opts, *args, **kwargs):
def parse_ncbi_datasets_record(record, parsed):
"""Parse a single NCBI datasets record."""
obj = {}
for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
obj[key] = record.get(key, None)
for key in ("taxId", "organismName", "commonName", "isolate", "sex"):
obj[key] = record.get(key, "None")
assemblyInfo = record.get("assemblyInfo", {})
for key in (
"assemblyLevel",
Expand All @@ -204,9 +204,15 @@ def parse_ncbi_datasets_record(record, parsed):
"submitter",
):
obj[key] = assemblyInfo.get(key, None)
if key == "refseqCategory":
if obj[key] == "representative genome":
obj["primaryValue"] = 1
else:
obj["primaryValue"] = None
if obj["refseqAssmAccession"] == "na":
obj["refseqAssmAccession"] = None
obj["refseqCategory"] = None
obj["primaryValue"] = None
annotationInfo = record.get("annotationInfo", {})
if annotationInfo:
annot = {}
Expand All @@ -233,13 +239,77 @@ def parse_ncbi_datasets_record(record, parsed):
parsed[obj["genbankAssmAccession"]] = obj


def ncbi_genome_parser(_params, opts, *, types=None, names=None):
    """Parse NCBI Datasets genome report.

    Reads ``ncbi_dataset/data/assembly_data_report.jsonl`` from the directory
    given by ``opts["ncbi-datasets-genome"]``, decodes each line as JSON and
    hands it to ``parse_ncbi_datasets_record``, which collects the parsed
    objects in a dict.

    Args:
        _params: Unused; accepted so all parser functions share a signature.
        opts: Options mapping; must contain ``"ncbi-datasets-genome"``.
        types: Unused by this parser.
        names: Unused by this parser.

    Returns:
        A list of the parsed record objects.
    """
    parsed = {}
    with tofile.open_file_handle(
        "%s/ncbi_dataset/data/assembly_data_report.jsonl" % opts["ncbi-datasets-genome"]
    ) as report:
        for line in report:
            # Blank lines (e.g. a trailing newline) are not valid JSON and
            # would make ujson.loads raise; skip them.
            if not line.strip():
                continue
            record = ujson.loads(line)
            parse_ncbi_datasets_record(record, parsed)
    return list(parsed.values())


# def parse_ncbi_datasets_summary(record, parsed):
# """Parse a single NCBI datasets summary."""
# obj = {}
# return
# for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
# obj[key] = record.get(key, None)
# assemblyInfo = record.get("assemblyInfo", {})
# for key in ("assembly_category", "assembly_level"):
# obj[key] = assemblyInfo.get(key, None)
# # "assembly_accession": "GCF_900239965.1",
# if obj["refseqAssmAccession"] == "na":
# obj["refseqAssmAccession"] = None
# obj["refseqCategory"] = None
# annotationInfo = record.get("annotationInfo", {})
# if annotationInfo:
# annot = {}
# for key in ("name", "releaseDate", "reportUrl", "source"):
# annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
# if annot and "stats" in annotationInfo:
# geneCounts = annotationInfo["stats"].get("geneCounts", None)
# for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
# annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
# if obj["genbankAssmAccession"] in parsed:
# parsed[obj["genbankAssmAccession"]].update(annot)
# return
# obj.update(annot)
# bioprojects = []
# for lineage in assemblyInfo.get("bioprojectLineage", []):
# for bioproject in lineage["bioprojects"]:
# bioprojects.append(bioproject["accession"])
# obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
# assemblyStats = record.get("assemblyStats", {})
# obj.update(assemblyStats)
# wgsInfo = record.get("wgsInfo", {})
# for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
# obj[key] = wgsInfo.get(key, None)
# parsed[obj["genbankAssmAccession"]] = obj


# def ncbi_datasets_summary_parser(_params, opts):
# """Fetch and parse NCBI Datasets summary."""
# parsed = {}
# datasets = check_output(
# ["datasets", "summary", "genome", "taxon", opts["ncbi-datasets-summary"]]
# )
# data = ujson.loads(datasets)
# if "assemblies" not in data:
# LOGGER.error("unable to fetch assemblies for %s", opts["ncbi-datasets-summary"])
# print(data)
# sys.exit(1)
# for record in data["assemblies"]:
# parse_ncbi_datasets_summary(record, parsed)
# print(parsed)
# quit()
# # parsed = {}
# # with tofile.open_file_handle(
# # "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
# # ) as report:
# # for line in report:
# # record = ujson.loads(line)
# # parse_ncbi_datasets_record(record, parsed)
# return [value for value in parsed.values()]
7 changes: 7 additions & 0 deletions src/genomehubs/lib/parse.py
Expand Up @@ -46,6 +46,8 @@
from .config import config
from .hub import load_types
from .hub import order_parsed_fields

# from .ncbi import ncbi_datasets_summary_parser
from .ncbi import ncbi_genome_parser
from .ncbi import refseq_organelle_parser
from .version import __version__
Expand All @@ -60,6 +62,11 @@
"params": None,
"types": "assembly",
},
# "ncbi-datasets-summary": {
# "func": ncbi_datasets_summary_parser,
# "params": None,
# "types": "assembly",
# },
"refseq-mitochondria": {
"func": refseq_organelle_parser,
"params": ("mitochondrion"),
Expand Down
63 changes: 57 additions & 6 deletions src/genomehubs/templates/assembly.types.yaml
Expand Up @@ -8,8 +8,8 @@ file:
taxonomy:
taxon_id:
header: taxId
species:
header: speciesName
taxon:
header: organismName
names:
common_name:
header: commonName
Expand Down Expand Up @@ -63,7 +63,11 @@ attributes:
taxon_display_group: assembly
taxon_name: gene_count
taxon_key: gene_count
taxon_summary: median
taxon_summary:
- primary
- median
- min
- max
taxon_display_level: 2
taxon_bins:
min: 0
Expand Down Expand Up @@ -103,7 +107,9 @@ attributes:
taxon_name: sample_sex
taxon_key: sample_sex
taxon_display_name: Sample sex
taxon_summary: list
taxon_summary:
- primary
- list
taxon_display_level: 2
isolate:
display_level: 2
Expand All @@ -115,6 +121,14 @@ attributes:
header: assemblyLevel
display_name: Assembly level
type: keyword
taxon_display_group: assembly
taxon_name: assembly_level
taxon_key: assembly_level
taxon_display_name: Assembly level
taxon_summary:
- primary
- list
taxon_display_level: 2
assembly_type:
display_level: 2
header: assemblyType
Expand All @@ -133,10 +147,13 @@ attributes:
taxon_name: assembly_span
taxon_key: assembly_span
taxon_display_name: Assembly span
taxon_summary: median
taxon_summary:
- primary
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_traverse_limit: superkingdom
taxon_display_level: 1
taxon_bins:
min: 6
Expand All @@ -161,6 +178,22 @@ attributes:
header: contigN50
type: long
units: bases
taxon_display_group: assembly
taxon_name: contig_n50
taxon_key: contig_n50
taxon_summary:
- primary
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_display_level: 2
taxon_bins:
min: 4
max: 9
count: 10
scale: log10
contig_l50:
display_group: metrics
display_level: 2
Expand All @@ -179,6 +212,22 @@ attributes:
header: scaffoldN50
type: long
units: bases
taxon_display_group: assembly
taxon_name: scaffold_n50
taxon_key: scaffold_n50
taxon_summary:
- primary
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_display_level: 2
taxon_bins:
min: 4
max: 9
count: 10
scale: log10
scaffold_l50:
display_group: metrics
display_level: 2
Expand Down Expand Up @@ -242,3 +291,5 @@ attributes:
metadata:
source_slug:
header: genbankAssmAccession
is_primary_value:
header: primaryValue

0 comments on commit 7eed9ff

Please sign in to comment.