rjchallis/issue44 #71
Merged · 6 commits · Apr 9, 2021
53 changes: 49 additions & 4 deletions src/genomehubs/lib/attributes.py
@@ -37,18 +37,63 @@ def index(es, group, attributes, opts, *, index_type="attribute"):
return template, stream


# def fetch_types(es, opts):
# """Fetch all existing types."""
# template = index_template(opts, index_type="attribute")
# body = {
# "id": "attribute_types",
# "params": {},
# }
# entries = stream_template_search_results(
# es, index=template["index_name"], body=body
# )
# return {entry["key"]: entry for entry in entries}


def add_attribute_sources(name, obj, attributes):
"""Generate a list of attribute sources."""
for key, value in attributes[name].items():
if key.startswith("source"):
if key in obj:
if not isinstance(obj[key], list):
obj[key] = [obj[key]]
obj[key].append(value)
else:
obj[key] = value


def index_types(es, types_name, types, opts):
"""Index types into Elasticsearch."""
# TODO: fetch existing types to allow new sources to add, not overwrite
try:
attributes = fetch_types(es, types_name, opts)
except Exception:
attributes = {}
if "attributes" in types:
if "defaults" in types and "attributes" in types["defaults"]:
for key, value in types["attributes"].items():
new_attributes = {}
existing_attributes = {}
for key, value in types["attributes"].items():
if "defaults" in types and "attributes" in types["defaults"]:
value = {**types["defaults"]["attributes"], **value}
types["attributes"][key] = value
# types["attributes"][key] = value
if key in attributes:
existing_attributes[key] = value
add_attribute_sources(key, value, attributes)
else:
new_attributes[key] = value
template, stream = index(
es, types_name, types["attributes"], opts, index_type="attribute"
es, types_name, new_attributes, opts, index_type="attribute"
)
template, update_stream = index(
es,
types_name,
existing_attributes,
opts,
index_type="attribute",
)
load_mapping(es, template["name"], template["mapping"])
index_stream(es, template["index_name"], stream)
index_stream(es, template["index_name"], update_stream, _op_type="update")
if "taxon_names" in types:
if "defaults" in types and "taxon_names" in types["defaults"]:
for key, value in types["names"].items():
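
For reference, a minimal sketch of what the new add_attribute_sources helper does when an attribute already exists in the index; the attribute name and source values below are made up for illustration, not taken from a real types file or index.

from genomehubs.lib.attributes import add_attribute_sources

# Previously indexed attribute metadata (as returned by fetch_types) and an
# incoming definition for the same attribute key; values are hypothetical.
existing = {
    "assembly_span": {"source": "previous import", "source_slug": "prev-slug"}
}
incoming = {"type": "long", "source": "NCBI datasets"}

add_attribute_sources("assembly_span", incoming, existing)

# incoming["source"] is now ["NCBI datasets", "previous import"] and
# incoming["source_slug"] is "prev-slug": earlier source metadata is kept
# and merged rather than overwritten when the attribute is re-indexed.
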
14 changes: 10 additions & 4 deletions src/genomehubs/lib/hub.py
@@ -356,7 +356,9 @@ def add_attributes(
else:
attribute = {"identifier": validated, "class": key}
attribute.update(meta)
if source is not None:
if "source" in types[key]:
attribute.update({"source": types[key]["source"]})
elif source is not None:
attribute.update({"source": source})
attributes.append(attribute)
if attribute_values:
@@ -470,6 +472,7 @@ def validate_types_file(types_file, dir_path):
if key.startswith("display") or key.startswith("taxon"):
defaults["attributes"].update({key: value})
elif key.startswith("source"):
defaults["attributes"].update({key: value})
defaults["metadata"].update({key: value})
types.update({"defaults": defaults})
data = tofile.open_file_handle(Path(dir_path) / types["file"]["name"])
@@ -550,9 +553,12 @@ def process_row(types, names, row):
taxon_data = {}
taxon_types = {}
if "is_primary_value" in data["metadata"]:
data["metadata"]["is_primary_value"] = bool(
int(data["metadata"]["is_primary_value"])
)
try:
data["metadata"]["is_primary_value"] = bool(
int(data["metadata"]["is_primary_value"])
)
except ValueError:
data["metadata"]["is_primary_value"] = False
for attr_type in list(["attributes", "identifiers"]):
if attr_type in data and data[attr_type]:
(
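
A standalone sketch of the new is_primary_value coercion in process_row: numeric strings are converted via bool(int(...)), and anything non-numeric now falls back to False instead of raising ValueError. The helper name and example inputs are illustrative only.

def coerce_primary_value(raw):
    # Mirrors the try/except added in process_row; not a real genomehubs API.
    try:
        return bool(int(raw))
    except ValueError:
        return False

print(coerce_primary_value("1"))   # True
print(coerce_primary_value("0"))   # False
print(coerce_primary_value("na"))  # False (previously this raised ValueError)
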
80 changes: 75 additions & 5 deletions src/genomehubs/lib/ncbi.py
@@ -168,7 +168,7 @@ def parse_listing(listing, collection, opts):
return parsed


def refseq_organelle_parser(collections, opts, *args, **kwargs):
def refseq_organelle_parser(collections, opts):
"""Fetch and parse RefSeq organelle collections."""
parsed = []
if isinstance(collections, tuple):
@@ -189,8 +189,8 @@ def refseq_organelle_parser(collections, opts, *args, **kwargs):
def parse_ncbi_datasets_record(record, parsed):
"""Parse a single NCBI datasets record."""
obj = {}
for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
obj[key] = record.get(key, None)
for key in ("taxId", "organismName", "commonName", "isolate", "sex"):
obj[key] = record.get(key, "None")
assemblyInfo = record.get("assemblyInfo", {})
for key in (
"assemblyLevel",
@@ -204,9 +204,15 @@ def parse_ncbi_datasets_record(record, parsed):
"submitter",
):
obj[key] = assemblyInfo.get(key, None)
if key == "refseqCategory":
if obj[key] == "representative genome":
obj["primaryValue"] = 1
else:
obj["primaryValue"] = None
if obj["refseqAssmAccession"] == "na":
obj["refseqAssmAccession"] = None
obj["refseqCategory"] = None
obj["primaryValue"] = None
annotationInfo = record.get("annotationInfo", {})
if annotationInfo:
annot = {}
@@ -233,13 +239,77 @@ def parse_ncbi_datasets_record(record, parsed):
parsed[obj["genbankAssmAccession"]] = obj


def ncbi_genome_parser(directory, opts, *args, **kwargs):
def ncbi_genome_parser(_params, opts, *, types=None, names=None):
"""Parse NCBI Datasets genome report."""
parsed = {}
with tofile.open_file_handle(
"%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
"%s/ncbi_dataset/data/assembly_data_report.jsonl" % opts["ncbi-datasets-genome"]
) as report:
for line in report:
record = ujson.loads(line)
parse_ncbi_datasets_record(record, parsed)
return [value for value in parsed.values()]


# def parse_ncbi_datasets_summary(record, parsed):
# """Parse a single NCBI datasets summary."""
# obj = {}
# return
# for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
# obj[key] = record.get(key, None)
# assemblyInfo = record.get("assemblyInfo", {})
# for key in ("assembly_category", "assembly_level"):
# obj[key] = assemblyInfo.get(key, None)
# # "assembly_accession": "GCF_900239965.1",
# if obj["refseqAssmAccession"] == "na":
# obj["refseqAssmAccession"] = None
# obj["refseqCategory"] = None
# annotationInfo = record.get("annotationInfo", {})
# if annotationInfo:
# annot = {}
# for key in ("name", "releaseDate", "reportUrl", "source"):
# annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
# if annot and "stats" in annotationInfo:
# geneCounts = annotationInfo["stats"].get("geneCounts", None)
# for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
# annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
# if obj["genbankAssmAccession"] in parsed:
# parsed[obj["genbankAssmAccession"]].update(annot)
# return
# obj.update(annot)
# bioprojects = []
# for lineage in assemblyInfo.get("bioprojectLineage", []):
# for bioproject in lineage["bioprojects"]:
# bioprojects.append(bioproject["accession"])
# obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
# assemblyStats = record.get("assemblyStats", {})
# obj.update(assemblyStats)
# wgsInfo = record.get("wgsInfo", {})
# for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
# obj[key] = wgsInfo.get(key, None)
# parsed[obj["genbankAssmAccession"]] = obj


# def ncbi_datasets_summary_parser(_params, opts):
# """Fetch and parse NCBI Datasets summary."""
# parsed = {}
# datasets = check_output(
# ["datasets", "summary", "genome", "taxon", opts["ncbi-datasets-summary"]]
# )
# data = ujson.loads(datasets)
# if "assemblies" not in data:
# LOGGER.error("unable to fetch assemblies for %s", opts["ncbi-datasets-summary"])
# print(data)
# sys.exit(1)
# for record in data["assemblies"]:
# parse_ncbi_datasets_summary(record, parsed)
# print(parsed)
# quit()
# # parsed = {}
# # with tofile.open_file_handle(
# # "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
# # ) as report:
# # for line in report:
# # record = ujson.loads(line)
# # parse_ncbi_datasets_record(record, parsed)
# return [value for value in parsed.values()]
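
Condensed sketch of the primaryValue logic added to parse_ncbi_datasets_record: only RefSeq "representative genome" assemblies are flagged as primary, and records whose RefSeq accession is "na" have their RefSeq fields cleared. The helper name is hypothetical; the field names match the NCBI datasets report, and the record literals are illustrative.

def primary_value(assembly_info):
    # No RefSeq record at all: clear category and primary flag.
    if assembly_info.get("refseqAssmAccession") == "na":
        return None
    if assembly_info.get("refseqCategory") == "representative genome":
        return 1
    return None

print(primary_value({"refseqAssmAccession": "GCF_900239965.1",
                     "refseqCategory": "representative genome"}))  # 1
print(primary_value({"refseqAssmAccession": "na"}))                # None
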
7 changes: 7 additions & 0 deletions src/genomehubs/lib/parse.py
@@ -46,6 +46,8 @@
from .config import config
from .hub import load_types
from .hub import order_parsed_fields

# from .ncbi import ncbi_datasets_summary_parser
from .ncbi import ncbi_genome_parser
from .ncbi import refseq_organelle_parser
from .version import __version__
@@ -60,6 +62,11 @@
"params": None,
"types": "assembly",
},
# "ncbi-datasets-summary": {
# "func": ncbi_datasets_summary_parser,
# "params": None,
# "types": "assembly",
# },
"refseq-mitochondria": {
"func": refseq_organelle_parser,
"params": ("mitochondrion"),
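
For reference, the updated ncbi_genome_parser can be called directly with its new keyword-only signature; the download directory below is hypothetical and must contain an NCBI datasets dump.

from genomehubs.lib.ncbi import ncbi_genome_parser

opts = {"ncbi-datasets-genome": "/path/to/ncbi/download"}  # hypothetical path
rows = ncbi_genome_parser(None, opts, types=None, names=None)
# rows is a list of per-assembly dicts parsed from
# <dir>/ncbi_dataset/data/assembly_data_report.jsonl
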
63 changes: 57 additions & 6 deletions src/genomehubs/templates/assembly.types.yaml
@@ -8,8 +8,8 @@ file:
taxonomy:
taxon_id:
header: taxId
species:
header: speciesName
taxon:
header: organismName
names:
common_name:
header: commonName
@@ -63,7 +63,11 @@ attributes:
taxon_display_group: assembly
taxon_name: gene_count
taxon_key: gene_count
taxon_summary: median
taxon_summary:
- primary
- median
- min
- max
taxon_display_level: 2
taxon_bins:
min: 0
@@ -103,7 +107,9 @@ attributes:
taxon_name: sample_sex
taxon_key: sample_sex
taxon_display_name: Sample sex
taxon_summary: list
taxon_summary:
- primary
- list
taxon_display_level: 2
isolate:
display_level: 2
@@ -115,6 +121,14 @@ attributes:
header: assemblyLevel
display_name: Assembly level
type: keyword
taxon_display_group: assembly
taxon_name: assembly_level
taxon_key: assembly_level
taxon_display_name: Assembly level
taxon_summary:
- primary
- list
taxon_display_level: 2
assembly_type:
display_level: 2
header: assemblyType
@@ -133,10 +147,13 @@ attributes:
taxon_name: assembly_span
taxon_key: assembly_span
taxon_display_name: Assembly span
taxon_summary: median
taxon_summary:
- primary
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_traverse_limit: superkingdom
taxon_display_level: 1
taxon_bins:
min: 6
@@ -161,6 +178,22 @@ attributes:
header: contigN50
type: long
units: bases
taxon_display_group: assembly
taxon_name: contig_n50
taxon_key: contig_n50
taxon_summary:
- primary
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_display_level: 2
taxon_bins:
min: 4
max: 9
count: 10
scale: log10
contig_l50:
display_group: metrics
display_level: 2
@@ -179,6 +212,22 @@ attributes:
header: scaffoldN50
type: long
units: bases
taxon_display_group: assembly
taxon_name: scaffold_n50
taxon_key: scaffold_n50
taxon_summary:
- primary
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_display_level: 2
taxon_bins:
min: 4
max: 9
count: 10
scale: log10
scaffold_l50:
display_group: metrics
display_level: 2
@@ -242,3 +291,5 @@ attributes:
metadata:
source_slug:
header: genbankAssmAccession
is_primary_value:
header: primaryValue
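
To tie the template changes back to the parser output, an illustrative parsed record showing which fields the renamed and added headers consume; the values are made up for the example.

record = {
    "taxId": "9606",
    "organismName": "Homo sapiens",  # header was speciesName before this change
    "commonName": "human",
    "refseqCategory": "representative genome",
    "primaryValue": 1,  # consumed by the new metadata.is_primary_value header
}
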