Skip to content

Commit

Permalink
update default ncbi datasets config
Browse files Browse the repository at this point in the history
  • Loading branch information
rjchallis committed Apr 8, 2021
1 parent 943cc27 commit b03e3ca
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 75 deletions.
124 changes: 61 additions & 63 deletions src/genomehubs/lib/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@

import gzip
import re
import sys
from collections import Counter
from subprocess import check_output

import ujson
from Bio import SeqIO
Expand Down Expand Up @@ -191,7 +189,7 @@ def refseq_organelle_parser(collections, opts):
def parse_ncbi_datasets_record(record, parsed):
"""Parse a single NCBI datasets record."""
obj = {}
for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
for key in ("taxId", "organismName", "commonName", "isolate", "sex"):
obj[key] = record.get(key, None)
assemblyInfo = record.get("assemblyInfo", {})
for key in (
Expand Down Expand Up @@ -247,65 +245,65 @@ def ncbi_genome_parser(directory, opts):
return [value for value in parsed.values()]


def parse_ncbi_datasets_summary(record, parsed):
    """Parse a single NCBI datasets summary.

    Args:
        record: Mapping for one assembly; read only via ``record.get``.
        parsed: Dict that the code below the early ``return`` would update,
            keyed by ``obj["genbankAssmAccession"]``.

    Returns:
        None. As written, the function returns before reading ``record`` or
        modifying ``parsed``.
    """
    # NOTE(review): the bare `return` directly after `obj = {}` exits the
    # function unconditionally, so every statement below it is unreachable.
    # NOTE(review): indentation below was restored during review; the nesting
    # of the annotation branch is inferred -- confirm against the repository.
    obj = {}
    return
    # Copy organism-level fields from the top level of the record (None when absent).
    for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
        obj[key] = record.get(key, None)
    assemblyInfo = record.get("assemblyInfo", {})
    for key in ("assembly_category", "assembly_level"):
        obj[key] = assemblyInfo.get(key, None)
    # "assembly_accession": "GCF_900239965.1",
    # NOTE(review): "refseqAssmAccession" is never assigned to `obj` above, so
    # this lookup would raise KeyError if the code were reachable.
    if obj["refseqAssmAccession"] == "na":
        obj["refseqAssmAccession"] = None
        obj["refseqCategory"] = None
    annotationInfo = record.get("annotationInfo", {})
    if annotationInfo:
        annot = {}
        # Builds keys annotationName, annotationReleasedate, annotationReporturl,
        # annotationSource (str.capitalize lowercases everything after the first letter).
        for key in ("name", "releaseDate", "reportUrl", "source"):
            annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
        if annot and "stats" in annotationInfo:
            # NOTE(review): geneCounts defaults to None, so the .get calls in
            # the loop would raise AttributeError when "geneCounts" is missing.
            geneCounts = annotationInfo["stats"].get("geneCounts", None)
            for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
                annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
            # NOTE(review): "genbankAssmAccession" has not been assigned to
            # `obj` at this point either -- KeyError if reached.
            if obj["genbankAssmAccession"] in parsed:
                # Accession already recorded: merge annotation fields into the
                # existing entry and stop.
                parsed[obj["genbankAssmAccession"]].update(annot)
                return
            obj.update(annot)
    # Join every bioproject accession found under bioprojectLineage with ";".
    bioprojects = []
    for lineage in assemblyInfo.get("bioprojectLineage", []):
        for bioproject in lineage["bioprojects"]:
            bioprojects.append(bioproject["accession"])
    obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
    # Merge all assemblyStats entries into obj unchanged.
    assemblyStats = record.get("assemblyStats", {})
    obj.update(assemblyStats)
    wgsInfo = record.get("wgsInfo", {})
    for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
        obj[key] = wgsInfo.get(key, None)
    # NOTE(review): only succeeds if assemblyStats supplied a
    # "genbankAssmAccession" key -- presumably not intended; verify.
    parsed[obj["genbankAssmAccession"]] = obj
# def parse_ncbi_datasets_summary(record, parsed):
# """Parse a single NCBI datasets summary."""
# obj = {}
# return
# for key in ("taxId", "speciesName", "commonName", "isolate", "sex"):
# obj[key] = record.get(key, None)
# assemblyInfo = record.get("assemblyInfo", {})
# for key in ("assembly_category", "assembly_level"):
# obj[key] = assemblyInfo.get(key, None)
# # "assembly_accession": "GCF_900239965.1",
# if obj["refseqAssmAccession"] == "na":
# obj["refseqAssmAccession"] = None
# obj["refseqCategory"] = None
# annotationInfo = record.get("annotationInfo", {})
# if annotationInfo:
# annot = {}
# for key in ("name", "releaseDate", "reportUrl", "source"):
# annot["annotation%s" % key.capitalize()] = annotationInfo.get(key, None)
# if annot and "stats" in annotationInfo:
# geneCounts = annotationInfo["stats"].get("geneCounts", None)
# for key in ("nonCoding", "proteinCoding", "pseudogene", "total"):
# annot["geneCount%s" % key.capitalize()] = geneCounts.get(key, None)
# if obj["genbankAssmAccession"] in parsed:
# parsed[obj["genbankAssmAccession"]].update(annot)
# return
# obj.update(annot)
# bioprojects = []
# for lineage in assemblyInfo.get("bioprojectLineage", []):
# for bioproject in lineage["bioprojects"]:
# bioprojects.append(bioproject["accession"])
# obj["bioProjectAccession"] = ";".join(bioprojects) if bioprojects else None
# assemblyStats = record.get("assemblyStats", {})
# obj.update(assemblyStats)
# wgsInfo = record.get("wgsInfo", {})
# for key in ("masterWgsUrl", "wgsContigsUrl", "wgsProjectAccession"):
# obj[key] = wgsInfo.get(key, None)
# parsed[obj["genbankAssmAccession"]] = obj


def ncbi_datasets_summary_parser(_params, opts):
    """Fetch and parse NCBI Datasets summary.

    Runs ``datasets summary genome taxon <root>`` for the root taxon given in
    ``opts["ncbi-datasets-summary"]``, decodes the JSON written to stdout and
    passes each entry of its ``assemblies`` list to
    ``parse_ncbi_datasets_summary``.

    Args:
        _params: Unused.
        opts: Mapping of options; only ``"ncbi-datasets-summary"`` is read.

    Returns:
        list: The values collected in the ``parsed`` dict.

    Raises:
        SystemExit: With status 1 when the decoded summary has no
            ``assemblies`` key.
    """
    taxon = opts["ncbi-datasets-summary"]
    # Argument list (no shell), so the taxon value is never shell-interpreted.
    datasets = check_output(["datasets", "summary", "genome", "taxon", taxon])
    data = ujson.loads(datasets)
    if "assemblies" not in data:
        LOGGER.error("unable to fetch assemblies for %s", taxon)
        # Report the unexpected payload through the logger rather than stdout.
        LOGGER.error("datasets summary response: %s", data)
        sys.exit(1)
    parsed = {}
    for record in data["assemblies"]:
        parse_ncbi_datasets_summary(record, parsed)
    # NOTE(review): parse_ncbi_datasets_summary in this file returns before
    # populating `parsed`, so this list stays empty until that helper is
    # completed.
    return list(parsed.values())
# def ncbi_datasets_summary_parser(_params, opts):
# """Fetch and parse NCBI Datasets summary."""
# parsed = {}
# datasets = check_output(
# ["datasets", "summary", "genome", "taxon", opts["ncbi-datasets-summary"]]
# )
# data = ujson.loads(datasets)
# if "assemblies" not in data:
# LOGGER.error("unable to fetch assemblies for %s", opts["ncbi-datasets-summary"])
# print(data)
# sys.exit(1)
# for record in data["assemblies"]:
# parse_ncbi_datasets_summary(record, parsed)
# print(parsed)
# quit()
# # parsed = {}
# # with tofile.open_file_handle(
# # "%s/ncbi_dataset/data/assembly_data_report.jsonl" % directory
# # ) as report:
# # for line in report:
# # record = ujson.loads(line)
# # parse_ncbi_datasets_record(record, parsed)
# return [value for value in parsed.values()]
15 changes: 7 additions & 8 deletions src/genomehubs/lib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
genomehubs parse [--btk] [--btk-root STRING...]
[--wikidata PATH] [--wikidata-root STRING...] [--wikidata-xref STRING...]
[--gbif] [--gbif-root STRING...] [--gbif-xref STRING...]
[--ncbi-datasets-summary INT]
[--ncbi-datasets-genome PATH] [--outfile PATH]
[--refseq-mitochondria] [--refseq-organelles]
[--refseq-plastids] [--refseq-root NAME]
Expand All @@ -22,7 +21,6 @@
--wikidata PATH Parse taxa in WikiData dump
--wikidata-root STRING WikiData taxon ID of root taxon
--wikidata-xref STRING Include link to external reference from WikiData (e.g. NBN, BOLD)
--ncbi-datasets-summary INT Fetch and parse NCBI Datasets summary for a root taxId
--ncbi-datasets-genome PATH Parse NCBI Datasets genome directory
--outfile PATH Save parsed output to file
--refseq-mitochondria Parse mitochondrial genomes from the NCBI RefSeq
Expand All @@ -49,7 +47,8 @@
from .gbif import gbif_parser
from .hub import load_types
from .hub import order_parsed_fields
from .ncbi import ncbi_datasets_summary_parser

# from .ncbi import ncbi_datasets_summary_parser
from .ncbi import ncbi_genome_parser
from .ncbi import refseq_organelle_parser
from .version import __version__
Expand All @@ -65,11 +64,11 @@
"params": None,
"types": "assembly",
},
"ncbi-datasets-summary": {
"func": ncbi_datasets_summary_parser,
"params": None,
"types": "assembly",
},
# "ncbi-datasets-summary": {
# "func": ncbi_datasets_summary_parser,
# "params": None,
# "types": "assembly",
# },
"refseq-mitochondria": {
"func": refseq_organelle_parser,
"params": ("mitochondrion"),
Expand Down
40 changes: 36 additions & 4 deletions src/genomehubs/templates/assembly.types.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ file:
taxonomy:
taxon_id:
header: taxId
species:
header: speciesName
taxon:
header: organismName
names:
common_name:
header: commonName
Expand Down Expand Up @@ -133,10 +133,12 @@ attributes:
taxon_name: assembly_span
taxon_key: assembly_span
taxon_display_name: Assembly span
taxon_summary: median
taxon_summary:
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_traverse_limit: superkingdom
taxon_display_level: 1
taxon_bins:
min: 6
Expand All @@ -161,6 +163,21 @@ attributes:
header: contigN50
type: long
units: bases
taxon_display_group: assembly
taxon_name: contig_n50
taxon_key: contig_n50
taxon_summary:
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_display_level: 2
taxon_bins:
min: 4
max: 9
count: 10
scale: log10
contig_l50:
display_group: metrics
display_level: 2
Expand All @@ -179,6 +196,21 @@ attributes:
header: scaffoldN50
type: long
units: bases
taxon_display_group: assembly
taxon_name: scaffold_n50
taxon_key: scaffold_n50
taxon_summary:
- median
- min
- max
taxon_traverse: median
taxon_traverse_direction: both
taxon_display_level: 2
taxon_bins:
min: 4
max: 9
count: 10
scale: log10
scaffold_l50:
display_group: metrics
display_level: 2
Expand Down

0 comments on commit b03e3ca

Please sign in to comment.