
Commit

Merge branch 'main' of https://github.com/genomehubs/genomehubs into main
rjchallis committed Mar 3, 2021
2 parents c6c9c04 + b46fc32 commit cdc7962
Showing 3 changed files with 44 additions and 21 deletions.
28 changes: 28 additions & 0 deletions src/genomehubs/lib/hub.py
@@ -503,3 +503,31 @@ def set_column_indices(types, header):
                 index = headers.get(value["header"], None)
                 if index is not None:
                     value.update({"index": index})
+
+
+def write_imported_rows(rows, opts, *, types, header=None, label="imported"):
+    """Write imported rows to processed file."""
+    file_key = "%s-exception" % opts["index"]
+    dir_key = "%s-dir" % opts["index"]
+    if file_key in opts and opts[file_key]:
+        outdir = opts[file_key]
+    else:
+        outdir = "%s/%s" % (opts[dir_key], label)
+    os.makedirs(outdir, exist_ok=True)
+    outfile = "%s/%s" % (outdir, types["file"]["name"])
+    data = []
+    header_len = 0
+    if header is not None:
+        data.append(header)
+        header_len = 1
+    if isinstance(rows, dict):
+        for row_set in rows.values():
+            for row in row_set:
+                data.append(row)
+    else:
+        for row in rows:
+            data.append(row)
+    LOGGER.info(
+        "Writing %d records to %s file '%s'", len(data) - header_len, label, outfile
+    )
+    tofile.write_file(outfile, data)
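
For context, here is a minimal sketch of how the new helper could be exercised on its own. The opts keys, directory, file name and rows below are illustrative assumptions rather than values from this commit, and it presumes hub.py already imports os and tolkein's tofile and defines LOGGER, as the existing code it refactors did:

# Hypothetical usage of write_imported_rows with made-up values.
from genomehubs.lib.hub import write_imported_rows

opts = {
    "index": "taxon",  # selects the "taxon-dir"/"taxon-exception" option keys
    "taxon-dir": "/tmp/genomehubs-demo",  # assumed working directory
    "taxon-exception": "",  # falsy, so output goes to a per-label subdirectory
}
types = {"file": {"name": "example.tsv"}}  # output reuses the input file name
header = ["taxon_id", "scientific_name"]
rows = [["9606", "Homo sapiens"], ["10090", "Mus musculus"]]

# Expected to write /tmp/genomehubs-demo/imported/example.tsv containing the
# header line followed by the two rows.
write_imported_rows(rows, opts, types=types, header=header, label="imported")

Because of the isinstance(rows, dict) branch, the same helper also accepts a dict of row lists, which is how the exceptions path in taxon.py uses it below.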
13 changes: 11 additions & 2 deletions src/genomehubs/lib/index.py
@@ -71,6 +71,7 @@
 from .hub import process_row
 from .hub import set_column_indices
 from .hub import validate_types_file
+from .hub import write_imported_rows
 from .taxon import add_names_and_attributes_to_taxa
 from .taxon import fix_missing_ids
 from .version import __version__
@@ -84,14 +85,16 @@ def index_file(es, types, data, opts):
     rows = csv.reader(
         data, delimiter=delimiters[types["file"]["format"]], quotechar='"'
     )
-    header = None
-    if types["file"].get("header", False):
+    if "header" in types["file"] and types["file"]["header"]:
         header = next(rows)
         set_column_indices(types, header)
+    else:
+        header = None
     with_ids = defaultdict(list)
     taxon_asm_data = defaultdict(list)
     without_ids = defaultdict(list)
     failed_rows = defaultdict(list)
+    imported_rows = []
     blanks = set(["", "NA", "N/A", "None"])
     taxon_types = {}
     for taxonomy_name in opts["taxonomy-source"]:
@@ -112,6 +115,7 @@
                 taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append(
                     taxon_data
                 )
+                imported_rows.append(row)
             else:
                 if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]:
                     without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append(
@@ -147,11 +151,15 @@ def index_file(es, types, data, opts):
             types=types,
             taxon_template=taxon_template,
             failed_rows=failed_rows,
+            imported_rows=imported_rows,
             with_ids=with_ids,
             blanks=blanks,
             header=header,
         )
         if with_ids or create_ids:
+            write_imported_rows(
+                imported_rows, opts, types=types, header=header, label="imported"
+            )
             LOGGER.info("Indexing %d entries", len(with_ids.keys()))
             if opts["index"] == "taxon":
                 docs = add_names_and_attributes_to_taxa(
@@ -164,6 +172,7 @@ def index_file(es, types, data, opts):
                     _op_type="update",
                 )
             elif opts["index"] == "assembly":
+                # TODO: keep track of taxon_id not found exceptions
                 assembly_template = assembly.index_template(taxonomy_name, opts)
                 docs = add_identifiers_and_attributes_to_assemblies(
                     es,
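
The index_file changes boil down to keeping a flat list of successfully imported rows alongside the keyed dict of failed rows, so both can be written out later. A standalone toy sketch of that bookkeeping pattern (not the genomehubs API; the names only mirror the diff):

from collections import defaultdict

def toy_index(rows, lookup):
    """Toy version of the imported/failed row bookkeeping in index_file."""
    with_ids = defaultdict(list)
    failed_rows = defaultdict(list)
    imported_rows = []  # rows whose taxon_id resolved straight away
    for row in rows:
        taxon_id = lookup.get(row[0])
        if taxon_id is not None:
            with_ids[taxon_id].append(row)
            imported_rows.append(row)
        else:
            failed_rows[row[0]].append(row)  # keyed so later fixes can reclaim them
    return with_ids, imported_rows, failed_rows

with_ids, imported, failed = toy_index(
    [["Homo sapiens", "GCA_000001405"], ["Unknownus specius", "GCA_999999999"]],
    {"Homo sapiens": "9606"},
)
# imported == [["Homo sapiens", "GCA_000001405"]]; the unresolved row stays in
# failed, mirroring how fix_missing_ids can later move entries from failed_rows
# into imported_rows once an alt_taxon_id is resolved.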
24 changes: 5 additions & 19 deletions src/genomehubs/lib/taxon.py
@@ -2,11 +2,9 @@

 """Taxon methods."""

-import os
 import sys
 from collections import defaultdict

-from tolkein import tofile
 from tolkein import tolog
 from tqdm import tqdm

@@ -17,6 +15,7 @@
 from .hub import add_attribute_values
 from .hub import chunks
 from .hub import index_templator
+from .hub import write_imported_rows
 from .taxonomy import index_template as taxonomy_index_template

 LOGGER = tolog.logger(__name__)
@@ -177,6 +176,7 @@ def fix_missing_ids(
     types,
     taxon_template,
     failed_rows,
+    imported_rows,
     with_ids=None,
     blanks=set(["NA", "None"]),
     header=None,
Expand Down Expand Up @@ -211,29 +211,15 @@ def fix_missing_ids(
if without_ids and failed_rows:
for key, value in found_ids.items():
if key in failed_rows:
imported_rows += failed_rows[key]
del failed_rows[key]
if failed_rows:
LOGGER.info(
"Unable to associate %d records with taxon IDs", len(failed_rows)
)
data = []
exception_key = "%s-exception" % opts["index"]
dir_key = "%s-dir" % opts["index"]
if exception_key in opts and opts[exception_key]:
outdir = opts[exception_key]
else:
outdir = "%s/exceptions" % opts[dir_key]
os.makedirs(outdir, exist_ok=True)
outfile = "%s/%s" % (outdir, types["file"]["name"])
if header:
data.append(header)
for rows in failed_rows.values():
for row in rows:
data.append(row)
LOGGER.info(
"Writing %d records to exceptions file '%s", len(data) - 1, outfile
write_imported_rows(
failed_rows, opts, types=types, header=header, label="exceptions"
)
tofile.write_file(outfile, data)
return with_ids, without_ids
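
The net effect in fix_missing_ids is that rows rescued via alt_taxon_id are moved into imported_rows, while anything still unresolved is written out through the shared helper with label="exceptions". A small sketch of that call shape with made-up values (the paths, file name and rows are assumptions, not taken from the commit):

# Hypothetical call mirroring the new exceptions path in fix_missing_ids.
from genomehubs.lib.hub import write_imported_rows

opts = {"index": "taxon", "taxon-dir": "/tmp/genomehubs-demo"}
types = {"file": {"name": "example.tsv"}}
failed_rows = {  # keyed by alt_taxon_id, as in index_file/fix_missing_ids
    "alt-123": [["alt-123", "Unknownus specius"]],
    "alt-456": [["alt-456", "Dubius taxonus"]],
}

# The isinstance(rows, dict) branch flattens the per-key lists, so this is expected
# to write /tmp/genomehubs-demo/exceptions/example.tsv with the two rows and no header.
write_imported_rows(failed_rows, opts, types=types, header=None, label="exceptions")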

