diff --git a/src/genomehubs/lib/hub.py b/src/genomehubs/lib/hub.py
index 6da6d4d4..e3f1315a 100644
--- a/src/genomehubs/lib/hub.py
+++ b/src/genomehubs/lib/hub.py
@@ -503,3 +503,31 @@ def set_column_indices(types, header):
         index = headers.get(value["header"], None)
         if index is not None:
             value.update({"index": index})
+
+
+def write_imported_rows(rows, opts, *, types, header=None, label="imported"):
+    """Write imported rows to processed file."""
+    file_key = "%s-exception" % opts["index"]
+    dir_key = "%s-dir" % opts["index"]
+    if file_key in opts and opts[file_key]:
+        outdir = opts[file_key]
+    else:
+        outdir = "%s/%s" % (opts[dir_key], label)
+    os.makedirs(outdir, exist_ok=True)
+    outfile = "%s/%s" % (outdir, types["file"]["name"])
+    data = []
+    header_len = 0
+    if header is not None:
+        data.append(header)
+        header_len = 1
+    if isinstance(rows, dict):
+        for row_set in rows.values():
+            for row in row_set:
+                data.append(row)
+    else:
+        for row in rows:
+            data.append(row)
+    LOGGER.info(
+        "Writing %d records to %s file '%s'", len(data) - header_len, label, outfile
+    )
+    tofile.write_file(outfile, data)
diff --git a/src/genomehubs/lib/index.py b/src/genomehubs/lib/index.py
index 5a1c0445..344c5e0b 100644
--- a/src/genomehubs/lib/index.py
+++ b/src/genomehubs/lib/index.py
@@ -71,6 +71,7 @@
 from .hub import process_row
 from .hub import set_column_indices
 from .hub import validate_types_file
+from .hub import write_imported_rows
 from .taxon import add_names_and_attributes_to_taxa
 from .taxon import fix_missing_ids
 from .version import __version__
@@ -84,14 +85,16 @@ def index_file(es, types, data, opts):
     rows = csv.reader(
         data, delimiter=delimiters[types["file"]["format"]], quotechar='"'
     )
-    header = None
-    if types["file"].get("header", False):
+    if "header" in types["file"] and types["file"]["header"]:
         header = next(rows)
         set_column_indices(types, header)
+    else:
+        header = None
     with_ids = defaultdict(list)
     taxon_asm_data = defaultdict(list)
     without_ids = defaultdict(list)
     failed_rows = defaultdict(list)
+    imported_rows = []
     blanks = set(["", "NA", "N/A", "None"])
     taxon_types = {}
     for taxonomy_name in opts["taxonomy-source"]:
@@ -112,6 +115,7 @@
                 taxon_asm_data[processed_data["taxonomy"]["taxon_id"]].append(
                     taxon_data
                 )
+                imported_rows.append(row)
             else:
                 if "taxonomy" in types and "alt_taxon_id" in types["taxonomy"]:
                     without_ids[processed_data["taxonomy"]["alt_taxon_id"]].append(
@@ -147,11 +151,15 @@
             types=types,
             taxon_template=taxon_template,
             failed_rows=failed_rows,
+            imported_rows=imported_rows,
             with_ids=with_ids,
             blanks=blanks,
             header=header,
         )
     if with_ids or create_ids:
+        write_imported_rows(
+            imported_rows, opts, types=types, header=header, label="imported"
+        )
         LOGGER.info("Indexing %d entries", len(with_ids.keys()))
         if opts["index"] == "taxon":
             docs = add_names_and_attributes_to_taxa(
@@ -164,6 +172,7 @@
                 _op_type="update",
             )
         elif opts["index"] == "assembly":
+            # TODO: keep track of taxon_id not found exceptions
             assembly_template = assembly.index_template(taxonomy_name, opts)
             docs = add_identifiers_and_attributes_to_assemblies(
                 es,
diff --git a/src/genomehubs/lib/taxon.py b/src/genomehubs/lib/taxon.py
index 19187de3..c60e7dd6 100644
--- a/src/genomehubs/lib/taxon.py
+++ b/src/genomehubs/lib/taxon.py
@@ -2,11 +2,9 @@
 
 """Taxon methods."""
 
-import os
 import sys
 from collections import defaultdict
 
-from tolkein import tofile
 from tolkein import tolog
 from tqdm import tqdm
 
@@ -17,6 +15,7 @@
 from .hub import add_attribute_values
 from .hub import chunks
 from .hub import index_templator
+from .hub import write_imported_rows
 from .taxonomy import index_template as taxonomy_index_template
 
 LOGGER = tolog.logger(__name__)
@@ -177,6 +176,7 @@ def fix_missing_ids(
     types,
     taxon_template,
     failed_rows,
+    imported_rows,
     with_ids=None,
     blanks=set(["NA", "None"]),
     header=None,
@@ -211,29 +211,15 @@ def fix_missing_ids(
     if without_ids and failed_rows:
         for key, value in found_ids.items():
             if key in failed_rows:
+                imported_rows += failed_rows[key]
                 del failed_rows[key]
         if failed_rows:
             LOGGER.info(
                 "Unable to associate %d records with taxon IDs", len(failed_rows)
             )
-            data = []
-            exception_key = "%s-exception" % opts["index"]
-            dir_key = "%s-dir" % opts["index"]
-            if exception_key in opts and opts[exception_key]:
-                outdir = opts[exception_key]
-            else:
-                outdir = "%s/exceptions" % opts[dir_key]
-            os.makedirs(outdir, exist_ok=True)
-            outfile = "%s/%s" % (outdir, types["file"]["name"])
-            if header:
-                data.append(header)
-            for rows in failed_rows.values():
-                for row in rows:
-                    data.append(row)
-            LOGGER.info(
-                "Writing %d records to exceptions file '%s", len(data) - 1, outfile
+            write_imported_rows(
+                failed_rows, opts, types=types, header=header, label="exceptions"
             )
-            tofile.write_file(outfile, data)
     return with_ids, without_ids
 
 
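For reference, a minimal usage sketch of the new write_imported_rows helper, not part of the patch. It assumes an opts dict carrying the index name plus the "<index>-dir" (and optional "<index>-exception") output options, and a types dict whose types["file"]["name"] gives the output file name, matching the structures passed around in index_file above; the paths, file name, header and rows below are hypothetical placeholders.

from genomehubs.lib.hub import write_imported_rows

# Hypothetical inputs mirroring the structures used in index_file()
opts = {
    "index": "taxon",
    "taxon-dir": "/tmp/genomehubs",  # assumed value for the "%s-dir" option
}
types = {"file": {"name": "example.tsv"}}
header = ["taxon_id", "assembly_span"]
rows = [["9606", "3100000000"]]

# With no "taxon-exception" override set, this writes
# /tmp/genomehubs/imported/example.tsv containing the header plus one row
write_imported_rows(rows, opts, types=types, header=header, label="imported")

# A dict of row lists (e.g. failed_rows keyed by alt_taxon_id) is also accepted,
# as in the "exceptions" call from fix_missing_ids(); this writes to
# /tmp/genomehubs/exceptions/example.tsv
write_imported_rows(
    {"some_key": rows}, opts, types=types, header=header, label="exceptions"
)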