Skip to content

Commit

Permalink
v0.10.0.dev0: improvements to display of search results
Browse files Browse the repository at this point in the history
* display search results `--as-text`
* select search results to save to file using the `--filter-rows` option
* when concatenating search results to an existing text file, the `--no-header` option omits the header so that only result rows are appended
* new classes to `sfftk.notes.find`: `CSVTable` and `CSVRow`
* associated unit tests with minor cleanup
  • Loading branch information
paulkorir committed Oct 24, 2023
1 parent 51807db commit bcddd64
Show file tree
Hide file tree
Showing 9 changed files with 328 additions and 86 deletions.
2 changes: 1 addition & 1 deletion sfftk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

BASE_DIR = os.path.dirname(__file__)

SFFTK_VERSION = 'v0.9.0.dev2'
SFFTK_VERSION = 'v0.10.0.dev0'
40 changes: 35 additions & 5 deletions sfftk/core/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,19 @@
search_notes_parser.add_argument(
'--rows', type=int, default=10, help="number of rows [default: 10]"
)
search_notes_parser.add_argument(
'--as-text', action='store_true', help="output as CSV [default: False]"
)
search_notes_parser.add_argument(
'--filter-rows', nargs='*',
help="space-separated list of search result rows to display; "
"only works when --as-text flag is selected [default: False]"
)
search_notes_parser.add_argument(
'--no-header', action='store_true',
help="do not show CSV header (useful when concatenating "
"search results to an existing file which already has the header) [default: False]"
)
ols_parser = search_notes_parser.add_argument_group(
title='EBI Ontology Lookup Service (OLS)',
description='The Ontology Lookup Service (OLS) is a repository for biomedical ontologies that aims to provide a '
Expand Down Expand Up @@ -1388,17 +1401,17 @@ def parse_args(_args, use_shlex=False):
print_date(f"error: mergemask can handle at most 255 masks ({len(args.masks)} provided)")
return 64, configs
if not _masks_exist(args):
print_date(f"error: one or more masks missing; please verify that all paths are correct")
print_date("error: one or more masks missing; please verify that all paths are correct")
return 65, configs
if not _mask_all_correct_files(args):
print_date(f"error: one or more invalid file formats; please retry")
print_date("error: one or more invalid file formats; please retry")
return 65, configs
if not _masks_have_same_dimensions(args):
print_date(
f"error: inhomogeneous masks: dimension differs between masks (use --verbose to view details)")
"error: inhomogeneous masks: dimension differs between masks (use --verbose to view details)")
return 65, configs
if not _masks_have_mode_zero(args):
print_date(f"error: mode must be zero (0); please run `sff prep binmap` first on all masks")
print_date("error: mode must be zero (0); please run `sff prep binmap` first on all masks")
return 65, configs
# starsplit
elif args.prep_subcommand == 'starsplit':
Expand All @@ -1412,7 +1425,7 @@ def parse_args(_args, use_shlex=False):
if args.output is None:
args.output = f"{pathlib.Path(args.star_file).stem}_{args.infix}_{args.rows}.star"
if args.rows <= 0:
print_date(f"error: rows must be positive")
print_date("error: rows must be positive")
return 65, configs

# view
Expand Down Expand Up @@ -1629,6 +1642,23 @@ def parse_args(_args, use_shlex=False):
):
print_date("Invalid usage: -O, -x, -o, -L, -l can only be used with -R ols")
return 64, None
if args.as_text:
if args.filter_rows:
# make sure all the values are digits
try:
assert all(map(lambda i: i.isdecimal(), args.filter_rows))
except AssertionError:
print_date("Invalid filter rows: {}; should be a comma-separated list of digits".format(
args.filter_rows))
return 64, configs
# now validate the filter rows against --start and --rows
# valid values will ensure that there is a non-empty intersection between the set of --filter-row values and the range {--start,..,(--start+--rows)-1}
filter_row_values = set(list(map(int, args.filter_rows)))
valid_index_values = set(range(args.start, args.start + args.rows))
if len(filter_row_values.intersection(valid_index_values)) == 0:
print_date(
f"Invalid filter rows: {args.filter_rows}; should be in range {args.start}-{args.start + args.rows - 1}")
return 64, configs
elif args.notes_subcommand == "show":
if args.segment_id is not None:
args.segment_id = list(map(int, args.segment_id.split(',')))
Expand Down
27 changes: 15 additions & 12 deletions sfftk/core/prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,16 +108,19 @@ class MergedMask:
new_labels = {15, 18, 19}
label_tree[15] = [5, 10]
label_tree[18] = [8, 10]
label_tree[19] = [9, 10] # => {1: 0, 2: 0, 3: [1, 2], 4: 0, 5: 0, 8: [3, 5], 9: [4, 5], 10: 0, 15: [5, 10], 18: [8, 10], 19: [9, 10]}
label_tree[19] = [9, 10] # => {1: 0, 2: 0, 3: [1, 2], 4: 0, 5: 0, 8: [3, 5], 9: [4, 5], 10: 0, 15: [5, 10],
18: [8, 10], 19: [9, 10]}
label_set = {1, 2, 3, 4, 5, 10, 15, 18, 19}
label = numpy.amax(merge_mask) + 1 = 20
# mask 6
merged_mask = [10, 18, 19, 15] + [1, 0, 1, 0] * 20 = [30, 18, 39, 15]
label_set = {1, 2, 3, 4, 5, 10, 15, 18, 19, 20}
label_tree[20] = 0 # => {1: 0, 2: 0, 3: [1, 2], 4: 0, 5: 0, 8: [3, 5], 9: [4, 5], 10: 0, 15: [5, 10], 18: [8, 10], 19: [9, 10], 20: 0}
label_tree[20] = 0 # => {1: 0, 2: 0, 3: [1, 2], 4: 0, 5: 0, 8: [3, 5], 9: [4, 5], 10: 0, 15: [5, 10],
18: [8, 10], 19: [9, 10], 20: 0}
new_labels = {30, 39}
label_tree[30] = [10, 20]
label_tree[39] = [19, 20] # => {1: 0, 2: 0, 3: [1, 2], 4: 0, 5: 0, 8: [3, 5], 9: [4, 5], 10: 0, 15: [5, 10], 18: [8, 10], 19: [9, 10], 20: 0, 30: [10, 20], 39: [19, 20]}
label_tree[39] = [19, 20] # => {1: 0, 2: 0, 3: [1, 2], 4: 0, 5: 0, 8: [3, 5], 9: [4, 5], 10: 0, 15: [5, 10],
18: [8, 10], 19: [9, 10], 20: 0, 30: [10, 20], 39: [19, 20]}
label_set = {1, 2, 3, 4, 5, 10, 15, 18, 19, 20, 30, 39}
label = numpy.amax(merge_mask) + 1 = 40
Expand Down Expand Up @@ -516,28 +519,28 @@ def mergemask(args, configs):
# fail fast: ensure the output does not exist
outfile = pathlib.Path(f"{args.output_prefix}.{args.mask_extension}")
if not args.overwrite and outfile.exists():
print_date(f"error: the file already exists; use --overwrite to overwrite the existing merged_mask or set a "
f"new output prefix using --output-prefix")
print_date("error: the file already exists; use --overwrite to overwrite the existing merged_mask or set a "
"new output prefix using --output-prefix")
return 64
# ensure that the files are binary
if args.skip_assessment:
print_date("info: skipping mask assessment; assuming all masks are binary...")
elif not _masks_all_binary(args, configs) and not args.skip:
print_date(f"error: one or more masks are non-binary; use --verbose to view details")
print_date("error: one or more masks are non-binary; use --verbose to view details")
return 65
# todo: allow cases where one or more files are non-binary
# ensure that they don't overlap each other
if not _masks_no_overlap(args, configs) and not args.allow_overlap:
print_date(f"error: one or more masks overlap; use --verbose to view details")
print_date(f"info: if overlapping segments are expected re-run with the --allow-overlap argument; "
f"see 'sff prep mergemask' for more information")
print_date("error: one or more masks overlap; use --verbose to view details")
print_date("info: if overlapping segments are expected re-run with the --allow-overlap argument; "
"see 'sff prep mergemask' for more information")
return 65
# now we can merge masks
if args.verbose:
print_date(f"info: proceeding to merge masks...")
print_date("info: proceeding to merge masks...")
merged_mask = _mergemask(args.masks)
if args.verbose:
print_date(f"info: merge complete...")
print_date("info: merge complete...")
if args.verbose:
print_date(f"info: attempting to write output to '{args.output_prefix}.{args.mask_extension}'...")
with mrcfile.new(f"{args.output_prefix}.{args.mask_extension}", overwrite=args.overwrite) as mrc:
Expand All @@ -556,7 +559,7 @@ def mergemask(args, configs):
print_date(f"info: mask metadata:\n{data}")
print(data, file=label_file)
if args.verbose:
print_date(f"info: merge complete!")
print_date("info: merge complete!")
return 0


Expand Down
133 changes: 123 additions & 10 deletions sfftk/notes/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def search(self, *args, **kwargs):
R = requests.get(url)
if R.status_code == 200:
self._response = R.text
return SearchResults(self)
return SearchResults(self, *args, **kwargs)
else:
print_date("Error: server responded with {}".format(R.text))
return None
Expand Down Expand Up @@ -397,6 +397,7 @@ def pc(self):

def render(self, row_data, index):
"""Render this field"""
text = ''
if self.is_index:
text = _str(index)
elif self._key is not None:
Expand Down Expand Up @@ -492,6 +493,37 @@ def __str__(self):
return string


class CSVRow(Table):
    """A single tab-separated row of search results.

    Each field is rendered through its ``TableField.render`` method, which may
    produce several wrapped lines; this class unwraps them back into one cell
    value per field.
    """
    column_separator = "\t"

    def __init__(self, row_data, fields, index, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._row_data = row_data
        self._fields = fields
        self._index = index
        self._rendered = self._render()

    def _render(self):
        """Undo the wrapping: return one stripped cell string per field."""
        cells = []
        for field in self._fields:
            wrapped_lines = field.render(self._row_data, self._index)
            stripped_lines = [line.strip() for line in wrapped_lines]
            # leading/trailing whitespace is dropped on every wrapped line;
            # free-text descriptions keep a single space between rejoined lines
            if str(field).strip() in ['description']:
                cells.append(' '.join(stripped_lines).strip())
            else:
                cells.append(''.join(stripped_lines))
        return cells

    def __bytes__(self):
        return str(self).encode('utf-8')

    def __str__(self):
        return self.column_separator.join(self._rendered).strip('\n')


class ResultsTable(Table):
"""Class that formats search results as a table"""

Expand Down Expand Up @@ -641,6 +673,65 @@ def __str__(self):
return _str(string)


class CSVTable(Table):
    """Class that formats search results as a CSV"""
    # NOTE(review): despite the CSV name, cells are separated by tabs (TSV)
    column_separator = "\t"

    def __init__(self, search_results, fields, width='auto', *args, **kwargs):
        """Build a text table over *search_results* using the given *fields*.

        :param search_results: a ``SearchResults`` instance; ``header``/``body``
            read its ``search_args`` (``no_header``, ``filter_rows``, ``start``)
            and its ``results`` iterable
        :param fields: sequence of field objects rendered per row via ``CSVRow``
        :param width: accepted for interface parity but not stored here;
            ``self._width`` used in ``body`` presumably comes from the base
            class or a class attribute — TODO confirm
        """
        super().__init__(*args, **kwargs)
        self._search_results = search_results
        self._fields = fields

    @property
    def header(self):
        """Tab-separated header line, or '' when --no-header was requested."""
        header = ""
        if self._search_results.search_args.no_header:
            return header
        field_names = list(map(lambda f: str(f).strip(), self._fields))
        header += self.column_separator.join(field_names) + self.row_separator
        return header

    @property
    def body(self):
        """All result rows as tab-separated lines.

        When ``--filter-rows`` is in effect only rows whose 1-based display
        index (starting at ``search_args.start``) matches a requested value are
        emitted; otherwise every row is emitted.
        """
        # display index of the first row is the --start value, not zero
        index = self._search_results.search_args.start
        if self._search_results.results is not None:
            body = ""
            for row_data in self._search_results.results:
                # if the user is using --filter-rows then only include rows with the specified indexes
                if self._search_results.search_args.filter_rows:
                    # filter_rows holds strings, hence the str(index) comparison
                    if str(index) in self._search_results.search_args.filter_rows:
                        row_string = str(CSVRow(row_data, self._fields, index)) + self.row_separator
                        body += row_string
                        index += 1
                        continue
                    # row not selected: still advance the display index
                    index += 1
                else:
                    # otherwise display all rows
                    row_string = str(CSVRow(row_data, self._fields, index)) + self.row_separator
                    body += row_string
                    index += 1
        else:
            # no results payload: emit a centred placeholder message
            body = '\nNo data found at this time. Please try again in a few minutes.'.center(
                self._width) + self.row_separator
            body += self.row_separator
            # NOTE(review): this dashed rule relies on self._width being an int
            body += "-" * self._width + self.row_separator
        return body.strip('\n')

    @property
    def footer(self):
        """CSV output has no footer."""
        return ""

    def __bytes__(self):
        return self.__str__().encode('utf-8')

    def __str__(self):
        string = ""
        string += self.header
        string += self.body
        string += self.footer
        return _str(string)


class SearchResults(object):
"""SearchResults class"""
_width = 180 # unreasonable default
Expand All @@ -651,7 +742,7 @@ class SearchResults(object):
DESCRIPTION_WIDTH = 80
TYPE_WIDTH = 18

def __init__(self, resource):
def __init__(self, resource, as_text=False):
self._resource = resource # the resource that was searched
self._raw_response = resource.response
self._structured_response = self._structure_response()
Expand All @@ -660,6 +751,7 @@ def __init__(self, resource):
self._width = terminal_size.columns
else:
self._width = self._width
self.as_text = as_text

@property
def structured_response(self):
Expand Down Expand Up @@ -704,9 +796,9 @@ def __bytes__(self):
return self.__str__().encode('utf-8')

def __str__(self):
return self.tabulate()
return self.tabulate(as_text=self.as_text)

def tabulate(self):
def tabulate(self, as_text=False):
"""Tabulate the search results"""
table = Styled("[[ ''|fg-yellow:no-end ]]") # ""
if self._resource.name == 'OLS':
Expand Down Expand Up @@ -751,7 +843,11 @@ def tabulate(self):
TableField('accession', key='short_form', pc=10, justify='center'),
TableField('description', key='description', pc=40, is_iterable=True),
]
table += _str(ResultsTable(self, fields=fields))
if as_text:
# exclude colour decoration
table = str(CSVTable(self, fields=fields))
else:
table += _str(ResultsTable(self, fields=fields))
elif self._resource.name == 'GO':
fields = [
TableField('index', key='index', pc=5, is_index=True, justify='right'),
Expand All @@ -761,7 +857,10 @@ def tabulate(self):
TableField('accession', key='short_form', pc=10, justify='center'),
TableField('description', key='description', pc=40, is_iterable=True),
]
table += _str(ResultsTable(self, fields=fields))
if as_text:
table = str(CSVTable(self, fields=fields))
else:
table += _str(ResultsTable(self, fields=fields))
elif self._resource.name == 'EMDB':
fields = [
TableField('index', key='index', pc=5, is_index=True, justify='right'),
Expand All @@ -771,7 +870,10 @@ def tabulate(self):
TableField('accession', key='emdb_id', pc=10, _format='{}', justify='center'),
TableField('description', key=['admin', 'title'], pc=40),
]
table += _str(ResultsTable(self, fields=fields))
if as_text:
table = str(CSVTable(self, fields=fields))
else:
table += _str(ResultsTable(self, fields=fields))
elif self._resource.name == "UniProt":
fields = [
TableField('index', pc=5, is_index=True, justify='right'),
Expand All @@ -793,7 +895,10 @@ def tabulate(self):
# TableField('title', key='organism_scientific_name', pc=20, is_iterable=True),
TableField('description', key='title', pc=40),
]
table += _str(ResultsTable(self, fields=fields))
if as_text:
table = str(CSVTable(self, fields=fields))
else:
table += _str(ResultsTable(self, fields=fields))
elif self._resource.name == 'Europe PMC':
fields = [
TableField('index', pc=5, is_index=True, justify='right'),
Expand All @@ -804,7 +909,10 @@ def tabulate(self):
TableField('description (title)', key='title', pc=25),
# TableField('iri (doi)', key='doi', _format='https://doi.org/{}', pc=30)
]
table += _str(ResultsTable(self, fields=fields))
if as_text:
table = str(CSVTable(self, fields=fields))
else:
table += _str(ResultsTable(self, fields=fields))
elif self._resource.name == 'EMPIAR':
fields = [
TableField('index', pc=5, is_index=True, justify='right'),
Expand All @@ -822,8 +930,13 @@ def tabulate(self):
self.structured_response[empiar_accession]['empiarid'] = empiar_accession
structured_response.append(self.structured_response[empiar_accession])
self._structured_response = structured_response
table += _str(ResultsTable(self, fields=fields))
if as_text:
table = str(CSVTable(self, fields=fields))
else:
table += _str(ResultsTable(self, fields=fields))
# close style
if as_text:
return table
table += Styled("[[ ''|reset ]]")
return _str(table)

Expand Down
5 changes: 3 additions & 2 deletions sfftk/sff.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,10 @@ def handle_notes_search(args, configs):
# query
resource = find.SearchResource(args, configs)
# fixme: use print_date
print(resource)
if not args.as_text:
print(resource)
# search
result = resource.search()
result = resource.search(as_text=args.as_text)
if result is not None:
# fixme: use print_date
print(result)
Expand Down

0 comments on commit bcddd64

Please sign in to comment.