Merge pull request #549 from GavinHuttley/develop

Mixed changes to Table
cogent3 · Mar 3, 2020 · b27000a · b27000a
2 parents 1917865 + 9ea27cc
commit b27000a
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 35 deletions.
diff --git a/doc/examples/handling_tabular_data.rst b/doc/examples/handling_tabular_data.rst
@@ -989,7 +989,7 @@ In the above example, the data type in a column is static, e.g. all values in ``
        edge.1           root       4.0    1.0    3.0    6.0
     -------------------------------------------------------
 
-If you invoke the ``static_column_types`` argument and the column data are not static, you'll get a ``ValueError``. We show this by first creating a simple table with mixed data types in a column, write to file and then try to load with  ``static_column_types=True``.
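+If you invoke the ``static_column_types`` argument and the data in a column are not static (i.e. the column holds mixed types), that column is loaded as strings. We show this by first creating a simple table with mixed data types in a column, writing it to file and then loading it with ``static_column_types=True``.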
 
 .. doctest::
 
@@ -1003,8 +1003,8 @@ If you invoke the ``static_column_types`` argument and the column data are not s
     ------
     >>> t3b.write('test3b.txt', sep='\t')
     >>> t3b = load_table('test3b.txt', sep='\t', static_column_types=True)
-    Traceback (most recent call last):
-    ValueError: invalid literal for int() with base 10: 'a'
+    >>> t3b.columns["A"]
+    array(['1', 'a'], dtype='<U1')
 
 We also test the reader function for a tab delimited format with missing data at the end.
 

diff --git a/src/cogent3/__init__.py b/src/cogent3/__init__.py
@@ -392,6 +392,7 @@ def load_table(
     static_column_types=False,
     limit=None,
     format="simple",
+    skip_inconsistent=False,
     **kwargs,
 ):
     """
@@ -445,13 +446,14 @@ def load_table(
         a pandas DataFrame, supersedes header/rows
     format
         output format when using str(Table)
-
+    skip_inconsistent
+        if True, skips rows whose number of fields differs from the header row, otherwise such rows raise a ValueError
     """
     filename = str(filename)
     sep = sep or kwargs.pop("delimiter", None)
     file_format, compress_format = get_format_suffixes(filename)
 
-    if not (reader or static_column_types):
+    if not reader:
         if file_format == "pickle":
             f = open_(filename, mode="rb")
             loaded_table = pickle.load(f)
@@ -467,6 +469,15 @@ def load_table(
         header, rows, loaded_title, legend = load_delimited(
             filename, delimiter=sep, limit=limit, **kwargs
         )
+        if skip_inconsistent:
+            num_fields = len(header)
+            rows = [r for r in rows if len(r) == num_fields]
+        else:
+            lengths = set(map(len, [header] + rows))
+            if len(lengths) != 1:
+                msg = f"inconsistent number of fields {lengths}"
+                raise ValueError(msg)
+
         title = title or loaded_title
         data = {}
         for column in zip(header, *rows):
@@ -476,23 +487,10 @@ def load_table(
         rows = data
     else:
         f = open_(filename, newline=None)
-        if not reader:
-            if file_format == "csv":
-                sep = sep or ","
-            elif file_format == "tsv":
-                sep = sep or "\t"
-            elif not sep:
-                raise ValueError(
-                    "static_column_types option requires a value " "for sep"
-                )
-
-            reader = autogen_reader(
-                f, sep, limit=limit, with_title=kwargs.get("with_title", False)
-            )
-
         rows = [row for row in reader(f)]
         f.close()
         header = rows.pop(0)
+
     return make_table(
         header=header,
         data=rows,

diff --git a/src/cogent3/parse/table.py b/src/cogent3/parse/table.py
@@ -172,6 +172,11 @@ def load_delimited(
         f = open(filename, newline=None)
 
     reader = csv.reader(f, dialect="excel", delimiter=delimiter)
+    if with_title:
+        title = "".join(next(reader))
+    else:
+        title = ""
+
     rows = []
     num_lines = 0
     for row in reader:
@@ -180,10 +185,6 @@ def load_delimited(
         if limit is not None and num_lines >= limit:
             break
     f.close()
-    if with_title:
-        title = "".join(rows.pop(0))
-    else:
-        title = ""
     if header:
         header = rows.pop(0)
     else:
@@ -193,16 +194,4 @@ def load_delimited(
     else:
         legend = ""
     # now do type casting in the order int, float, default is string
-    # for row in rows:
-    #     for cdex, cell in enumerate(row):
-    #         try:
-    #             cell = int(cell)
-    #             row[cdex] = cell
-    #         except ValueError:
-    #             try:
-    #                 cell = float(cell)
-    #                 row[cdex] = cell
-    #             except ValueError:
-    #                 pass
-    #             pass
     return header, rows, title, legend
diff --git a/tests/test_util/test_table.py b/tests/test_util/test_table.py
@@ -157,6 +157,16 @@ def test_indexing_rows(self):
         t = Table(header=self.t7_header, data=self.t7_rows, row_ids="gene")
         self.assertEqual(t["ENSG00000019485", "chrom"], "A")
 
+    def test_immutability_cells(self):
+        """table cells are immutable"""
+        t = Table(header=self.t7_header, data=self.t7_rows, row_ids="gene")
+        with self.assertRaises(TypeError):
+            t["ENSG00000019485", "chrom"] = "D"
+
+        # even via column instance
+        with self.assertRaises(ValueError):
+            t.columns["chrom"]["ENSG00000019485"] = "D"
+
     def test_slicing_table(self):
         """works using column names, ints, bool array"""
         t = Table(header=self.t5_header, data=self.t5_rows)
@@ -816,6 +826,33 @@ def test_load_mixed(self):
             self.assertEqual(str(t), str(r))
             self.assertTrue("float", r.columns["float"].dtype.name)
 
+    def test_load_mixed_static(self):
+        """load data, mixed data type columns remain as string"""
+        t = make_table(header=["A", "B"], data=[[1, 1], ["a", 2]])
+        with TemporaryDirectory(".") as dirname:
+            path = pathlib.Path(dirname) / "table.txt"
+            t.write(str(path), sep="\t")
+            # if static types, then mixed columns become strings
+            r = load_table(path, sep="\t", static_column_types=True)
+            self.assertTrue("str" in r.columns["A"].dtype.name)
+
+    def test_load_mixed_row_lengths(self):
+        """skip_inconsistent skips rows that have different length to header"""
+        h = list("ABCDE")
+        r = [list("12345"), list("000"), list("12345")]
+        text = "\n".join(["\t".join(l) for l in [h] + r])
+        with TemporaryDirectory(".") as dirname:
+            path = pathlib.Path(dirname) / "table.tsv"
+            with open(path, "w") as out:
+                out.write(text)
+            r = load_table(path, skip_inconsistent=True)
+            self.assertEqual(r.shape, (2, 5))
+            self.assertEqual(r.header, tuple(h))
+            self.assertEqual(r.array.tolist(), [list(range(1, 6))] * 2)
+            # loading without skip_inconsistent raises a ValueError
+            with self.assertRaises(ValueError):
+                r = load_table(path, skip_inconsistent=False)
+
     def test_load_table_returns_static_columns(self):
         """for static data, load_table gives same dtypes for static_columns_type=True/False"""
         t = load_table("data/sample.tsv", sep="\t", static_column_types=False)