Skip to content

Commit

Permalink
Merge pull request #549 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
Mixed changes to Table
  • Loading branch information
GavinHuttley committed Mar 3, 2020
2 parents 1917865 + 9ea27cc commit b27000a
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 35 deletions.
6 changes: 3 additions & 3 deletions doc/examples/handling_tabular_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -989,7 +989,7 @@ In the above example, the data type in a column is static, e.g. all values in ``
edge.1 root 4.0 1.0 3.0 6.0
-------------------------------------------------------

If you invoke the ``static_column_types`` argument and the column data are not static, you'll get a ``ValueError``. We show this by first creating a simple table with mixed data types in a column, write to file and then try to load with ``static_column_types=True``.
If you invoke the ``static_column_types`` argument and the data in a column are not all of one type, that column is returned with a string type.

.. doctest::

Expand All @@ -1003,8 +1003,8 @@ If you invoke the ``static_column_types`` argument and the column data are not s
------
>>> t3b.write('test3b.txt', sep='\t')
>>> t3b = load_table('test3b.txt', sep='\t', static_column_types=True)
Traceback (most recent call last):
ValueError: invalid literal for int() with base 10: 'a'
>>> t3b.columns["A"]
array(['1', 'a'], dtype='<U1')

We also test the reader function for a tab delimited format with missing data at the end.

Expand Down
30 changes: 14 additions & 16 deletions src/cogent3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ def load_table(
static_column_types=False,
limit=None,
format="simple",
skip_inconsistent=False,
**kwargs,
):
"""
Expand Down Expand Up @@ -445,13 +446,14 @@ def load_table(
a pandas DataFrame, supersedes header/rows
format
output format when using str(Table)
skip_inconsistent
skips rows whose length differs from that of the header row
"""
filename = str(filename)
sep = sep or kwargs.pop("delimiter", None)
file_format, compress_format = get_format_suffixes(filename)

if not (reader or static_column_types):
if not reader:
if file_format == "pickle":
f = open_(filename, mode="rb")
loaded_table = pickle.load(f)
Expand All @@ -467,6 +469,15 @@ def load_table(
header, rows, loaded_title, legend = load_delimited(
filename, delimiter=sep, limit=limit, **kwargs
)
if skip_inconsistent:
num_fields = len(header)
rows = [r for r in rows if len(r) == num_fields]
else:
lengths = set(map(len, [header] + rows))
if len(lengths) != 1:
msg = f"inconsistent number of fields {lengths}"
raise ValueError(msg)

title = title or loaded_title
data = {}
for column in zip(header, *rows):
Expand All @@ -476,23 +487,10 @@ def load_table(
rows = data
else:
f = open_(filename, newline=None)
if not reader:
if file_format == "csv":
sep = sep or ","
elif file_format == "tsv":
sep = sep or "\t"
elif not sep:
raise ValueError(
"static_column_types option requires a value " "for sep"
)

reader = autogen_reader(
f, sep, limit=limit, with_title=kwargs.get("with_title", False)
)

rows = [row for row in reader(f)]
f.close()
header = rows.pop(0)

return make_table(
header=header,
data=rows,
Expand Down
21 changes: 5 additions & 16 deletions src/cogent3/parse/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,11 @@ def load_delimited(
f = open(filename, newline=None)

reader = csv.reader(f, dialect="excel", delimiter=delimiter)
if with_title:
title = "".join(next(reader))
else:
title = ""

rows = []
num_lines = 0
for row in reader:
Expand All @@ -180,10 +185,6 @@ def load_delimited(
if limit is not None and num_lines >= limit:
break
f.close()
if with_title:
title = "".join(rows.pop(0))
else:
title = ""
if header:
header = rows.pop(0)
else:
Expand All @@ -193,16 +194,4 @@ def load_delimited(
else:
legend = ""
# now do type casting in the order int, float, default is string
# for row in rows:
# for cdex, cell in enumerate(row):
# try:
# cell = int(cell)
# row[cdex] = cell
# except ValueError:
# try:
# cell = float(cell)
# row[cdex] = cell
# except ValueError:
# pass
# pass
return header, rows, title, legend
37 changes: 37 additions & 0 deletions tests/test_util/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,16 @@ def test_indexing_rows(self):
t = Table(header=self.t7_header, data=self.t7_rows, row_ids="gene")
self.assertEqual(t["ENSG00000019485", "chrom"], "A")

def test_immutability_cells(self):
    """assigning to a table cell fails, directly or via its column"""
    gene_id = "ENSG00000019485"
    table = Table(header=self.t7_header, data=self.t7_rows, row_ids="gene")

    # item assignment on the table itself is expected to raise TypeError
    with self.assertRaises(TypeError):
        table[gene_id, "chrom"] = "D"

    # going through the column instance is expected to raise ValueError
    with self.assertRaises(ValueError):
        table.columns["chrom"][gene_id] = "D"

def test_slicing_table(self):
"""works using column names, ints, bool array"""
t = Table(header=self.t5_header, data=self.t5_rows)
Expand Down Expand Up @@ -816,6 +826,33 @@ def test_load_mixed(self):
self.assertEqual(str(t), str(r))
self.assertTrue("float", r.columns["float"].dtype.name)

def test_load_mixed_static(self):
    """load data, mixed data type columns remain as string

    Column "A" holds an int and a str, so after a write/load round trip
    with static_column_types=True its dtype name should contain "str".
    """
    t = make_table(header=["A", "B"], data=[[1, 1], ["a", 2]])
    # dir must be given by keyword: the first positional argument of
    # TemporaryDirectory is the name *suffix*, not the parent directory
    with TemporaryDirectory(dir=".") as dirname:
        path = pathlib.Path(dirname) / "table.txt"
        t.write(str(path), sep="\t")
        # if static types, then mixed columns become strings
        r = load_table(path, sep="\t", static_column_types=True)
        # assertIn reports both operands on failure, unlike assertTrue
        self.assertIn("str", r.columns["A"].dtype.name)

def test_load_mixed_row_lengths(self):
    """skip_inconsistent skips rows that have different length to header

    Without skip_inconsistent, loading the same file is expected to
    raise a ValueError.
    """
    header = list("ABCDE")
    # the middle row has 3 fields, so it disagrees with the 5 field header
    rows = [list("12345"), list("000"), list("12345")]
    text = "\n".join("\t".join(line) for line in [header] + rows)
    # dir must be given by keyword: the first positional argument of
    # TemporaryDirectory is the name *suffix*, not the parent directory
    with TemporaryDirectory(dir=".") as dirname:
        path = pathlib.Path(dirname) / "table.tsv"
        with open(path, "w") as out:
            out.write(text)
        table = load_table(path, skip_inconsistent=True)
        self.assertEqual(table.shape, (2, 5))
        self.assertEqual(table.header, tuple(header))
        self.assertEqual(table.array.tolist(), [list(range(1, 6))] * 2)
        # loading without skip_inconsistent raises ValueError
        with self.assertRaises(ValueError):
            load_table(path, skip_inconsistent=False)

def test_load_table_returns_static_columns(self):
"""for static data, load_table gives same dtypes for static_columns_type=True/False"""
t = load_table("data/sample.tsv", sep="\t", static_column_types=False)
Expand Down

0 comments on commit b27000a

Please sign in to comment.