Skip to content

Commit

Permalink
Merge pull request #558 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
New Table method for counting categories
  • Loading branch information
GavinHuttley committed Mar 6, 2020
2 parents 3af1913 + 3ecff6d commit 576f83d
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 2 deletions.
49 changes: 49 additions & 0 deletions src/cogent3/maths/stats/number.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import defaultdict
from collections.abc import Mapping, MutableMapping

import numpy
Expand Down Expand Up @@ -144,6 +145,54 @@ def to_array(self, keys=None):
data = numpy.array(data, dtype=int)
return data

def to_table(self, column_names=None, **kwargs):
    """converts to Table

    Parameters
    ----------
    column_names
        the column name(s) for the key, defaults to "key". If a series, must
        match dimensions of keys, e.g. for (a, b) keys, column_names=['A', 'B']
        will result in a table with 3 columns ('A', 'B', 'count').
    kwargs
        passed to table constructor

    Returns
    -------
    cogent3 Table instance

    Raises
    ------
    AssertionError
        if column_names is a series whose length differs from the length
        of the first key

    Notes
    -----
    An empty counter produces a table whose columns have no elements.
    """
    from cogent3.util.table import Table

    # a falsy value, a str, or anything without a length is treated as the
    # name of one column holding the unexpanded keys
    single_column = (
        not column_names
        or isinstance(column_names, str)
        or not hasattr(column_names, "__len__")
    )
    if single_column:
        key = column_names if column_names is not None else "key"
        items = list(self.items())
        keys = tuple(k for k, _ in items)
        counts = tuple(c for _, c in items)
        header = [key, "count"]
        # if keys are tuples, construct the numpy array manually so the
        # elements remain as tuples. numpy's object type casting converts
        # these to lists otherwise. The `keys and` guard keeps an empty
        # counter from failing here.
        if keys and isinstance(keys[0], tuple):
            arr = numpy.empty(len(keys), dtype=object)
            for i, k in enumerate(keys):
                arr[i] = k
            keys = arr
        data = {key: keys, "count": counts}
    else:
        names = list(column_names)
        # only the first key is inspected; raised explicitly (rather than
        # via an assert statement) so the check survives python -O
        for first in self:
            if len(first) != len(names):
                raise AssertionError("mismatched dimensions")
            break
        header = names + ["count"]
        # pre-populate every column so an empty counter still yields all
        # the columns named in header
        data = {name: [] for name in header}
        for key, count in self.items():
            for name, element in zip(names, key):
                data[name].append(element)
            data["count"].append(count)
    return Table(header=header, data=data, **kwargs)

@property
def entropy(self):
data = self.to_array()
Expand Down
34 changes: 33 additions & 1 deletion src/cogent3/util/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,10 +485,16 @@ def add_column_from_str(self, name, values):
self[name] = values

def take_columns(self, columns):
    """returns new Columns instance with just columns

    Parameters
    ----------
    columns
        a single column name (str) or index (int), or a series of them

    Returns
    -------
    A new instance of the same class as self, holding only the selected
    columns, added in the order produced by self._get_keys_(columns).
    """
    result = self.__class__()
    # wrap a lone name / index so it is handled as a one-element series;
    # isinstance also covers str subclasses, which would otherwise be
    # iterated character by character
    if isinstance(columns, (int, str)):
        columns = [columns]

    columns = self._get_keys_(columns)

    for c in columns:
        result[c] = self[c]

    return result

@property
Expand Down Expand Up @@ -1186,6 +1192,32 @@ def count(self, callback, columns=None, **kwargs):
indices = self.get_row_indices(callback=callback, columns=columns)
return indices.sum()

def count_unique(self, columns=None):
    """count occurrences of unique combinations of columns

    Parameters
    ----------
    columns
        name of one or more columns. If None, all columns are used

    Returns
    -------
    CategoryCounter instance

    Notes
    -----
    When exactly one column is selected, its values are counted as they
    are. Otherwise each element of the selected columns' array is converted
    to a tuple and the tuples are counted.
    """
    from cogent3.maths.stats.number import CategoryCounter

    selected = self.columns.order if columns is None else columns
    picked = self.columns.take_columns(selected)

    if len(picked) != 1:
        combos = [tuple(element) for element in picked.array]
        return CategoryCounter(data=combos)

    return CategoryCounter(data=picked[0].tolist())

def distinct_values(self, columns):
"""returns the set of distinct values for the named column(s)"""
data = [tuple(r) for r in self[:, columns].array.tolist()]
Expand Down
30 changes: 30 additions & 0 deletions tests/test_maths/test_stats/test_number.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,36 @@ def test_to_methods(self):
assert_allclose(got, numpy.array([1, 3, 4, 4], dtype=int))
self.assertEqual(nums.to_dict(), dict(A=4, C=3, G=4, T=1))

def test_to_table(self):
    """produces correct Table structure"""
    records = [
        ("Ovary-AdenoCA", "IGR"),
        ("Liver-HCC", "Intron"),
        ("Panc-AdenoCA", "Intron"),
        ("Panc-AdenoCA", "Intron"),
    ]
    counter = number.CategoryCounter(records)

    # no column name given: keys go, unexpanded, into a "key" column
    default = counter.to_table(column_names=None, title="blah")
    self.assertEqual(default.header, ("key", "count"))
    # tuple keys remain tuples in the unexpanded column
    self.assertIsInstance(default[0, 0], tuple)
    self.assertEqual(default.title, "blah")

    # any data type may name the key column, the Table header holds a str;
    # a series of names expands the key into one column per element
    expected_headers = [
        (2, ("2", "count")),
        ("blah", ("blah", "count")),
        (["A", "B"], ("A", "B", "count")),
    ]
    for names, header in expected_headers:
        table = counter.to_table(column_names=names)
        self.assertEqual(table.header, header)

    # keys have 2 dimensions, so neither 3 nor 1 column names match
    for mismatched in (["A", "B", "C"], [1]):
        with self.assertRaises(AssertionError):
            _ = counter.to_table(column_names=mismatched)

def test_valid(self):
"""correctly identify when numbers contains numbers"""
wrong = number.NumberCounter([0, "a", 1, 1])
Expand Down
26 changes: 25 additions & 1 deletion tests/test_util/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,26 @@ def test_count(self):
self.assertEqual(t2.count("bar % 2 == 0"), 2)
self.assertEqual(t2.count("id == 0"), 0)

def test_count_unique(self):
    """correctly computes unique values"""
    project_codes = ["Ovary-AdenoCA", "Liver-HCC", "Panc-AdenoCA", "Panc-AdenoCA"]
    donor_ids = ["DO46416", "DO45049", "DO51493", "DO32860"]
    classifications = ["IGR", "Intron", "Intron", "Intron"]
    table = make_table(
        data={
            "Project_Code": project_codes,
            "Donor_ID": donor_ids,
            "Variant_Classification": classifications,
        }
    )

    # combinations of two columns are keyed by tuples
    paired = table.count_unique(["Project_Code", "Variant_Classification"])
    self.assertEqual(paired[("Panc-AdenoCA", "Intron")], 2)
    # a combination that never occurs has a count of zero
    self.assertEqual(paired[("Liver-HCC", "IGR")], 0)

    # a single column is keyed by its plain values
    single = table.count_unique("Variant_Classification")
    self.assertEqual(single["Intron"], 3)
    self.assertEqual(single["IGR"], 1)

def test_distinct_values(self):
"""test the table distinct_values method"""
t1 = Table(header=self.t1_header, data=self.t1_rows)
Expand Down Expand Up @@ -715,11 +735,15 @@ def test_del_column(self):

def test_take_columns(self):
    """correctly takes columns"""
    t = Table(header=self.t4_header, data=self.t4_rows)
    columns = list(t.columns)
    # a series of column names
    expect = tuple(columns[1:])
    n = t.columns.take_columns(expect)
    self.assertEqual(n.order, expect)
    # a single column name
    n = t.columns.take_columns(columns[0])
    self.assertEqual(n.order, (columns[0],))
    # a single column index
    n = t.columns.take_columns(1)
    self.assertEqual(n.order, (columns[1],))

def test_with_new_column(self):
"""test the table with_new_column method"""
Expand Down

0 comments on commit 576f83d

Please sign in to comment.