Skip to content

Commit

Permalink
Added Overall Categorical Statistics
Browse files Browse the repository at this point in the history
Added total and num_of_groups properties to Categorical.
Added the Overall Statistics to categorical stats.
Updated unit tests.
  • Loading branch information
cmmorrow committed Mar 26, 2018
1 parent d47b28b commit 523e5dc
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 29 deletions.
1 change: 1 addition & 0 deletions .cache/v/cache/lastfailed
@@ -1,3 +1,4 @@
{
"sci_analysis/test/test_graph_frequency.py": true,
"sci_analysis/test/test_groupcorrelation.py::MyTestCase::test_pearson_correlation_different_alpha": true
}
4 changes: 2 additions & 2 deletions docs/index.rst
Expand Up @@ -141,7 +141,7 @@ A histogram and statistics for categorical data can be performed with the follow

::
pets = ['dog', 'cat', 'rat', 'cat', 'rabbit', 'dog', 'hampster', 'cat', 'rabbit', 'dog', 'dog']
pets = ['dog', 'cat', 'rat', 'cat', 'rabbit', 'dog', 'hamster', 'cat', 'rabbit', 'dog', 'dog']
analyze(pets)

A histogram and printed output similar to that below should be shown:
Expand All @@ -158,7 +158,7 @@ A histogram and printed output similar to that below should be shown:
1 4 36.3636 dog
2 3 27.2727 cat
3 2 18.1818 rabbit
4 1 9.0909 hampster
4 1 9.0909 hamster
4 1 9.0909 rat

Let's examine the ``analyze`` function in more detail. Here's the signature for the ``analyze`` function:
Expand Down
30 changes: 25 additions & 5 deletions sci_analysis/analysis/stats.py
Expand Up @@ -261,6 +261,7 @@ class GroupStatisticsStacked(Analysis):

_min_size = 1
_name = 'Group Statistics'
_agg_name = 'Overall Statistics'
_group = 'Group'
_n = 'n'
_mean = 'Mean'
Expand Down Expand Up @@ -351,7 +352,7 @@ def __str__(self):
)
if is_tuple(self._results):
out = '{}\n{}'.format(
std_output('Overall Statistics', self._results[0], order=order),
std_output(self._agg_name, self._results[0], order=order),
std_output(self._name, self._results[1].to_dict(orient='records'), order=group_order),
)
else:
Expand All @@ -376,11 +377,13 @@ class CategoricalStatistics(Analysis):

_min_size = 1
_name = 'Statistics'
_agg_name = 'Overall Statistics'
_rank = 'Rank'
_cat = 'Category'
_freq = 'Frequency'
_perc = 'Percent'
_total = 'Total'
_num_of_grps = 'Number of Groups'

def __init__(self, data, **kwargs):
order = kwargs['order'] if 'order' in kwargs else None
Expand All @@ -400,13 +403,30 @@ def run(self):
percents=self._perc,
ranks=self._rank)
self.data.summary.rename(columns=col, inplace=True)
self._results = self.data.summary.to_dict(orient='records')
if self.data.num_of_groups > 1:
self._results = ({
self._total: self.data.total,
self._num_of_grps: self.data.num_of_groups,
}, self.data.summary.to_dict(orient='records'))
else:
self._results = self.data.summary.to_dict(orient='records')

def __str__(self):
order = [
order = (
self._total,
self._num_of_grps,
)
grp_order = (
self._rank,
self._freq,
self._perc,
self._cat,
]
return std_output(self._name, self._results, order=order)
)
if is_tuple(self._results):
out = '{}\n{}'.format(
std_output(self._agg_name, self._results[0], order=order),
std_output(self._name, self._results[1], order=grp_order),
)
else:
out = std_output(self._name, self._results, order=grp_order)
return out
8 changes: 8 additions & 0 deletions sci_analysis/data/categorical.py
Expand Up @@ -132,3 +132,11 @@ def ranks(self):
@property
def categories(self):
# Read-only accessor: hands back the `categories` attribute of the
# internal summary object unchanged.
# NOTE(review): the type of self._summary is not visible in this hunk —
# confirm what container it is before relying on the return type.
return self._summary.categories

@property
def total(self):
# Read-only accessor: the number of entries held in self._values,
# computed with len() on each access.
# CategoricalStatistics.run() reports this value under the 'Total' key
# of the overall statistics.
# NOTE(review): whether missing values count toward the total depends on
# how self._values is built, which is not visible here — confirm.
return len(self._values)

@property
def num_of_groups(self):
# Read-only accessor: the length of the internal summary object,
# computed with len() on each access.
# CategoricalStatistics.run() reports this value under 'Number of Groups'
# and only emits the overall statistics when it is greater than 1.
# NOTE(review): presumably one summary row per category — confirm against
# the code that builds self._summary.
return len(self._summary)
81 changes: 59 additions & 22 deletions sci_analysis/test/test_cat_statistics.py
Expand Up @@ -16,6 +16,13 @@ def test_100_categorical_stats_simple_unordered(self):
obj = CategoricalStatistics(input_array, display=False)
output = """
Overall Statistics
------------------
Total = 4
Number of Groups = 3
Statistics
----------
Expand All @@ -27,15 +34,23 @@ def test_100_categorical_stats_simple_unordered(self):
self.assertEqual(str(obj), output)
self.assertEqual(obj.name, 'Statistics')
self.assertTrue(obj.data.data.equals(Series(input_array).astype('category')))
self.assertListEqual(obj.results, [{'Rank': 1, 'Category': 'one', 'Frequency': 2, 'Percent': 50.0},
{'Rank': 2, 'Category': 'three', 'Frequency': 1, 'Percent': 25.0},
{'Rank': 2, 'Category': 'two', 'Frequency': 1, 'Percent': 25.0}])
self.assertDictEqual(obj.results[0], {'Total': 4, 'Number of Groups': 3})
self.assertListEqual(obj.results[1], [{'Rank': 1, 'Category': 'one', 'Frequency': 2, 'Percent': 50.0},
{'Rank': 2, 'Category': 'three', 'Frequency': 1, 'Percent': 25.0},
{'Rank': 2, 'Category': 'two', 'Frequency': 1, 'Percent': 25.0}])

def test_101_categorical_stats_simple_ordered_categories(self):
input_array = ['one', 'two', 'one', 'three']
obj = CategoricalStatistics(input_array, order=['three', 'two', 'one'], display=False)
output = """
Overall Statistics
------------------
Total = 4
Number of Groups = 3
Statistics
----------
Expand All @@ -45,9 +60,10 @@ def test_101_categorical_stats_simple_ordered_categories(self):
2 1 25.0000 two
1 2 50.0000 one """
self.assertEqual(str(obj), output)
self.assertListEqual(obj.results, [{'Frequency': 1, 'Category': 'three', 'Rank': 2, 'Percent': 25},
{'Frequency': 1, 'Category': 'two', 'Rank': 2, 'Percent': 25},
{'Frequency': 2, 'Category': 'one', 'Rank': 1, 'Percent': 50}])
self.assertDictEqual(obj.results[0], {'Total': 4, 'Number of Groups': 3})
self.assertListEqual(obj.results[1], [{'Frequency': 1, 'Category': 'three', 'Rank': 2, 'Percent': 25},
{'Frequency': 1, 'Category': 'two', 'Rank': 2, 'Percent': 25},
{'Frequency': 2, 'Category': 'one', 'Rank': 1, 'Percent': 50}])

def test_102_categorical_stats_with_na(self):
seed(987654321)
Expand All @@ -59,6 +75,13 @@ def test_102_categorical_stats_with_na(self):
input_array[28] = nan
output = """
Overall Statistics
------------------
Total = 50
Number of Groups = 16
Statistics
----------
Expand All @@ -82,22 +105,23 @@ def test_102_categorical_stats_with_na(self):
6 1 2.0000 abcdefghijkl """
test = CategoricalStatistics(input_array, display=False)
self.assertEqual(str(test), output)
self.assertListEqual(test.results, [{'Frequency': 6, 'Category': 'abcdefghijklmnop', 'Rank': 1, 'Percent': 12.},
{'Frequency': 5, 'Category': 'abc', 'Rank': 2, 'Percent': 10.0},
{'Frequency': 5, 'Category': 'abcdefg', 'Rank': 2, 'Percent': 10.0},
{'Frequency': 5, 'Category': 'abcdefghijk', 'Rank': 2, 'Percent': 10.0},
{'Frequency': 4, 'Category': 'abcdefgh', 'Rank': 3, 'Percent': 8.0},
{'Frequency': 4, 'Category': nan, 'Rank': 3, 'Percent': 8.0},
{'Frequency': 3, 'Category': 'a', 'Rank': 4, 'Percent': 6.0},
{'Frequency': 3, 'Category': 'ab', 'Rank': 4, 'Percent': 6.0},
{'Frequency': 3, 'Category': 'abcdefghij', 'Rank': 4, 'Percent': 6.0},
{'Frequency': 3, 'Category': 'abcdefghijklm', 'Rank': 4, 'Percent': 6.0},
{'Frequency': 2, 'Category': 'abcde', 'Rank': 5, 'Percent': 4.0},
{'Frequency': 2, 'Category': 'abcdefghi', 'Rank': 5, 'Percent': 4.0},
{'Frequency': 2, 'Category': 'abcdefghijklmno', 'Rank': 5, 'Percent': 4.0},
{'Frequency': 1, 'Category': 'abcd', 'Rank': 6, 'Percent': 2.0},
{'Frequency': 1, 'Category': 'abcdef', 'Rank': 6, 'Percent': 2.0},
{'Frequency': 1, 'Category': 'abcdefghijkl', 'Rank': 6, 'Percent': 2.0}])
self.assertDictEqual(test.results[0], {'Total': 50, 'Number of Groups': 16})
self.assertListEqual(test.results[1], [{'Frequency': 6, 'Category': 'abcdefghijklmnop', 'Rank': 1, 'Percent': 12.},
{'Frequency': 5, 'Category': 'abc', 'Rank': 2, 'Percent': 10.0},
{'Frequency': 5, 'Category': 'abcdefg', 'Rank': 2, 'Percent': 10.0},
{'Frequency': 5, 'Category': 'abcdefghijk', 'Rank': 2, 'Percent': 10.0},
{'Frequency': 4, 'Category': 'abcdefgh', 'Rank': 3, 'Percent': 8.0},
{'Frequency': 4, 'Category': nan, 'Rank': 3, 'Percent': 8.0},
{'Frequency': 3, 'Category': 'a', 'Rank': 4, 'Percent': 6.0},
{'Frequency': 3, 'Category': 'ab', 'Rank': 4, 'Percent': 6.0},
{'Frequency': 3, 'Category': 'abcdefghij', 'Rank': 4, 'Percent': 6.0},
{'Frequency': 3, 'Category': 'abcdefghijklm', 'Rank': 4, 'Percent': 6.0},
{'Frequency': 2, 'Category': 'abcde', 'Rank': 5, 'Percent': 4.0},
{'Frequency': 2, 'Category': 'abcdefghi', 'Rank': 5, 'Percent': 4.0},
{'Frequency': 2, 'Category': 'abcdefghijklmno', 'Rank': 5, 'Percent': 4.0},
{'Frequency': 1, 'Category': 'abcd', 'Rank': 6, 'Percent': 2.0},
{'Frequency': 1, 'Category': 'abcdef', 'Rank': 6, 'Percent': 2.0},
{'Frequency': 1, 'Category': 'abcdefghijkl', 'Rank': 6, 'Percent': 2.0}])

def test_103_no_data(self):
input_array = None
Expand All @@ -111,6 +135,13 @@ def test_104_no_data_except_nan(self):
input_array = ['a', 'b', 'a', 'c', 'c', 'd']
output = """
Overall Statistics
------------------
Total = 6
Number of Groups = 5
Statistics
----------
Expand All @@ -123,6 +154,12 @@ def test_104_no_data_except_nan(self):
2 0 0.0000 w """
test = CategoricalStatistics(input_array, order=['z', 'y', 'x', 'w'], display=False)
self.assertEqual(str(test), output)
self.assertDictEqual(test.results[0], {'Total': 6, 'Number of Groups': 5})
self.assertListEqual(test.results[1], [{'Rank': 1, 'Frequency': 6, 'Percent': 100, 'Category': nan},
{'Rank': 2, 'Frequency': 0, 'Percent': 0, 'Category': 'z'},
{'Rank': 2, 'Frequency': 0, 'Percent': 0, 'Category': 'y'},
{'Rank': 2, 'Frequency': 0, 'Percent': 0, 'Category': 'x'},
{'Rank': 2, 'Frequency': 0, 'Percent': 0, 'Category': 'w'}])

def test_105_too_many_categories_warning(self):
input_array = [str(x) for x in range(100)]
Expand Down

0 comments on commit 523e5dc

Please sign in to comment.