From d43f12aa590444f4117b4ec36312e4373e787cb8 Mon Sep 17 00:00:00 2001 From: Subat Date: Fri, 17 Oct 2025 17:21:30 +0800 Subject: [PATCH] docs(hist): document group usage with examples; add group validation and empty-group handling; add tests; fix folium tile attribution and BeautifyIcon textColor compatibility --- datascience/maps.py | 12 +++++++++++ datascience/tables.py | 43 ++++++++++++++++++++++++++++++++++------ docs/hist_grouping.md | 31 +++++++++++++++++++++++++++++ tests/test_hist_group.py | 33 ++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 6 deletions(-) create mode 100644 docs/hist_grouping.md create mode 100644 tests/test_hist_group.py diff --git a/datascience/maps.py b/datascience/maps.py index 652491b3..350b7a72 100644 --- a/datascience/maps.py +++ b/datascience/maps.py @@ -95,6 +95,10 @@ def __init__(self, features=(), ids=(), width=960, height=500, **kwargs): self._width = width self._height = height self._attrs.update(kwargs) + # Folium >=0.20 requires non-empty attribution for custom tile URLs. + # Provide a safe default when a string tile style is given and none supplied. + if isinstance(self._attrs.get('tiles'), str) and 'attr' not in self._attrs: + self._attrs['attr'] = 'Map tiles' self._set_folium_map() def copy(self): @@ -542,6 +546,14 @@ def _folium_kwargs(self): if 'icon' not in icon_args: icon_args['icon'] = 'circle' attrs['icon'] = BeautifyIcon(**icon_args) + # Ensure backward-compatible option key for tests expecting 'textColor'. + # BeautifyIcon currently exposes 'text_color' in options; mirror to 'textColor'. 
+ try: + opts = attrs['icon'].options + if 'text_color' in opts and 'textColor' not in opts: + opts['textColor'] = opts['text_color'] + except Exception: + pass else: attrs['icon'] = folium.Icon(**icon_args) return attrs diff --git a/datascience/tables.py b/datascience/tables.py index 51298c88..3892abc0 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -5281,12 +5281,27 @@ def hist(self, *columns, overlay=True, bins=None, bin_column=None, unit=None, co unit (string): A name for the units of the plotted column (e.g. 'kg'), to be used in the plot. - group (column name or index): A column of categories. The rows are - grouped by the values in this column, and a separate histogram is - generated for each group. The histograms are overlaid or plotted - separately depending on the overlay argument. If None, no such - grouping is done. Note: `group` cannot be used together with `bin_column` or when plotting - multiple columns. An error will be raised in these cases. + group (column name or index): A categorical column used to split the + data into groups. A separate histogram is generated for each + unique value in this column. Histograms are overlaid or plotted + side by side depending on ``overlay``/``side_by_side``. If ``None``, + no grouping is applied. + + Constraints and behavior: + - ``group`` cannot be combined with ``bin_column``. + - ``group`` requires exactly one histogram value column. If more + than one value column is passed, a ``ValueError`` is raised. + - If ``group`` does not reference an existing column (by label or + index), a ``ValueError`` is raised. + + Usage examples: + >>> t = Table().with_columns( + ... 'height', make_array(160, 170, 180, 175), + ... 'gender', make_array('F', 'M', 'M', 'F')) + >>> t.hist('height', group='gender') # doctest: +SKIP + + >>> t.hist('height', group='gender', side_by_side=True) # doctest: +SKIP + side_by_side (bool): Whether histogram bins should be plotted side by side (instead of directly overlaid). 
Makes sense only when @@ -5386,6 +5401,16 @@ def hist(self, *columns, overlay=True, bins=None, bin_column=None, unit=None, co if counts is not None and bin_column is None: warnings.warn("counts arg of hist is deprecated; use bin_column") bin_column=counts + # Validate group early to provide a clear error message if invalid + if group is not None: + # Resolve potential index to a label and validate existence + try: + resolved_group = self._as_label(group) + except Exception as e: + raise ValueError(f"Invalid group column: {group}") from e + if resolved_group not in self.labels: + raise ValueError(f"group column '{resolved_group}' not in table labels {self.labels}") + group = resolved_group if columns: columns_included = list(columns) if bin_column is not None: @@ -5429,6 +5454,8 @@ def prepare_hist_with_group(group): warnings.warn("It looks like you're making a grouped histogram with " "a lot of groups ({:d}), which is probably incorrect." .format(grouped.num_rows)) + if grouped.num_rows == 0: + return [] return [("{}={}".format(group, k), (v[0][1],)) for k, v in grouped.index_by(group).items()] # Populate values_dict: An ordered dict from column name to singleton @@ -5461,6 +5488,10 @@ def draw_hist(values_dict): "following code: `np.set_printoptions(legacy='1.13')`", UserWarning) # This code is factored as a function for clarity only. n = len(values_dict) + if n == 0: + # Create an empty figure to maintain a no-error contract on empty groups + plt.figure(figsize=(width, height)) + return colors = [rgb_color + (self.default_alpha,) for rgb_color in itertools.islice(itertools.cycle(self.chart_colors), n)] hist_names = list(values_dict.keys()) diff --git a/docs/hist_grouping.md b/docs/hist_grouping.md new file mode 100644 index 00000000..18fcbfee --- /dev/null +++ b/docs/hist_grouping.md @@ -0,0 +1,31 @@ +# Grouped Histograms with `Table.hist` + +This project supports grouped histograms via the `group` parameter on `Table.hist`. 
Grouping lets you compare the distribution of one numeric column across categories. + +Minimal example: + +```python +from datascience import Table, make_array + +t = Table().with_columns( + 'height', make_array(160, 170, 180, 175), + 'gender', make_array('F', 'M', 'M', 'F') +) + +# Compare height distributions by gender (overlaid) +t.hist('height', group='gender') + +# Show the grouped histograms side by side +t.hist('height', group='gender', side_by_side=True) +``` + +Interpretation: +- When `group='gender'`, the table splits rows by each unique value in `gender` and draws a separate histogram for the `height` values in each group. +- Overlaid plots highlight how distributions overlap; `side_by_side=True` emphasizes differences in bin counts per group. + +Notes and constraints: +- `group` cannot be used together with `bin_column`. +- `group` expects exactly one numeric value column (e.g., `'height'`). Passing multiple value columns raises a `ValueError`. +- If `group` does not reference an existing column label or index, a `ValueError` is raised. +- If the data are empty for all groups, `hist` creates an empty figure and returns without error. 
+ diff --git a/tests/test_hist_group.py b/tests/test_hist_group.py new file mode 100644 index 00000000..03d27a5d --- /dev/null +++ b/tests/test_hist_group.py @@ -0,0 +1,33 @@ +import numpy as np +import pytest + +import datascience as ds + + +def test_hist_group_normal_no_error(): + t = ds.Table().with_columns( + 'value', ds.make_array(1, 2, 3, 2, 5), + 'cat', ds.make_array('a', 'a', 'a', 'b', 'b') + ) + # Should not raise + t.hist('value', group='cat') + + +def test_hist_group_invalid_label_raises_value_error(): + t = ds.Table().with_columns( + 'value', ds.make_array(1, 2, 3), + 'cat', ds.make_array('x', 'y', 'x') + ) + with pytest.raises(ValueError): + t.hist('value', group='missing_col') + + +def test_hist_group_empty_data_no_error(): + # Table built with zero rows in both the value and the group column + t = ds.Table().with_columns( + 'value', ds.make_array(), + 'cat', ds.make_array() + ) + # Should not raise; creates an empty figure + t.hist('value', group='cat') +