From 2c3e541cb6414476cb33a5505c7c51af6ff0e546 Mon Sep 17 00:00:00 2001 From: Adnan Hemani Date: Sun, 5 Jun 2022 03:15:41 -0500 Subject: [PATCH 1/2] explicitly render histogram to prevent rounding errors --- datascience/tables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datascience/tables.py b/datascience/tables.py index 6cfe1579..46d4622c 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -5439,6 +5439,7 @@ def draw_hist(values_dict): axis.set_xlabel(x_unit, fontsize=16) plt.legend(hist_names, loc=2, bbox_to_anchor=(1.05, 1)) type(self).plots.append(axis) + plt.show() else: _, axes = plt.subplots(n, 1, figsize=(width, height * n)) if 'bins' in vargs: @@ -5466,6 +5467,7 @@ def draw_hist(values_dict): axis.scatter(values_for_hist, np.zeros_like(values_for_hist), marker="|", color="black", s=100, zorder=10) type(self).plots.append(axis) + plt.show() draw_hist(values_dict) From 6e2990e41c8a4b82360d240c5deb0869d3df1dad Mon Sep 17 00:00:00 2001 From: Adnan Hemani Date: Mon, 6 Jun 2022 02:13:12 -0500 Subject: [PATCH 2/2] new version of code --- datascience/tables.py | 140 ++++++++++++++++++++++-------------------- 1 file changed, 73 insertions(+), 67 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index 46d4622c..a97db7b1 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -32,6 +32,10 @@ _INTERACTIVE_PLOTS = False +# Set numpy printoptions to legacy to get around error terms, as described in +# https://github.com/data-8/datascience/issues/491 +np.set_printoptions(legacy='1.13') + class Table(collections.abc.MutableMapping): """A sequence of string-labeled columns.""" plots = collections.deque(maxlen=10) @@ -5393,81 +5397,83 @@ def prepare_hist_with_group(group): right_end = max([max(self.column(k)) for k in self.labels if np.issubdtype(self.column(k).dtype, np.number)]) def draw_hist(values_dict): - with np.printoptions(legacy='1.13'): - # This code is factored as a function for clarity only. 
- n = len(values_dict) - colors = [rgb_color + (self.default_alpha,) for rgb_color in - itertools.islice(itertools.cycle(self.chart_colors), n)] - hist_names = list(values_dict.keys()) - values = [v[0] for v in values_dict.values()] - weights = [v[1] for v in values_dict.values() if len(v) > 1] - if n > len(weights) > 0: - raise ValueError("Weights were provided for some columns, but not " - " all, and that's not supported.") - if rug and overlay and n > 1: - warnings.warn("Cannot plot overlaid rug plots; rug=True ignored", UserWarning) + # Check if np.printoptions is set to legacy. Throw UserWarning if not + if np.get_printoptions()['legacy'] != '1.13': + warnings.warn("We've detected you're not using the '1.13' legacy setting for `np.printoptions`. " + "This may cause excessive error terms in your plots. We recommend solving this by running the " + "following code: `np.set_printoptions(legacy='1.13')`", UserWarning) + # This code is factored as a function for clarity only. + n = len(values_dict) + colors = [rgb_color + (self.default_alpha,) for rgb_color in + itertools.islice(itertools.cycle(self.chart_colors), n)] + hist_names = list(values_dict.keys()) + values = [v[0] for v in values_dict.values()] + weights = [v[1] for v in values_dict.values() if len(v) > 1] + if n > len(weights) > 0: + raise ValueError("Weights were provided for some columns, but not " + " all, and that's not supported.") + if rug and overlay and n > 1: + warnings.warn("Cannot plot overlaid rug plots; rug=True ignored", UserWarning) + if vargs['density']: + y_label = 'Percent per ' + (unit if unit else 'unit') + percentage = plt.FuncFormatter(lambda x, _: "{:g}".format(100*x)) + else: + y_label = 'Count' + + if overlay and n > 1: + # Reverse because legend prints bottom-to-top + values = values[::-1] + weights = weights[::-1] + colors = list(colors)[::-1] + if len(weights) == n: + vargs['weights'] = weights + if not side_by_side: + vargs.setdefault('histtype', 'stepfilled') + figure = 
plt.figure(figsize=(width, height)) + plt.hist(values, color=colors, **vargs) + # if rug: + # plt.scatter(values, np.zeros_like(values), marker="|", color=colors) + axis = figure.get_axes()[0] + _vertical_x(axis) + axis.set_ylabel(y_label) if vargs['density']: - y_label = 'Percent per ' + (unit if unit else 'unit') - percentage = plt.FuncFormatter(lambda x, _: "{:g}".format(100*x)) + axis.yaxis.set_major_formatter(percentage) + x_unit = ' (' + unit + ')' if unit else '' + if group is not None and len(self.labels) == 2: + #There's a grouping in place but we're only plotting one column's values + label_not_grouped = [l for l in self.labels if l != group][0] + axis.set_xlabel(label_not_grouped + x_unit, fontsize=16) else: - y_label = 'Count' - - if overlay and n > 1: - # Reverse because legend prints bottom-to-top - values = values[::-1] - weights = weights[::-1] - colors = list(colors)[::-1] - if len(weights) == n: - vargs['weights'] = weights - if not side_by_side: + axis.set_xlabel(x_unit, fontsize=16) + plt.legend(hist_names, loc=2, bbox_to_anchor=(1.05, 1)) + type(self).plots.append(axis) + else: + _, axes = plt.subplots(n, 1, figsize=(width, height * n)) + if 'bins' in vargs: + bins = vargs['bins'] + if isinstance(bins, numbers.Integral) and bins > 76 or hasattr(bins, '__len__') and len(bins) > 76: + # Use stepfilled when there are too many bins vargs.setdefault('histtype', 'stepfilled') - figure = plt.figure(figsize=(width, height)) - plt.hist(values, color=colors, **vargs) - # if rug: - # plt.scatter(values, np.zeros_like(values), marker="|", color=colors) - axis = figure.get_axes()[0] - _vertical_x(axis) + if n == 1: + axes = [axes] + for i, (axis, hist_name, values_for_hist, color) in enumerate(zip(axes, hist_names, values, colors)): axis.set_ylabel(y_label) if vargs['density']: axis.yaxis.set_major_formatter(percentage) x_unit = ' (' + unit + ')' if unit else '' - if group is not None and len(self.labels) == 2: - #There's a grouping in place but we're only 
plotting one column's values - label_not_grouped = [l for l in self.labels if l != group][0] - axis.set_xlabel(label_not_grouped + x_unit, fontsize=16) - else: - axis.set_xlabel(x_unit, fontsize=16) - plt.legend(hist_names, loc=2, bbox_to_anchor=(1.05, 1)) + if len(weights) == n: + vargs['weights'] = weights[i] + axis.set_xlabel(hist_name + x_unit, fontsize=16) + heights, bins, patches = axis.hist(values_for_hist, color=color, **vargs) + if left_end is not None and right_end is not None: + x_shade, height_shade, width_shade = _compute_shading(heights, bins.copy(), left_end, right_end) + axis.bar(x_shade, height_shade, width=width_shade, + color=self.chart_colors[1], align="edge") + _vertical_x(axis) + if rug: + axis.scatter(values_for_hist, np.zeros_like(values_for_hist), marker="|", + color="black", s=100, zorder=10) type(self).plots.append(axis) - plt.show() - else: - _, axes = plt.subplots(n, 1, figsize=(width, height * n)) - if 'bins' in vargs: - bins = vargs['bins'] - if isinstance(bins, numbers.Integral) and bins > 76 or hasattr(bins, '__len__') and len(bins) > 76: - # Use stepfilled when there are too many bins - vargs.setdefault('histtype', 'stepfilled') - if n == 1: - axes = [axes] - for i, (axis, hist_name, values_for_hist, color) in enumerate(zip(axes, hist_names, values, colors)): - axis.set_ylabel(y_label) - if vargs['density']: - axis.yaxis.set_major_formatter(percentage) - x_unit = ' (' + unit + ')' if unit else '' - if len(weights) == n: - vargs['weights'] = weights[i] - axis.set_xlabel(hist_name + x_unit, fontsize=16) - heights, bins, patches = axis.hist(values_for_hist, color=color, **vargs) - if left_end is not None and right_end is not None: - x_shade, height_shade, width_shade = _compute_shading(heights, bins.copy(), left_end, right_end) - axis.bar(x_shade, height_shade, width=width_shade, - color=self.chart_colors[1], align="edge") - _vertical_x(axis) - if rug: - axis.scatter(values_for_hist, np.zeros_like(values_for_hist), marker="|", - 
color="black", s=100, zorder=10) - type(self).plots.append(axis) - plt.show() draw_hist(values_dict)