Commit
Merge remote-tracking branch 'upstream/develop' into feature/GREAT-735/GREAT-597/alexsherstinsky/rule_based_profiler/add_support_for_returning_parameters_and_metrics_as_rule_output-2022_04_06-83
Alex Sherstinsky committed Apr 6, 2022
2 parents 5dc633e + fbfe8bd commit 4d8458b
Showing 2 changed files with 7 additions and 150 deletions.
151 changes: 2 additions & 149 deletions great_expectations/rule_based_profiler/helpers/util.py
@@ -415,69 +415,11 @@ def compute_bootstrap_quantiles_point_estimate(
    false_positive_rate: np.float64,
    n_resamples: int,
    random_seed: Optional[int] = None,
) -> Tuple[Number, Number]:
    """The winner of our performance testing is selected from the possible candidates:
    - _compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method
    - _compute_bootstrap_quantiles_point_estimate_custom_mean_method
    - _compute_bootstrap_quantiles_point_estimate_scipy_confidence_interval_midpoint_method"""
    return _compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method(
        metric_values=metric_values,
        false_positive_rate=false_positive_rate,
        n_resamples=n_resamples,
        random_seed=random_seed,
    )
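
For illustration, a minimal usage sketch (not part of the commit) of the public helper above, assuming synthetic metric values and the import path implied by the changed file great_expectations/rule_based_profiler/helpers/util.py:

import numpy as np
from great_expectations.rule_based_profiler.helpers.util import (
    compute_bootstrap_quantiles_point_estimate,
)

metric_values: np.ndarray = np.random.default_rng(0).normal(
    loc=100.0, scale=5.0, size=500
)
lower, upper = compute_bootstrap_quantiles_point_estimate(
    metric_values=metric_values,
    false_positive_rate=np.float64(0.05),
    n_resamples=9999,
    random_seed=42,
)
# The two point estimates bracket roughly 95% of the metric distribution.
print(lower, upper)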


def _compute_bootstrap_quantiles_point_estimate_custom_mean_method(
    metric_values: np.ndarray,
    false_positive_rate: np.float64,
    n_resamples: int,
    random_seed: Optional[int] = None,
) -> Tuple[Number, Number]:
    """
    An internal implementation of the "bootstrap" estimator method, returning a point estimate for a population
    parameter of interest (lower and upper quantiles in this case). See
    https://en.wikipedia.org/wiki/Bootstrapping_(statistics) for an introduction to "bootstrapping" in statistics.

    This implementation has been replaced by "_compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method"
    and only remains to demonstrate the performance improvement achieved by correcting for bias. Upon the implementation
    of a Machine Learning Lifecycle framework, the performance improvement can be documented and this legacy method can
    be removed from the codebase.

    ML Flow Experiment: parameter_builders_bootstrap/bootstrap_quantiles
    ML Flow Experiment ID: 4129654509298109
    """
    if random_seed:
        random_state: np.random.Generator = np.random.Generator(
            np.random.PCG64(random_seed)
        )
        bootstraps: np.ndarray = random_state.choice(
            metric_values, size=(n_resamples, metric_values.size)
        )
    else:
        bootstraps: np.ndarray = np.random.choice(
            metric_values, size=(n_resamples, metric_values.size)
        )

    lower_quantiles: Union[np.ndarray, Number] = np.quantile(
        bootstraps,
        q=false_positive_rate / 2,
        axis=1,
    )
    lower_quantile_point_estimate: Number = np.mean(lower_quantiles)
    upper_quantiles: Union[np.ndarray, Number] = np.quantile(
        bootstraps,
        q=1.0 - (false_positive_rate / 2),
        axis=1,
    )
    upper_quantile_point_estimate: Number = np.mean(upper_quantiles)
    return lower_quantile_point_estimate, upper_quantile_point_estimate

def _compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method(
    metric_values: np.ndarray,
    false_positive_rate: np.float64,
    n_resamples: int,
    random_seed: Optional[int] = None,
) -> Tuple[Number, Number]:
    """
    An internal implementation of the "bootstrap" estimator method, returning a point estimate for a population
    parameter of interest (lower and upper quantiles in this case). See
    https://en.wikipedia.org/wiki/Bootstrapping_(statistics) for an introduction to "bootstrapping" in statistics.
@@ -604,92 +546,3 @@ def _compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method(
        lower_quantile_bias_corrected_point_estimate,
        upper_quantile_bias_corrected_point_estimate,
    )
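
The bias-correction arithmetic itself sits in the collapsed hunk above. As an illustrative sketch of the general technique only (the textbook bootstrap bias correction, not necessarily the exact computation hidden in the collapsed lines):

import numpy as np

def bias_corrected_quantile_estimate(
    metric_values: np.ndarray,
    q: float,
    n_resamples: int = 9999,
    random_seed: int = 42,
) -> float:
    """Hypothetical helper: subtract the estimated bootstrap bias from the sample quantile."""
    rng = np.random.default_rng(random_seed)
    sample_quantile = np.quantile(metric_values, q=q)
    bootstraps = rng.choice(metric_values, size=(n_resamples, metric_values.size))
    bootstrap_quantiles = np.quantile(bootstraps, q=q, axis=1)
    # bias = E[theta_hat_boot] - theta_hat; the corrected estimate subtracts it.
    bias = np.mean(bootstrap_quantiles) - sample_quantile
    return float(sample_quantile - bias)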


def _compute_bootstrap_quantiles_point_estimate_scipy_confidence_interval_midpoint_method(
    metric_values: np.ndarray,
    false_positive_rate: np.float64,
    n_resamples: int,
    method: Optional[str] = "BCa",
    random_seed: Optional[int] = None,
) -> Tuple[np.float64, np.float64]:
    """
    SciPy implementation of the BCa confidence interval for the population quantile. Unfortunately, as of
    March 4th, 2022, this implementation has two issues:
    1) It only returns a confidence interval and not a point estimate for the population parameter of interest,
       which is what we require for our use cases (the attempt below tries to "back out" the statistic from the
       confidence interval by taking the midpoint of the interval).
    2) It cannot handle multi-dimensional statistics and correct for bias simultaneously; you must use either
       one feature or the other.

    This implementation could only be used if Great Expectations drops support for Python 3.6, thereby enabling us
    to use a more up-to-date version of the "scipy" Python package (the currently used version does not have
    "bootstrap"). Also, as discussed above, two contributions would need to be made to the SciPy package to enable
    1) bias correction for multi-dimensional statistics and 2) a return value of a point estimate for the population
    parameter of interest (lower and upper quantiles in this case).
    """
    bootstraps: tuple = (metric_values,)  # bootstrap samples must be in a sequence

    if random_seed:
        random_state = np.random.Generator(np.random.PCG64(random_seed))
    else:
        random_state = None

    lower_quantile_bootstrap_result: stats._bootstrap.BootstrapResult = stats.bootstrap(
        bootstraps,
        lambda data: np.quantile(
            data,
            q=false_positive_rate / 2,
        ),
        vectorized=False,
        confidence_level=1.0 - false_positive_rate,
        n_resamples=n_resamples,
        method=method,
        random_state=random_state,
    )
    upper_quantile_bootstrap_result: stats._bootstrap.BootstrapResult = stats.bootstrap(
        bootstraps,
        lambda data: np.quantile(
            data,
            q=1.0 - (false_positive_rate / 2),
        ),
        vectorized=False,
        confidence_level=1.0 - false_positive_rate,
        n_resamples=n_resamples,
        method=method,
        random_state=random_state,
    )

    # The idea that we can take the midpoint of the confidence interval rests on the assumption that the
    # confidence interval was built from a symmetrical distribution. We expect the distribution to be normal due to
    # the implications of the Central Limit Theorem (CLT) (https://en.wikipedia.org/wiki/Central_limit_theorem) on
    # the bootstrap samples.
    # Unfortunately, the assumption that the CLT applies does not hold in all cases. The bias-corrected and accelerated
    # (BCa) confidence interval computed using scipy.stats.bootstrap attempts to compute the "acceleration" as a
    # correction, because the standard normal approximation (CLT) assumes that the standard error of the bootstrap
    # quantiles (theta-hat) is the same for all parameters (theta). The acceleration (which is the rate of change of
    # the standard error of the quantile point estimate) is not a perfect correction, and therefore the assumption that
    # this interval is built from a normal distribution does not always hold.
    # See:
    #
    # Efron, B., & Tibshirani, R. J. (1993). The BCa method. An Introduction to the Bootstrap (pp. 184-188).
    # Springer Science and Business Media Dordrecht. DOI 10.1007/978-1-4899-4541-9
    #
    # for an in-depth look at how the BCa interval is constructed; the points made above appear on page 186.
    lower_quantile_confidence_interval: stats._bootstrap.BootstrapResult.ConfidenceInterval = (
        lower_quantile_bootstrap_result.confidence_interval
    )
    lower_quantile_point_estimate: np.float64 = np.mean(
        [
            lower_quantile_confidence_interval.low,
            lower_quantile_confidence_interval.high,
        ]
    )
    upper_quantile_confidence_interval: stats._bootstrap.BootstrapResult.ConfidenceInterval = (
        upper_quantile_bootstrap_result.confidence_interval
    )
    upper_quantile_point_estimate: np.float64 = np.mean(
        [upper_quantile_confidence_interval.low, upper_quantile_confidence_interval.high]
    )

    return lower_quantile_point_estimate, upper_quantile_point_estimate
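
The comment block above notes that the midpoint is only trustworthy when the bootstrap distribution is symmetric. A minimal illustration (not part of the commit; it uses a plain percentile interval rather than BCa) of how the midpoint can drift from the sample quantile on skewed data:

import numpy as np

rng = np.random.default_rng(42)
sample = rng.exponential(scale=1.0, size=200)  # deliberately skewed data

# Percentile-bootstrap 95% interval for the median, then its midpoint.
bootstraps = rng.choice(sample, size=(9999, sample.size))
boot_medians = np.quantile(bootstraps, q=0.5, axis=1)
ci_low, ci_high = np.quantile(boot_medians, [0.025, 0.975])
midpoint = (ci_low + ci_high) / 2.0

print(f"sample median: {np.quantile(sample, 0.5):.4f}")
print(f"CI midpoint:   {midpoint:.4f}")  # close, but not identical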

6 changes: 5 additions & 1 deletion (second changed file)
@@ -1,6 +1,7 @@
 from ruamel import yaml
 
 from great_expectations import DataContext
+from great_expectations.core import ExpectationSuite
 from great_expectations.rule_based_profiler.rule_based_profiler import RuleBasedProfiler
 
 profiler_config = r"""
@@ -112,7 +113,10 @@
     data_context=data_context,
 )
 
-suite = rule_based_profiler.run(expectation_suite_name="test_suite_name")
+rule_based_profiler.run()
+suite: ExpectationSuite = rule_based_profiler.expectation_suite(
+    expectation_suite_name="test_suite_name"
+)
 print(suite)
 
 # Please note that this docstring is here to demonstrate output for docs. It is not needed for normal use.
