Commit
Merge remote-tracking branch 'upstream/develop' into feature/GREAT-735/GREAT-597/alexsherstinsky/rule_based_profiler/add_support_for_returning_parameters_and_metrics_as_rule_output-2022_04_06-83
Alex Sherstinsky committed Apr 6, 2022
2 parents 5dc633e + fbfe8bd commit 4d8458b
Showing 2 changed files with 7 additions and 150 deletions.
151 changes: 2 additions & 149 deletions great_expectations/rule_based_profiler/helpers/util.py
@@ -415,69 +415,11 @@ def compute_bootstrap_quantiles_point_estimate(
    false_positive_rate: np.float64,
    n_resamples: int,
    random_seed: Optional[int] = None,
) -> Tuple[Number, Number]:
    """The winner of our performance testing is selected from the possible candidates:
    - _compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method
    - _compute_bootstrap_quantiles_point_estimate_custom_mean_method
    - _compute_bootstrap_quantiles_point_estimate_scipy_confidence_interval_midpoint_method"""
    return _compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method(
        metric_values=metric_values,
        false_positive_rate=false_positive_rate,
        n_resamples=n_resamples,
        random_seed=random_seed,
    )
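
For illustration, a minimal usage sketch (not part of the commit) of the public helper above, assuming synthetic metric values and the import path implied by the changed file great_expectations/rule_based_profiler/helpers/util.py:

import numpy as np
from great_expectations.rule_based_profiler.helpers.util import (
    compute_bootstrap_quantiles_point_estimate,
)

metric_values: np.ndarray = np.random.default_rng(0).normal(
    loc=100.0, scale=5.0, size=500
)
lower, upper = compute_bootstrap_quantiles_point_estimate(
    metric_values=metric_values,
    false_positive_rate=np.float64(0.05),
    n_resamples=9999,
    random_seed=42,
)
# The two point estimates bracket roughly 95% of the metric distribution.
print(lower, upper)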


def _compute_bootstrap_quantiles_point_estimate_custom_mean_method(
    metric_values: np.ndarray,
    false_positive_rate: np.float64,
    n_resamples: int,
    random_seed: Optional[int] = None,
) -> Tuple[Number, Number]:
    """
    An internal implementation of the "bootstrap" estimator method, returning a point estimate for a population
    parameter of interest (lower and upper quantiles in this case). See
    https://en.wikipedia.org/wiki/Bootstrapping_(statistics) for an introduction to "bootstrapping" in statistics.

    This implementation has been replaced by "_compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method"
    and only remains to demonstrate the performance improvement achieved by correcting for bias. Upon the implementation
    of a Machine Learning Lifecycle framework, the performance improvement can be documented and this legacy method can
    be removed from the codebase.

    ML Flow Experiment: parameter_builders_bootstrap/bootstrap_quantiles
    ML Flow Experiment ID: 4129654509298109
    """
    if random_seed:
        random_state: np.random.Generator = np.random.Generator(
            np.random.PCG64(random_seed)
        )
        bootstraps: np.ndarray = random_state.choice(
            metric_values, size=(n_resamples, metric_values.size)
        )
    else:
        bootstraps: np.ndarray = np.random.choice(
            metric_values, size=(n_resamples, metric_values.size)
        )

    lower_quantiles: Union[np.ndarray, Number] = np.quantile(
        bootstraps,
        q=false_positive_rate / 2,
        axis=1,
    )
    lower_quantile_point_estimate: Number = np.mean(lower_quantiles)
    upper_quantiles: Union[np.ndarray, Number] = np.quantile(
        bootstraps,
        q=1.0 - (false_positive_rate / 2),
        axis=1,
    )
    upper_quantile_point_estimate: Number = np.mean(upper_quantiles)
    return lower_quantile_point_estimate, upper_quantile_point_estimate

def _compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method(
    metric_values: np.ndarray,
    false_positive_rate: np.float64,
    n_resamples: int,
    random_seed: Optional[int] = None,
) -> Tuple[Number, Number]:
    """
    An internal implementation of the "bootstrap" estimator method, returning a point estimate for a population
    parameter of interest (lower and upper quantiles in this case). See
    https://en.wikipedia.org/wiki/Bootstrapping_(statistics) for an introduction to "bootstrapping" in statistics.
@@ -604,92 +546,3 @@ def _compute_bootstrap_quantiles_point_estimate_custom_bias_corrected_method(
        lower_quantile_bias_corrected_point_estimate,
        upper_quantile_bias_corrected_point_estimate,
    )
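
The bias-correction arithmetic itself sits in the collapsed hunk above. As an illustrative sketch of the general technique only (the textbook bootstrap bias correction, not necessarily the exact computation hidden in the collapsed lines):

import numpy as np

def bias_corrected_quantile_estimate(
    metric_values: np.ndarray,
    q: float,
    n_resamples: int = 9999,
    random_seed: int = 42,
) -> float:
    """Hypothetical helper: subtract the estimated bootstrap bias from the sample quantile."""
    rng = np.random.default_rng(random_seed)
    sample_quantile = np.quantile(metric_values, q=q)
    bootstraps = rng.choice(metric_values, size=(n_resamples, metric_values.size))
    bootstrap_quantiles = np.quantile(bootstraps, q=q, axis=1)
    # bias = E[theta_hat_boot] - theta_hat; the corrected estimate subtracts it.
    bias = np.mean(bootstrap_quantiles) - sample_quantile
    return float(sample_quantile - bias)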


def _compute_bootstrap_quantiles_point_estimate_scipy_confidence_interval_midpoint_method(
    metric_values: np.ndarray,
    false_positive_rate: np.float64,
    n_resamples: int,
    method: Optional[str] = "BCa",
    random_seed: Optional[int] = None,
) -> Tuple[np.float64, np.float64]:
    """
    SciPy implementation of the BCa confidence interval for the population quantile. Unfortunately, as of
    March 4th, 2022, this implementation has two issues:
    1) It only returns a confidence interval and not a point estimate for the population parameter of interest,
       which is what we require for our use cases (the attempt below tries to "back out" the statistic from the
       confidence interval by taking the midpoint of the interval).
    2) It cannot handle multi-dimensional statistics and correct for bias simultaneously; you must use either
       one feature or the other.

    This implementation could only be used if Great Expectations drops support for Python 3.6, thereby enabling us
    to use a more up-to-date version of the "scipy" Python package (the currently used version does not have
    "bootstrap"). Also, as discussed above, two contributions would need to be made to the SciPy package to enable
    1) bias correction for multi-dimensional statistics and 2) a return value of a point estimate for the population
    parameter of interest (lower and upper quantiles in this case).
    """
    bootstraps: tuple = (metric_values,)  # bootstrap samples must be in a sequence

    if random_seed:
        random_state = np.random.Generator(np.random.PCG64(random_seed))
    else:
        random_state = None

    lower_quantile_bootstrap_result: stats._bootstrap.BootstrapResult = stats.bootstrap(
        bootstraps,
        lambda data: np.quantile(
            data,
            q=false_positive_rate / 2,
        ),
        vectorized=False,
        confidence_level=1.0 - false_positive_rate,
        n_resamples=n_resamples,
        method=method,
        random_state=random_state,
    )
    upper_quantile_bootstrap_result: stats._bootstrap.BootstrapResult = stats.bootstrap(
        bootstraps,
        lambda data: np.quantile(
            data,
            q=1.0 - (false_positive_rate / 2),
        ),
        vectorized=False,
        confidence_level=1.0 - false_positive_rate,
        n_resamples=n_resamples,
        method=method,
        random_state=random_state,
    )

    # The idea that we can take the midpoint of the confidence interval rests on the assumption that the
    # confidence interval was built from a symmetrical distribution. We expect the distribution to be normal due to
    # the implications of the Central Limit Theorem (CLT) (https://en.wikipedia.org/wiki/Central_limit_theorem) on
    # the bootstrap samples.
    # Unfortunately, the assumption that the CLT applies does not hold in all cases. The bias-corrected and accelerated
    # (BCa) confidence interval computed using scipy.stats.bootstrap attempts to compute the "acceleration" as a
    # correction, because the standard normal approximation (CLT) assumes that the standard error of the bootstrap
    # quantiles (theta-hat) is the same for all parameters (theta). The acceleration (which is the rate of change of
    # the standard error of the quantile point estimate) is not a perfect correction, and therefore the assumption that
    # this interval is built from a normal distribution does not always hold.
    # See:
    #
    # Efron, B., & Tibshirani, R. J. (1993). The BCa method. An Introduction to the Bootstrap (pp. 184-188).
    # Springer Science and Business Media Dordrecht. DOI 10.1007/978-1-4899-4541-9
    #
    # for an in-depth look at how the BCa interval is constructed; the points made above appear on page 186.
    lower_quantile_confidence_interval: stats._bootstrap.BootstrapResult.ConfidenceInterval = (
        lower_quantile_bootstrap_result.confidence_interval
    )
    lower_quantile_point_estimate: np.float64 = np.mean(
        [
            lower_quantile_confidence_interval.low,
            lower_quantile_confidence_interval.high,
        ]
    )
    upper_quantile_confidence_interval: stats._bootstrap.BootstrapResult.ConfidenceInterval = (
        upper_quantile_bootstrap_result.confidence_interval
    )
    upper_quantile_point_estimate: np.float64 = np.mean(
        [upper_quantile_confidence_interval.low, upper_quantile_confidence_interval.high]
    )

    return lower_quantile_point_estimate, upper_quantile_point_estimate
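
The comment block above notes that the midpoint is only trustworthy when the bootstrap distribution is symmetric. A minimal illustration (not part of the commit; it uses a plain percentile interval rather than BCa) of how the midpoint can drift from the sample quantile on skewed data:

import numpy as np

rng = np.random.default_rng(42)
sample = rng.exponential(scale=1.0, size=200)  # deliberately skewed data

# Percentile-bootstrap 95% interval for the median, then its midpoint.
bootstraps = rng.choice(sample, size=(9999, sample.size))
boot_medians = np.quantile(bootstraps, q=0.5, axis=1)
ci_low, ci_high = np.quantile(boot_medians, [0.025, 0.975])
midpoint = (ci_low + ci_high) / 2.0

print(f"sample median: {np.quantile(sample, 0.5):.4f}")
print(f"CI midpoint:   {midpoint:.4f}")  # close, but not identical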

6 changes: 5 additions & 1 deletion (second changed file)
@@ -1,6 +1,7 @@
 from ruamel import yaml
 
 from great_expectations import DataContext
+from great_expectations.core import ExpectationSuite
 from great_expectations.rule_based_profiler.rule_based_profiler import RuleBasedProfiler
 
 profiler_config = r"""
@@ -112,7 +113,10 @@
     data_context=data_context,
 )
 
-suite = rule_based_profiler.run(expectation_suite_name="test_suite_name")
+rule_based_profiler.run()
+suite: ExpectationSuite = rule_based_profiler.expectation_suite(
+    expectation_suite_name="test_suite_name"
+)
 print(suite)
 
 # Please note that this docstring is here to demonstrate output for docs. It is not needed for normal use.
