diff --git a/deepchecks/tabular/checks/distribution/train_test_feature_drift.py b/deepchecks/tabular/checks/distribution/train_test_feature_drift.py index adfc461d93..5abda7e719 100644 --- a/deepchecks/tabular/checks/distribution/train_test_feature_drift.py +++ b/deepchecks/tabular/checks/distribution/train_test_feature_drift.py @@ -49,6 +49,10 @@ class TrainTestFeatureDrift(TrainTestCheck): sort_feature_by : str , default: feature importance Indicates how features will be sorted. Can be either "feature importance" or "drift score" + margin_quantile_filter: float, default: 0 + float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered + out of the EMD calculation. This is done in order for extreme values not to affect the calculation + disproportionally. This filter is applied to both distributions, in both margins. max_num_categories_for_drift: int, default: 10 Only for categorical columns. Max number of allowed categories. If there are more, they are binned into an "Other" category. If None, there is no limit. @@ -74,6 +78,7 @@ def __init__( ignore_columns: Union[Hashable, List[Hashable], None] = None, n_top_columns: int = 5, sort_feature_by: str = 'feature importance', + margin_quantile_filter: float = 0, max_num_categories_for_drift: int = 10, max_num_categories_for_display: int = 10, show_categories_by: str = 'train_largest', @@ -85,6 +90,7 @@ def __init__( super().__init__(**kwargs) self.columns = columns self.ignore_columns = ignore_columns + self.margin_quantile_filter = margin_quantile_filter if max_num_categories is not None: warnings.warn( f'{self.__class__.__name__}: max_num_categories is deprecated. 
please use max_num_categories_for_drift ' @@ -159,6 +165,7 @@ def run_logic(self, context: Context) -> CheckResult: value_name=column, column_type=column_type, plot_title=plot_title, + margin_quantile_filter=self.margin_quantile_filter, max_num_categories_for_drift=self.max_num_categories_for_drift, max_num_categories_for_display=self.max_num_categories_for_display, show_categories_by=self.show_categories_by diff --git a/deepchecks/tabular/checks/distribution/train_test_label_drift.py b/deepchecks/tabular/checks/distribution/train_test_label_drift.py index aa18f680c3..c3237a8181 100644 --- a/deepchecks/tabular/checks/distribution/train_test_label_drift.py +++ b/deepchecks/tabular/checks/distribution/train_test_label_drift.py @@ -34,6 +34,10 @@ class TrainTestLabelDrift(TrainTestCheck): Parameters ---------- + margin_quantile_filter: float, default: 0 + float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered + out of the EMD calculation. This is done in order for extreme values not to affect the calculation + disproportionally. This filter is applied to both distributions, in both margins. max_num_categories_for_drift: int, default: 10 Only for categorical columns. Max number of allowed categories. If there are more, they are binned into an "Other" category. If None, there is no limit. @@ -51,6 +55,7 @@ class TrainTestLabelDrift(TrainTestCheck): def __init__( self, + margin_quantile_filter: float = 0, max_num_categories_for_drift: int = 10, max_num_categories_for_display: int = 10, show_categories_by: str = 'train_largest', @@ -58,6 +63,7 @@ def __init__( **kwargs ): super().__init__(**kwargs) + self.margin_quantile_filter = margin_quantile_filter if max_num_categories is not None: warnings.warn( 'max_num_categories is deprecated. 
please use max_num_categories_for_drift and ' @@ -87,6 +93,7 @@ def run_logic(self, context: Context) -> CheckResult: test_column=test_dataset.label_col, value_name=train_dataset.label_name, column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical', + margin_quantile_filter=self.margin_quantile_filter, max_num_categories_for_drift=self.max_num_categories_for_drift, max_num_categories_for_display=self.max_num_categories_for_display, show_categories_by=self.show_categories_by diff --git a/deepchecks/tabular/checks/distribution/train_test_prediction_drift.py b/deepchecks/tabular/checks/distribution/train_test_prediction_drift.py index 513e9e876d..7793d0442c 100644 --- a/deepchecks/tabular/checks/distribution/train_test_prediction_drift.py +++ b/deepchecks/tabular/checks/distribution/train_test_prediction_drift.py @@ -37,6 +37,10 @@ class TrainTestPredictionDrift(TrainTestCheck): Parameters ---------- + margin_quantile_filter: float, default: 0 + float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered + out of the EMD calculation. This is done in order for extreme values not to affect the calculation + disproportionally. This filter is applied to both distributions, in both margins. max_num_categories_for_drift: int, default: 10 Only for categorical columns. Max number of allowed categories. If there are more, they are binned into an "Other" category. If None, there is no limit. @@ -54,6 +58,7 @@ class TrainTestPredictionDrift(TrainTestCheck): def __init__( self, + margin_quantile_filter: float = 0, max_num_categories_for_drift: int = 10, max_num_categories_for_display: int = 10, show_categories_by: str = 'train_largest', @@ -61,6 +66,7 @@ def __init__( **kwargs ): super().__init__(**kwargs) + self.margin_quantile_filter = margin_quantile_filter if max_num_categories is not None: warnings.warn( f'{self.__class__.__name__}: max_num_categories is deprecated. 
please use max_num_categories_for_drift ' @@ -94,6 +100,7 @@ def run_logic(self, context: Context) -> CheckResult: test_column=pd.Series(test_prediction), value_name='model predictions', column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical', + margin_quantile_filter=self.margin_quantile_filter, max_num_categories_for_drift=self.max_num_categories_for_drift, max_num_categories_for_display=self.max_num_categories_for_display, show_categories_by=self.show_categories_by diff --git a/deepchecks/utils/distribution/drift.py b/deepchecks/utils/distribution/drift.py index 77d55e9354..55f1fc860d 100644 --- a/deepchecks/utils/distribution/drift.py +++ b/deepchecks/utils/distribution/drift.py @@ -9,7 +9,7 @@ # ---------------------------------------------------------------------------- # """Common utilities for distribution checks.""" - +from numbers import Number from typing import Callable, Hashable, Optional, Tuple, Union import numpy as np @@ -59,7 +59,8 @@ def psi(expected_percents: np.ndarray, actual_percents: np.ndarray): return psi_value -def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.ndarray, pd.Series]): +def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.ndarray, pd.Series], + margin_quantile_filter: float): """ Calculate the Earth Movers Distance (Wasserstein distance). @@ -69,27 +70,42 @@ def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.n Parameters ---------- - dist1 : Union[np.ndarray, pd.Series] + dist1: Union[np.ndarray, pd.Series] array of numberical values. - dist2 : Union[np.ndarray, pd.Series] + dist2: Union[np.ndarray, pd.Series] array of numberical values to compare dist1 to. + margin_quantile_filter: float + float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered + out of the EMD calculation. 
This is done in order for extreme values not to affect the calculation + disproportionally. This filter is applied to both distributions, in both margins. Returns ------- Any the Wasserstein distance between the two distributions. + Raises + ------- + DeepchecksValueError + if the value of margin_quantile_filter is not in range [0, 0.5) + """ - unique1 = np.unique(dist1) - unique2 = np.unique(dist2) + if not isinstance(margin_quantile_filter, Number) or margin_quantile_filter < 0 or margin_quantile_filter >= 0.5: + raise DeepchecksValueError( + f'margin_quantile_filter expected a value in range [0, 0.5), instead got {margin_quantile_filter}') - sample_space = list(set(unique1).union(set(unique2))) + if margin_quantile_filter != 0: + dist1_qt_min, dist1_qt_max = np.quantile(dist1, [margin_quantile_filter, 1-margin_quantile_filter]) + dist2_qt_min, dist2_qt_max = np.quantile(dist2, [margin_quantile_filter, 1-margin_quantile_filter]) + dist1 = dist1[(dist1_qt_max >= dist1) & (dist1 >= dist1_qt_min)] + dist2 = dist2[(dist2_qt_max >= dist2) & (dist2 >= dist2_qt_min)] - val_max = max(sample_space) - val_min = min(sample_space) + val_max = np.max([np.max(dist1), np.max(dist2)]) + val_min = np.min([np.min(dist1), np.min(dist2)]) if val_max == val_min: return 0 + # Scale the distribution between 0 and 1: dist1 = (dist1 - val_min) / (val_max - val_min) dist2 = (dist2 - val_min) / (val_max - val_min) @@ -101,6 +117,7 @@ def calc_drift_and_plot(train_column: pd.Series, value_name: Hashable, column_type: str, plot_title: Optional[str] = None, + margin_quantile_filter: float = 0, max_num_categories_for_drift: int = 10, max_num_categories_for_display: int = 10, show_categories_by: str = 'train_largest', @@ -120,6 +137,10 @@ def calc_drift_and_plot(train_column: pd.Series, type of column (either "numerical" or "categorical") plot_title: str or None if None use value_name as title otherwise use this. 
+ margin_quantile_filter: float, default: 0 + float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered + out of the EMD calculation. This is done in order for extreme values not to affect the calculation + disproportionally. This filter is applied to both distributions, in both margins. max_num_categories_for_drift: int, default: 10 Max number of allowed categories. If there are more, they are binned into an "Other" category. max_num_categories_for_display: int, default: 10 @@ -152,7 +173,7 @@ def calc_drift_and_plot(train_column: pd.Series, train_dist = train_dist.astype('float') test_dist = test_dist.astype('float') - score = earth_movers_distance(dist1=train_dist, dist2=test_dist) + score = earth_movers_distance(dist1=train_dist, dist2=test_dist, margin_quantile_filter=margin_quantile_filter) bar_traces, bar_x_axis, bar_y_axis = drift_score_bar_traces(score) dist_traces, dist_x_axis, dist_y_axis = feature_distribution_traces(train_dist, test_dist, value_name) diff --git a/deepchecks/vision/checks/distribution/image_property_drift.py b/deepchecks/vision/checks/distribution/image_property_drift.py index 235fb717fb..a4636fbc89 100644 --- a/deepchecks/vision/checks/distribution/image_property_drift.py +++ b/deepchecks/vision/checks/distribution/image_property_drift.py @@ -45,6 +45,10 @@ class ImagePropertyDrift(TrainTestCheck): List of properties. Replaces the default deepchecks properties. Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str), representing attributes of said method. 'output_type' must be one of 'continuous'/'discrete' + margin_quantile_filter: float, default: 0 + float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered + out of the EMD calculation. This is done in order for extreme values not to affect the calculation + disproportionally. 
This filter is applied to both distributions, in both margins. max_num_categories_for_drift: int, default: 10 Only for non-continuous properties. Max number of allowed categories. If there are more, they are binned into an "Other" category. If None, there is no limit. @@ -68,6 +72,7 @@ class ImagePropertyDrift(TrainTestCheck): def __init__( self, image_properties: t.List[t.Dict[str, t.Any]] = None, + margin_quantile_filter: float = 0, max_num_categories_for_drift: int = 10, max_num_categories_for_display: int = 10, show_categories_by: str = 'train_largest', @@ -83,6 +88,7 @@ def __init__( else: self.image_properties = default_image_properties + self.margin_quantile_filter = margin_quantile_filter if max_num_categories is not None: warnings.warn( f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift ' @@ -181,6 +187,7 @@ def compute(self, context: Context) -> CheckResult: test_column=df_test[property_name], value_name=property_name, column_type=get_column_type(single_property['output_type']), + margin_quantile_filter=self.margin_quantile_filter, max_num_categories_for_drift=self.max_num_categories_for_drift, max_num_categories_for_display=self.max_num_categories_for_display, show_categories_by=self.show_categories_by, diff --git a/deepchecks/vision/checks/distribution/train_test_label_drift.py b/deepchecks/vision/checks/distribution/train_test_label_drift.py index 8ed421f71e..786c662baa 100644 --- a/deepchecks/vision/checks/distribution/train_test_label_drift.py +++ b/deepchecks/vision/checks/distribution/train_test_label_drift.py @@ -60,6 +60,10 @@ class TrainTestLabelDrift(TrainTestCheck): List of properties. Replaces the default deepchecks properties. Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str), representing attributes of said method. 
'output_type' must be one of 'continuous'/'discrete'/'class_id' + margin_quantile_filter: float, default: 0 + float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered + out of the EMD calculation. This is done in order for extreme values not to affect the calculation + disproportionally. This filter is applied to both distributions, in both margins. max_num_categories_for_drift: int, default: 10 Only for non-continuous properties. Max number of allowed categories. If there are more, they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies @@ -79,6 +83,7 @@ class TrainTestLabelDrift(TrainTestCheck): def __init__( self, label_properties: List[Dict[str, Any]] = None, + margin_quantile_filter: float = 0, max_num_categories_for_drift: int = 10, max_num_categories_for_display: int = 10, show_categories_by: str = 'train_largest', @@ -90,6 +95,7 @@ def __init__( if label_properties is not None: validate_properties(label_properties) self.user_label_properties = label_properties + self.margin_quantile_filter = margin_quantile_filter if max_num_categories is not None: warnings.warn( f'{self.__class__.__name__}: max_num_categories is deprecated. 
please use max_num_categories_for_drift ' @@ -178,6 +184,7 @@ def compute(self, context: Context) -> CheckResult: test_column=pd.Series(self._test_label_properties[name]), value_name=name, column_type=get_column_type(output_type), + margin_quantile_filter=self.margin_quantile_filter, max_num_categories_for_drift=self.max_num_categories_for_drift, max_num_categories_for_display=self.max_num_categories_for_display, show_categories_by=self.show_categories_by diff --git a/deepchecks/vision/checks/distribution/train_test_prediction_drift.py b/deepchecks/vision/checks/distribution/train_test_prediction_drift.py index 6ba93b6825..66d60b38d2 100644 --- a/deepchecks/vision/checks/distribution/train_test_prediction_drift.py +++ b/deepchecks/vision/checks/distribution/train_test_prediction_drift.py @@ -63,6 +63,10 @@ class TrainTestPredictionDrift(TrainTestCheck): List of properties. Replaces the default deepchecks properties. Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str), representing attributes of said method. 'output_type' must be one of 'continuous'/'discrete'/'class_id' + margin_quantile_filter: float, default: 0 + float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered + out of the EMD calculation. This is done in order for extreme values not to affect the calculation + disproportionally. This filter is applied to both distributions, in both margins. max_num_categories_for_drift: int, default: 10 Only for non-continues columns. Max number of allowed categories. If there are more, they are binned into an "Other" category. If None, there is no limit. 
@@ -81,6 +85,7 @@ class TrainTestPredictionDrift(TrainTestCheck): def __init__( self, prediction_properties: List[Dict[str, Any]] = None, + margin_quantile_filter: float = 0, max_num_categories_for_drift: int = 10, max_num_categories_for_display: int = 10, show_categories_by: str = 'train_largest', @@ -92,6 +97,7 @@ def __init__( if prediction_properties is not None: validate_properties(prediction_properties) self.user_prediction_properties = prediction_properties + self.margin_quantile_filter = margin_quantile_filter if max_num_categories is not None: warnings.warn( f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift ' @@ -178,6 +184,7 @@ def compute(self, context: Context) -> CheckResult: test_column=pd.Series(self._test_prediction_properties[name]), value_name=name, column_type=get_column_type(output_type), + margin_quantile_filter=self.margin_quantile_filter, max_num_categories_for_drift=self.max_num_categories_for_drift, max_num_categories_for_display=self.max_num_categories_for_display, show_categories_by=self.show_categories_by diff --git a/tests/utils/drift_test.py b/tests/utils/drift_test.py new file mode 100644 index 0000000000..c4e420b636 --- /dev/null +++ b/tests/utils/drift_test.py @@ -0,0 +1,56 @@ +# ---------------------------------------------------------------------------- +# Copyright (C) 2021-2022 Deepchecks (https://www.deepchecks.com) +# +# This file is part of Deepchecks. +# Deepchecks is distributed under the terms of the GNU Affero General +# Public License (version 3 or later). +# You should have received a copy of the GNU Affero General Public License +# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>. 
+# ---------------------------------------------------------------------------- +# +"""Test drift utils""" +from hamcrest import assert_that, equal_to, raises, close_to, calling + +from deepchecks.core.errors import DeepchecksValueError +from deepchecks.utils.distribution.drift import earth_movers_distance + +import numpy as np + + +def test_emd(): + dist1 = np.ones(100) + dist2 = np.zeros(100) + res = earth_movers_distance(dist1=dist1, dist2=dist2, margin_quantile_filter=0) + assert_that(res, equal_to(1)) + + +def test_real_input(): + # Move half of the dirt (0-50) to 2/3 of the distance (100-150) with the middle (50-100) staying unmoved. + # Therefore, result should be 1/2 * 2/3 = 1/3 + dist1 = np.array(range(100)) + dist2 = np.array(range(50, 150)) + res = earth_movers_distance(dist1=dist1, dist2=dist2, margin_quantile_filter=0) + assert_that(res, close_to(0.33, 0.01)) + + +def test_emd_scaling(): + dist1 = np.ones(100) * 10 + dist2 = np.zeros(100) + res = earth_movers_distance(dist1=dist1, dist2=dist2, margin_quantile_filter=0) + assert_that(res, equal_to(1)) + + +def test_emd_margin_filter(): + dist1 = np.concatenate([np.ones(99) * 10, np.ones(1) * 100]) + dist2 = np.concatenate([np.zeros(99), np.ones(1)]) + res = earth_movers_distance(dist1=dist1, dist2=dist2, margin_quantile_filter=0.01) + assert_that(res, equal_to(1)) + + +def test_emd_raises_exception(): + dist1 = np.ones(100) + dist2 = np.zeros(100) + assert_that( + calling(earth_movers_distance).with_args(dist1, dist2, -1), + raises(DeepchecksValueError, r'margin_quantile_filter expected a value in range \[0, 0.5\), instead got -1') + )