Added margin_quantile_filter to EMD drift score (#1368)

* Added margin_quantile_filter to EMD and to the checks that use it. Added tests for EMD. For now, the default value of margin_quantile_filter is 0, so existing drift scores do not change. * Fixed a comment and added the raises test that had been forgotten. * Removed a file.
deepchecks · May 3, 2022 · 3f09614 · 3f09614
1 parent b708e14
commit 3f09614
Show file tree

Hide file tree

Showing 8 changed files with 129 additions and 10 deletions.
diff --git a/deepchecks/tabular/checks/distribution/train_test_feature_drift.py b/deepchecks/tabular/checks/distribution/train_test_feature_drift.py
@@ -49,6 +49,10 @@ class TrainTestFeatureDrift(TrainTestCheck):
     sort_feature_by : str , default: feature importance
         Indicates how features will be sorted. Can be either "feature importance"
         or "drift score"
+    margin_quantile_filter: float, default: 0
+        float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
+        out of the EMD calculation. This is done in order for extreme values not to affect the calculation
+        disproportionally. This filter is applied to both distributions, in both margins.
     max_num_categories_for_drift: int, default: 10
         Only for categorical columns. Max number of allowed categories. If there are more,
         they are binned into an "Other" category. If None, there is no limit.
@@ -74,6 +78,7 @@ def __init__(
             ignore_columns: Union[Hashable, List[Hashable], None] = None,
             n_top_columns: int = 5,
             sort_feature_by: str = 'feature importance',
+            margin_quantile_filter: float = 0,
             max_num_categories_for_drift: int = 10,
             max_num_categories_for_display: int = 10,
             show_categories_by: str = 'train_largest',
@@ -85,6 +90,7 @@ def __init__(
         super().__init__(**kwargs)
         self.columns = columns
         self.ignore_columns = ignore_columns
+        self.margin_quantile_filter = margin_quantile_filter
         if max_num_categories is not None:
             warnings.warn(
                 f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
@@ -159,6 +165,7 @@ def run_logic(self, context: Context) -> CheckResult:
                 value_name=column,
                 column_type=column_type,
                 plot_title=plot_title,
+                margin_quantile_filter=self.margin_quantile_filter,
                 max_num_categories_for_drift=self.max_num_categories_for_drift,
                 max_num_categories_for_display=self.max_num_categories_for_display,
                 show_categories_by=self.show_categories_by

diff --git a/deepchecks/tabular/checks/distribution/train_test_label_drift.py b/deepchecks/tabular/checks/distribution/train_test_label_drift.py
@@ -34,6 +34,10 @@ class TrainTestLabelDrift(TrainTestCheck):
 
     Parameters
     ----------
+    margin_quantile_filter: float, default: 0
+        float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
+        out of the EMD calculation. This is done in order for extreme values not to affect the calculation
+        disproportionally. This filter is applied to both distributions, in both margins.
     max_num_categories_for_drift: int, default: 10
         Only for categorical columns. Max number of allowed categories. If there are more,
         they are binned into an "Other" category. If None, there is no limit.
@@ -51,13 +55,15 @@ class TrainTestLabelDrift(TrainTestCheck):
 
     def __init__(
             self,
+            margin_quantile_filter: float = 0,
             max_num_categories_for_drift: int = 10,
             max_num_categories_for_display: int = 10,
             show_categories_by: str = 'train_largest',
             max_num_categories: int = None,
             **kwargs
     ):
         super().__init__(**kwargs)
+        self.margin_quantile_filter = margin_quantile_filter
         if max_num_categories is not None:
             warnings.warn(
                 'max_num_categories is deprecated. please use max_num_categories_for_drift and '
@@ -87,6 +93,7 @@ def run_logic(self, context: Context) -> CheckResult:
             test_column=test_dataset.label_col,
             value_name=train_dataset.label_name,
             column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical',
+            margin_quantile_filter=self.margin_quantile_filter,
             max_num_categories_for_drift=self.max_num_categories_for_drift,
             max_num_categories_for_display=self.max_num_categories_for_display,
             show_categories_by=self.show_categories_by

diff --git a/deepchecks/tabular/checks/distribution/train_test_prediction_drift.py b/deepchecks/tabular/checks/distribution/train_test_prediction_drift.py
@@ -37,6 +37,10 @@ class TrainTestPredictionDrift(TrainTestCheck):
 
     Parameters
     ----------
+    margin_quantile_filter: float, default: 0
+        float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
+        out of the EMD calculation. This is done in order for extreme values not to affect the calculation
+        disproportionally. This filter is applied to both distributions, in both margins.
     max_num_categories_for_drift: int, default: 10
         Only for categorical columns. Max number of allowed categories. If there are more,
         they are binned into an "Other" category. If None, there is no limit.
@@ -54,13 +58,15 @@ class TrainTestPredictionDrift(TrainTestCheck):
 
     def __init__(
             self,
+            margin_quantile_filter: float = 0,
             max_num_categories_for_drift: int = 10,
             max_num_categories_for_display: int = 10,
             show_categories_by: str = 'train_largest',
             max_num_categories: int = None,  # Deprecated
             **kwargs
     ):
         super().__init__(**kwargs)
+        self.margin_quantile_filter = margin_quantile_filter
         if max_num_categories is not None:
             warnings.warn(
                 f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
@@ -94,6 +100,7 @@ def run_logic(self, context: Context) -> CheckResult:
             test_column=pd.Series(test_prediction),
             value_name='model predictions',
             column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical',
+            margin_quantile_filter=self.margin_quantile_filter,
             max_num_categories_for_drift=self.max_num_categories_for_drift,
             max_num_categories_for_display=self.max_num_categories_for_display,
             show_categories_by=self.show_categories_by

diff --git a/deepchecks/utils/distribution/drift.py b/deepchecks/utils/distribution/drift.py
@@ -9,7 +9,7 @@
 # ----------------------------------------------------------------------------
 #
 """Common utilities for distribution checks."""
-
+from numbers import Number
 from typing import Callable, Hashable, Optional, Tuple, Union
 
 import numpy as np
@@ -59,7 +59,8 @@ def psi(expected_percents: np.ndarray, actual_percents: np.ndarray):
     return psi_value
 
 
-def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.ndarray, pd.Series]):
+def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.ndarray, pd.Series],
+                          margin_quantile_filter: float):
     """
     Calculate the Earth Movers Distance (Wasserstein distance).
 
@@ -69,27 +70,42 @@ def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.n
 
     Parameters
     ----------
-    dist1 : Union[np.ndarray, pd.Series]
+    dist1: Union[np.ndarray, pd.Series]
         array of numberical values.
-    dist2 : Union[np.ndarray, pd.Series]
+    dist2: Union[np.ndarray, pd.Series]
         array of numberical values to compare dist1 to.
+    margin_quantile_filter: float
+        float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
+        out of the EMD calculation. This is done in order for extreme values not to affect the calculation
+        disproportionally. This filter is applied to both distributions, in both margins.
     Returns
     -------
     Any
         the Wasserstein distance between the two distributions.
 
+    Raises
+    ------
+    DeepchecksValueError
+        if the value of margin_quantile_filter is not in range [0, 0.5)
+
     """
-    unique1 = np.unique(dist1)
-    unique2 = np.unique(dist2)
+    if not isinstance(margin_quantile_filter, Number) or margin_quantile_filter < 0 or margin_quantile_filter >= 0.5:
+        raise DeepchecksValueError(
+            f'margin_quantile_filter expected a value in range [0, 0.5), instead got {margin_quantile_filter}')
 
-    sample_space = list(set(unique1).union(set(unique2)))
+    if margin_quantile_filter != 0:
+        dist1_qt_min, dist1_qt_max = np.quantile(dist1, [margin_quantile_filter, 1-margin_quantile_filter])
+        dist2_qt_min, dist2_qt_max = np.quantile(dist2, [margin_quantile_filter, 1-margin_quantile_filter])
+        dist1 = dist1[(dist1_qt_max >= dist1) & (dist1 >= dist1_qt_min)]
+        dist2 = dist2[(dist2_qt_max >= dist2) & (dist2 >= dist2_qt_min)]
 
-    val_max = max(sample_space)
-    val_min = min(sample_space)
+    val_max = np.max([np.max(dist1), np.max(dist2)])
+    val_min = np.min([np.min(dist1), np.min(dist2)])
 
     if val_max == val_min:
         return 0
 
+    # Scale the distribution between 0 and 1:
     dist1 = (dist1 - val_min) / (val_max - val_min)
     dist2 = (dist2 - val_min) / (val_max - val_min)
 
@@ -101,6 +117,7 @@ def calc_drift_and_plot(train_column: pd.Series,
                         value_name: Hashable,
                         column_type: str,
                         plot_title: Optional[str] = None,
+                        margin_quantile_filter: float = 0,
                         max_num_categories_for_drift: int = 10,
                         max_num_categories_for_display: int = 10,
                         show_categories_by: str = 'train_largest',
@@ -120,6 +137,10 @@ def calc_drift_and_plot(train_column: pd.Series,
         type of column (either "numerical" or "categorical")
     plot_title: str or None
         if None use value_name as title otherwise use this.
+    margin_quantile_filter: float, default: 0
+        float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
+        out of the EMD calculation. This is done in order for extreme values not to affect the calculation
+        disproportionally. This filter is applied to both distributions, in both margins.
     max_num_categories_for_drift: int, default: 10
         Max number of allowed categories. If there are more, they are binned into an "Other" category.
     max_num_categories_for_display: int, default: 10
@@ -152,7 +173,7 @@ def calc_drift_and_plot(train_column: pd.Series,
         train_dist = train_dist.astype('float')
         test_dist = test_dist.astype('float')
 
-        score = earth_movers_distance(dist1=train_dist, dist2=test_dist)
+        score = earth_movers_distance(dist1=train_dist, dist2=test_dist, margin_quantile_filter=margin_quantile_filter)
         bar_traces, bar_x_axis, bar_y_axis = drift_score_bar_traces(score)
 
         dist_traces, dist_x_axis, dist_y_axis = feature_distribution_traces(train_dist, test_dist, value_name)

diff --git a/deepchecks/vision/checks/distribution/image_property_drift.py b/deepchecks/vision/checks/distribution/image_property_drift.py
@@ -45,6 +45,10 @@ class ImagePropertyDrift(TrainTestCheck):
         List of properties. Replaces the default deepchecks properties.
         Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str),
         representing attributes of said method. 'output_type' must be one of 'continuous'/'discrete'
+    margin_quantile_filter: float, default: 0
+        float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
+        out of the EMD calculation. This is done in order for extreme values not to affect the calculation
+        disproportionally. This filter is applied to both distributions, in both margins.
     max_num_categories_for_drift: int, default: 10
         Only for non-continuous properties. Max number of allowed categories. If there are more,
         they are binned into an "Other" category. If None, there is no limit.
@@ -68,6 +72,7 @@ class ImagePropertyDrift(TrainTestCheck):
     def __init__(
             self,
             image_properties: t.List[t.Dict[str, t.Any]] = None,
+            margin_quantile_filter: float = 0,
             max_num_categories_for_drift: int = 10,
             max_num_categories_for_display: int = 10,
             show_categories_by: str = 'train_largest',
@@ -83,6 +88,7 @@ def __init__(
         else:
             self.image_properties = default_image_properties
 
+        self.margin_quantile_filter = margin_quantile_filter
         if max_num_categories is not None:
             warnings.warn(
                 f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
@@ -181,6 +187,7 @@ def compute(self, context: Context) -> CheckResult:
                     test_column=df_test[property_name],
                     value_name=property_name,
                     column_type=get_column_type(single_property['output_type']),
+                    margin_quantile_filter=self.margin_quantile_filter,
                     max_num_categories_for_drift=self.max_num_categories_for_drift,
                     max_num_categories_for_display=self.max_num_categories_for_display,
                     show_categories_by=self.show_categories_by,

diff --git a/deepchecks/vision/checks/distribution/train_test_label_drift.py b/deepchecks/vision/checks/distribution/train_test_label_drift.py
@@ -60,6 +60,10 @@ class TrainTestLabelDrift(TrainTestCheck):
         List of properties. Replaces the default deepchecks properties.
         Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str),
         representing attributes of said method. 'output_type' must be one of 'continuous'/'discrete'/'class_id'
+    margin_quantile_filter: float, default: 0
+        float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
+        out of the EMD calculation. This is done in order for extreme values not to affect the calculation
+        disproportionally. This filter is applied to both distributions, in both margins.
     max_num_categories_for_drift: int, default: 10
         Only for non-continuous properties. Max number of allowed categories. If there are more,
         they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies
@@ -79,6 +83,7 @@ class TrainTestLabelDrift(TrainTestCheck):
     def __init__(
             self,
             label_properties: List[Dict[str, Any]] = None,
+            margin_quantile_filter: float = 0,
             max_num_categories_for_drift: int = 10,
             max_num_categories_for_display: int = 10,
             show_categories_by: str = 'train_largest',
@@ -90,6 +95,7 @@ def __init__(
         if label_properties is not None:
             validate_properties(label_properties)
         self.user_label_properties = label_properties
+        self.margin_quantile_filter = margin_quantile_filter
         if max_num_categories is not None:
             warnings.warn(
                 f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
@@ -178,6 +184,7 @@ def compute(self, context: Context) -> CheckResult:
                 test_column=pd.Series(self._test_label_properties[name]),
                 value_name=name,
                 column_type=get_column_type(output_type),
+                margin_quantile_filter=self.margin_quantile_filter,
                 max_num_categories_for_drift=self.max_num_categories_for_drift,
                 max_num_categories_for_display=self.max_num_categories_for_display,
                 show_categories_by=self.show_categories_by

diff --git a/deepchecks/vision/checks/distribution/train_test_prediction_drift.py b/deepchecks/vision/checks/distribution/train_test_prediction_drift.py
@@ -63,6 +63,10 @@ class TrainTestPredictionDrift(TrainTestCheck):
         List of properties. Replaces the default deepchecks properties.
         Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str),
         representing attributes of said method. 'output_type' must be one of 'continuous'/'discrete'/'class_id'
+    margin_quantile_filter: float, default: 0
+        float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
+        out of the EMD calculation. This is done in order for extreme values not to affect the calculation
+        disproportionally. This filter is applied to both distributions, in both margins.
     max_num_categories_for_drift: int, default: 10
         Only for non-continues columns. Max number of allowed categories. If there are more,
         they are binned into an "Other" category. If None, there is no limit.
@@ -81,6 +85,7 @@ class TrainTestPredictionDrift(TrainTestCheck):
     def __init__(
             self,
             prediction_properties: List[Dict[str, Any]] = None,
+            margin_quantile_filter: float = 0,
             max_num_categories_for_drift: int = 10,
             max_num_categories_for_display: int = 10,
             show_categories_by: str = 'train_largest',
@@ -92,6 +97,7 @@ def __init__(
         if prediction_properties is not None:
             validate_properties(prediction_properties)
         self.user_prediction_properties = prediction_properties
+        self.margin_quantile_filter = margin_quantile_filter
         if max_num_categories is not None:
             warnings.warn(
                 f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
@@ -178,6 +184,7 @@ def compute(self, context: Context) -> CheckResult:
                 test_column=pd.Series(self._test_prediction_properties[name]),
                 value_name=name,
                 column_type=get_column_type(output_type),
+                margin_quantile_filter=self.margin_quantile_filter,
                 max_num_categories_for_drift=self.max_num_categories_for_drift,
                 max_num_categories_for_display=self.max_num_categories_for_display,
                 show_categories_by=self.show_categories_by

diff --git a/tests/utils/drift_test.py b/tests/utils/drift_test.py
@@ -0,0 +1,56 @@
+# ----------------------------------------------------------------------------
+# Copyright (C) 2021-2022 Deepchecks (https://www.deepchecks.com)
+#
+# This file is part of Deepchecks.
+# Deepchecks is distributed under the terms of the GNU Affero General
+# Public License (version 3 or later).
+# You should have received a copy of the GNU Affero General Public License
+# along with Deepchecks.  If not, see <http://www.gnu.org/licenses/>.
+# ----------------------------------------------------------------------------
+#
+"""Test drift utils"""
+from hamcrest import assert_that, equal_to, raises, close_to, calling
+
+from deepchecks.core.errors import DeepchecksValueError
+from deepchecks.utils.distribution.drift import earth_movers_distance
+
+import numpy as np
+
+
+def test_emd():
+    dist1 = np.ones(100)
+    dist2 = np.zeros(100)
+    res = earth_movers_distance(dist1=dist1, dist2=dist2, margin_quantile_filter=0)
+    assert_that(res, equal_to(1))
+
+
+def test_real_input():
+    # dist1 spans 0-99 and dist2 spans 50-149: the overlapping half (50-99) stays put, while the other half of the
+    # dirt (0-49) moves to 100-149, i.e. 100 units, about 2/3 of the full 0-149 range. Expected: 1/2 * 2/3 = 1/3
+    dist1 = np.array(range(100))
+    dist2 = np.array(range(50, 150))
+    res = earth_movers_distance(dist1=dist1, dist2=dist2, margin_quantile_filter=0)
+    assert_that(res, close_to(0.33, 0.01))
+
+
+def test_emd_scaling():
+    dist1 = np.ones(100) * 10
+    dist2 = np.zeros(100)
+    res = earth_movers_distance(dist1=dist1, dist2=dist2, margin_quantile_filter=0)
+    assert_that(res, equal_to(1))
+
+
+def test_emd_margin_filter():
+    dist1 = np.concatenate([np.ones(99) * 10, np.ones(1) * 100])
+    dist2 = np.concatenate([np.zeros(99), np.ones(1)])
+    res = earth_movers_distance(dist1=dist1, dist2=dist2, margin_quantile_filter=0.01)
+    assert_that(res, equal_to(1))
+
+
+def test_emd_raises_exception():
+    dist1 = np.ones(100)
+    dist2 = np.zeros(100)
+    assert_that(
+        calling(earth_movers_distance).with_args(dist1, dist2, -1),
+        raises(DeepchecksValueError, r'margin_quantile_filter expected a value in range \[0, 0.5\), instead got -1')
+    )