Skip to content

Commit

Permalink
Added margin_quantile_filter to EMD drift score (#1368)
Browse files Browse the repository at this point in the history
* Added margin_quantile_filter to EMD and the checks that use it.
Added tests for EMD.
For now, the default value of margin_quantile_filter is set to 0, so existing drift scores do not change.

* Fixed comment + added raises test that was forgotten

* removed file
  • Loading branch information
nirhutnik committed May 3, 2022
1 parent b708e14 commit 3f09614
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 10 deletions.
Expand Up @@ -49,6 +49,10 @@ class TrainTestFeatureDrift(TrainTestCheck):
sort_feature_by : str , default: feature importance
Indicates how features will be sorted. Can be either "feature importance"
or "drift score"
margin_quantile_filter: float, default: 0
float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
out of the EMD calculation. This is done in order for extreme values not to affect the calculation
disproportionally. This filter is applied to both distributions, in both margins.
max_num_categories_for_drift: int, default: 10
Only for categorical columns. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If None, there is no limit.
Expand All @@ -74,6 +78,7 @@ def __init__(
ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: int = 5,
sort_feature_by: str = 'feature importance',
margin_quantile_filter: float = 0,
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
Expand All @@ -85,6 +90,7 @@ def __init__(
super().__init__(**kwargs)
self.columns = columns
self.ignore_columns = ignore_columns
self.margin_quantile_filter = margin_quantile_filter
if max_num_categories is not None:
warnings.warn(
f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
Expand Down Expand Up @@ -159,6 +165,7 @@ def run_logic(self, context: Context) -> CheckResult:
value_name=column,
column_type=column_type,
plot_title=plot_title,
margin_quantile_filter=self.margin_quantile_filter,
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by
Expand Down
Expand Up @@ -34,6 +34,10 @@ class TrainTestLabelDrift(TrainTestCheck):
Parameters
----------
margin_quantile_filter: float, default: 0
float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
out of the EMD calculation. This is done in order for extreme values not to affect the calculation
disproportionally. This filter is applied to both distributions, in both margins.
max_num_categories_for_drift: int, default: 10
Only for categorical columns. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If None, there is no limit.
Expand All @@ -51,13 +55,15 @@ class TrainTestLabelDrift(TrainTestCheck):

def __init__(
self,
margin_quantile_filter: float = 0,
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
max_num_categories: int = None,
**kwargs
):
super().__init__(**kwargs)
self.margin_quantile_filter = margin_quantile_filter
if max_num_categories is not None:
warnings.warn(
'max_num_categories is deprecated. please use max_num_categories_for_drift and '
Expand Down Expand Up @@ -87,6 +93,7 @@ def run_logic(self, context: Context) -> CheckResult:
test_column=test_dataset.label_col,
value_name=train_dataset.label_name,
column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical',
margin_quantile_filter=self.margin_quantile_filter,
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by
Expand Down
Expand Up @@ -37,6 +37,10 @@ class TrainTestPredictionDrift(TrainTestCheck):
Parameters
----------
margin_quantile_filter: float, default: 0
float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
out of the EMD calculation. This is done in order for extreme values not to affect the calculation
disproportionally. This filter is applied to both distributions, in both margins.
max_num_categories_for_drift: int, default: 10
Only for categorical columns. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If None, there is no limit.
Expand All @@ -54,13 +58,15 @@ class TrainTestPredictionDrift(TrainTestCheck):

def __init__(
self,
margin_quantile_filter: float = 0,
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
max_num_categories: int = None, # Deprecated
**kwargs
):
super().__init__(**kwargs)
self.margin_quantile_filter = margin_quantile_filter
if max_num_categories is not None:
warnings.warn(
f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
Expand Down Expand Up @@ -94,6 +100,7 @@ def run_logic(self, context: Context) -> CheckResult:
test_column=pd.Series(test_prediction),
value_name='model predictions',
column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical',
margin_quantile_filter=self.margin_quantile_filter,
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by
Expand Down
41 changes: 31 additions & 10 deletions deepchecks/utils/distribution/drift.py
Expand Up @@ -9,7 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Common utilities for distribution checks."""

from numbers import Number
from typing import Callable, Hashable, Optional, Tuple, Union

import numpy as np
Expand Down Expand Up @@ -59,7 +59,8 @@ def psi(expected_percents: np.ndarray, actual_percents: np.ndarray):
return psi_value


def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.ndarray, pd.Series]):
def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.ndarray, pd.Series],
margin_quantile_filter: float):
"""
Calculate the Earth Movers Distance (Wasserstein distance).
Expand All @@ -69,27 +70,42 @@ def earth_movers_distance(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.n
Parameters
----------
dist1 : Union[np.ndarray, pd.Series]
dist1: Union[np.ndarray, pd.Series]
array of numerical values.
dist2 : Union[np.ndarray, pd.Series]
dist2: Union[np.ndarray, pd.Series]
array of numerical values to compare dist1 to.
margin_quantile_filter: float
float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
out of the EMD calculation. This is done in order for extreme values not to affect the calculation
disproportionally. This filter is applied to both distributions, in both margins.
Returns
-------
Any
the Wasserstein distance between the two distributions.
Raises
-------
DeepchecksValueError
if the value of margin_quantile_filter is not in range [0, 0.5)
"""
unique1 = np.unique(dist1)
unique2 = np.unique(dist2)
if not isinstance(margin_quantile_filter, Number) or margin_quantile_filter < 0 or margin_quantile_filter >= 0.5:
raise DeepchecksValueError(
f'margin_quantile_filter expected a value in range [0, 0.5), instead got {margin_quantile_filter}')

sample_space = list(set(unique1).union(set(unique2)))
if margin_quantile_filter != 0:
dist1_qt_min, dist1_qt_max = np.quantile(dist1, [margin_quantile_filter, 1-margin_quantile_filter])
dist2_qt_min, dist2_qt_max = np.quantile(dist2, [margin_quantile_filter, 1-margin_quantile_filter])
dist1 = dist1[(dist1_qt_max >= dist1) & (dist1 >= dist1_qt_min)]
dist2 = dist2[(dist2_qt_max >= dist2) & (dist2 >= dist2_qt_min)]

val_max = max(sample_space)
val_min = min(sample_space)
val_max = np.max([np.max(dist1), np.max(dist2)])
val_min = np.min([np.min(dist1), np.min(dist2)])

if val_max == val_min:
return 0

# Scale the distribution between 0 and 1:
dist1 = (dist1 - val_min) / (val_max - val_min)
dist2 = (dist2 - val_min) / (val_max - val_min)

Expand All @@ -101,6 +117,7 @@ def calc_drift_and_plot(train_column: pd.Series,
value_name: Hashable,
column_type: str,
plot_title: Optional[str] = None,
margin_quantile_filter: float = 0,
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
Expand All @@ -120,6 +137,10 @@ def calc_drift_and_plot(train_column: pd.Series,
type of column (either "numerical" or "categorical")
plot_title: str or None
if None use value_name as title otherwise use this.
margin_quantile_filter: float, default: 0
float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
out of the EMD calculation. This is done in order for extreme values not to affect the calculation
disproportionally. This filter is applied to both distributions, in both margins.
max_num_categories_for_drift: int, default: 10
Max number of allowed categories. If there are more, they are binned into an "Other" category.
max_num_categories_for_display: int, default: 10
Expand Down Expand Up @@ -152,7 +173,7 @@ def calc_drift_and_plot(train_column: pd.Series,
train_dist = train_dist.astype('float')
test_dist = test_dist.astype('float')

score = earth_movers_distance(dist1=train_dist, dist2=test_dist)
score = earth_movers_distance(dist1=train_dist, dist2=test_dist, margin_quantile_filter=margin_quantile_filter)
bar_traces, bar_x_axis, bar_y_axis = drift_score_bar_traces(score)

dist_traces, dist_x_axis, dist_y_axis = feature_distribution_traces(train_dist, test_dist, value_name)
Expand Down
7 changes: 7 additions & 0 deletions deepchecks/vision/checks/distribution/image_property_drift.py
Expand Up @@ -45,6 +45,10 @@ class ImagePropertyDrift(TrainTestCheck):
List of properties. Replaces the default deepchecks properties.
Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str),
representing attributes of said method. 'output_type' must be one of 'continuous'/'discrete'
margin_quantile_filter: float, default: 0
float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
out of the EMD calculation. This is done in order for extreme values not to affect the calculation
disproportionally. This filter is applied to both distributions, in both margins.
max_num_categories_for_drift: int, default: 10
Only for non-continuous properties. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If None, there is no limit.
Expand All @@ -68,6 +72,7 @@ class ImagePropertyDrift(TrainTestCheck):
def __init__(
self,
image_properties: t.List[t.Dict[str, t.Any]] = None,
margin_quantile_filter: float = 0,
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
Expand All @@ -83,6 +88,7 @@ def __init__(
else:
self.image_properties = default_image_properties

self.margin_quantile_filter = margin_quantile_filter
if max_num_categories is not None:
warnings.warn(
f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
Expand Down Expand Up @@ -181,6 +187,7 @@ def compute(self, context: Context) -> CheckResult:
test_column=df_test[property_name],
value_name=property_name,
column_type=get_column_type(single_property['output_type']),
margin_quantile_filter=self.margin_quantile_filter,
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by,
Expand Down
Expand Up @@ -60,6 +60,10 @@ class TrainTestLabelDrift(TrainTestCheck):
List of properties. Replaces the default deepchecks properties.
Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str),
representing attributes of said method. 'output_type' must be one of 'continuous'/'discrete'/'class_id'
margin_quantile_filter: float, default: 0
float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
out of the EMD calculation. This is done in order for extreme values not to affect the calculation
disproportionally. This filter is applied to both distributions, in both margins.
max_num_categories_for_drift: int, default: 10
Only for non-continuous properties. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies
Expand All @@ -79,6 +83,7 @@ class TrainTestLabelDrift(TrainTestCheck):
def __init__(
self,
label_properties: List[Dict[str, Any]] = None,
margin_quantile_filter: float = 0,
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
Expand All @@ -90,6 +95,7 @@ def __init__(
if label_properties is not None:
validate_properties(label_properties)
self.user_label_properties = label_properties
self.margin_quantile_filter = margin_quantile_filter
if max_num_categories is not None:
warnings.warn(
f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
Expand Down Expand Up @@ -178,6 +184,7 @@ def compute(self, context: Context) -> CheckResult:
test_column=pd.Series(self._test_label_properties[name]),
value_name=name,
column_type=get_column_type(output_type),
margin_quantile_filter=self.margin_quantile_filter,
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by
Expand Down
Expand Up @@ -63,6 +63,10 @@ class TrainTestPredictionDrift(TrainTestCheck):
List of properties. Replaces the default deepchecks properties.
Each property is dictionary with keys 'name' (str), 'method' (Callable) and 'output_type' (str),
representing attributes of said method. 'output_type' must be one of 'continuous'/'discrete'/'class_id'
margin_quantile_filter: float, default: 0
float in range [0,0.5), representing which margins (high and low quantiles) of the distribution will be filtered
out of the EMD calculation. This is done in order for extreme values not to affect the calculation
disproportionally. This filter is applied to both distributions, in both margins.
max_num_categories_for_drift: int, default: 10
Only for non-continuous columns. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If None, there is no limit.
Expand All @@ -81,6 +85,7 @@ class TrainTestPredictionDrift(TrainTestCheck):
def __init__(
self,
prediction_properties: List[Dict[str, Any]] = None,
margin_quantile_filter: float = 0,
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
Expand All @@ -92,6 +97,7 @@ def __init__(
if prediction_properties is not None:
validate_properties(prediction_properties)
self.user_prediction_properties = prediction_properties
self.margin_quantile_filter = margin_quantile_filter
if max_num_categories is not None:
warnings.warn(
f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
Expand Down Expand Up @@ -178,6 +184,7 @@ def compute(self, context: Context) -> CheckResult:
test_column=pd.Series(self._test_prediction_properties[name]),
value_name=name,
column_type=get_column_type(output_type),
margin_quantile_filter=self.margin_quantile_filter,
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by
Expand Down
56 changes: 56 additions & 0 deletions tests/utils/drift_test.py
@@ -0,0 +1,56 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2022 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Test drift utils"""
from hamcrest import assert_that, equal_to, raises, close_to, calling

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.utils.distribution.drift import earth_movers_distance

import numpy as np


def test_emd():
    """Check that an all-ones sample and an all-zeros sample score an EMD of 1 with no margin filtering."""
    all_ones = np.ones(100)
    all_zeros = np.zeros(100)

    distance = earth_movers_distance(
        dist1=all_ones,
        dist2=all_zeros,
        margin_quantile_filter=0,
    )

    assert_that(distance, equal_to(1))


def test_real_input():
    """Check the EMD of two overlapping integer ranges, 0..99 versus 50..149.

    Half of the mass (the values 0-50) has to travel two thirds of the total
    span to land on 100-150, while the shared middle part (50-100) stays put.
    The expected score is therefore 1/2 * 2/3 = 1/3.
    """
    lower_range = np.arange(100)
    shifted_range = np.arange(50, 150)

    distance = earth_movers_distance(
        dist1=lower_range,
        dist2=shifted_range,
        margin_quantile_filter=0,
    )

    # Compared with a tolerance, as in the original test, since 1/3 is not exactly representable.
    assert_that(distance, close_to(0.33, 0.01))


def test_emd_scaling():
    """Check that the score is still 1 when the two constant samples are 10 apart instead of 1."""
    all_tens = np.full(100, 10.0)
    all_zeros = np.zeros(100)

    distance = earth_movers_distance(
        dist1=all_tens,
        dist2=all_zeros,
        margin_quantile_filter=0,
    )

    assert_that(distance, equal_to(1))


def test_emd_margin_filter():
    """Check that a 1% margin filter stops a single extreme value per sample from changing the score.

    Each sample is 99 identical values plus one much larger value. With
    margin_quantile_filter=0.01 the expected score is 1, the same as for
    constant samples of 10s and 0s.
    """
    tens_with_outlier = np.array([10.0] * 99 + [100.0])
    zeros_with_outlier = np.array([0.0] * 99 + [1.0])

    distance = earth_movers_distance(
        dist1=tens_with_outlier,
        dist2=zeros_with_outlier,
        margin_quantile_filter=0.01,
    )

    assert_that(distance, equal_to(1))


def test_emd_raises_exception():
    """Check that a margin_quantile_filter outside [0, 0.5) raises DeepchecksValueError.

    The filter value -1 is passed positionally as the third argument, and the
    error message is expected to echo it back.
    """
    dist1 = np.ones(100)
    dist2 = np.zeros(100)

    # The pattern is a regular expression, so the brackets, the parenthesis and
    # the dot in "0.5" are all escaped to match the message text literally.
    expected_message = r'margin_quantile_filter expected a value in range \[0, 0\.5\), instead got -1'

    assert_that(
        calling(earth_movers_distance).with_args(dist1, dist2, -1),
        raises(DeepchecksValueError, expected_message)
    )

0 comments on commit 3f09614

Please sign in to comment.