diff --git a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py
index c14bc2bca1..4c797ea56e 100644
--- a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py
+++ b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py
@@ -10,16 +10,12 @@
#
"""Module contains Prediction Drift check."""
-from typing import Dict
-
import numpy as np
-from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
+from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract
-from deepchecks.utils.distribution.drift import SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS
-from deepchecks.utils.strings import format_number
__all__ = ['PredictionDrift']
@@ -162,53 +158,5 @@ def run_logic(self, context: Context) -> CheckResult:
train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1))
test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1))
- return self.prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display,
- proba_drift, not proba_drift)
-
- def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
- max_allowed_numeric_score: float = 0.15):
- """
- Add condition - require drift score to be less than a certain threshold.
-
- The industry standard for PSI limit is above 0.2.
- There are no common industry standards for other drift methods, such as Cramer's V,
- Kolmogorov-Smirnov and Earth Mover's Distance.
- The threshold was lowered by 25% compared to property drift defaults due to the higher importance of prediction
- drift.
-
- Parameters
- ----------
- max_allowed_categorical_score: float , default: 0.15
- the max threshold for the categorical variable drift score
- max_allowed_numeric_score: float , default: 0.15
- the max threshold for the numeric variable drift score
- Returns
- -------
- ConditionResult
- False if any distribution has passed the max threshold, True otherwise
- """
-
- def condition(result: Dict) -> ConditionResult:
- drift_score_dict = result['Drift score']
- # Move to dict for easier looping
- if not isinstance(drift_score_dict, dict):
- drift_score_dict = {0: drift_score_dict}
- method = result['Method']
- has_failed = {}
- drift_score = 0
- for class_name, drift_score in drift_score_dict.items():
- has_failed[class_name] = \
- (drift_score >= max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
- (drift_score >= max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)
-
- if len(has_failed) == 1:
- details = f'Found model prediction {method} drift score of {format_number(drift_score)}'
- else:
- details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \
- f' score above threshold: {max_allowed_numeric_score}.'
- category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS
- return ConditionResult(category, details)
-
- return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
- f'numerical drift score < {max_allowed_numeric_score}',
- condition)
+ return self._prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display,
+ proba_drift, not proba_drift)
diff --git a/deepchecks/nlp/checks/train_test_validation/label_drift.py b/deepchecks/nlp/checks/train_test_validation/label_drift.py
index b9f062b37e..98ed014215 100644
--- a/deepchecks/nlp/checks/train_test_validation/label_drift.py
+++ b/deepchecks/nlp/checks/train_test_validation/label_drift.py
@@ -8,24 +8,17 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
-"""Module contains Label Drift check."""
-
-from typing import Dict
-import pandas as pd
+"""Module contains Label Drift check."""
-from deepchecks.core import CheckResult, ConditionResult
-from deepchecks.core.condition import ConditionCategory
-from deepchecks.core.errors import DeepchecksValueError
+from deepchecks.core import CheckResult
from deepchecks.nlp import Context, TrainTestCheck
-from deepchecks.utils.distribution.drift import (SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS,
- calc_drift_and_plot, get_drift_plot_sidenote)
-from deepchecks.utils.strings import format_number
+from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract
__all__ = ['LabelDrift']
-class LabelDrift(TrainTestCheck):
+class LabelDrift(TrainTestCheck, LabelDriftAbstract):
"""
Calculate label drift between train dataset and test dataset, using statistical measures.
@@ -41,15 +34,17 @@ class LabelDrift(TrainTestCheck):
small number of samples (common practice is categories with less than 5 samples).
However, in cases of a variable with many categories with few samples, it is still recommended to use Cramer's V.
+ **Note:** In case of highly imbalanced classes, it is recommended to use Cramer's V, together with setting
+ the ``balance_classes`` parameter to ``True``.
Parameters
----------
min_category_size_ratio: float, default 0.01
minimum size ratio for categories. Categories with size ratio lower than this number are binned
- into an "Other" category.
+ into an "Other" category. Ignored if balance_classes=True.
max_num_categories_for_drift: int, default: None
- Max number of allowed categories. If there are more,
- they are binned into an "Other" category. This limit applies for both drift calculation and distribution plots
+ Only for classification. Max number of allowed categories. If there are more,
+ they are binned into an "Other" category.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'largest_difference'
@@ -58,23 +53,24 @@ class LabelDrift(TrainTestCheck):
- 'train_largest': Show the largest train categories.
- 'test_largest': Show the largest test categories.
- 'largest_difference': Show the largest difference between categories.
- numerical_drift_method: str, default: "KS"
- decides which method to use on numerical variables. Possible values are:
- "EMD" for Earth Mover's Distance (EMD), "KS" for Kolmogorov-Smirnov (KS).
categorical_drift_method: str, default: "cramers_v"
decides which method to use on categorical variables. Possible values are:
"cramers_v" for Cramer's V, "PSI" for Population Stability Index (PSI).
balance_classes: bool, default: False
If True, all categories will have an equal weight in the Cramer's V score. This is useful when the categorical
variable is highly imbalanced, and we want to be alerted on changes in proportion to the category size,
- and not only to the entire dataset. Must have categorical_drift_method = "cramers_v" and
- drift_mode = "auto" or "prediction".
+ and not only to the entire dataset. Must have categorical_drift_method = "cramers_v".
        If True, the variable frequency plot will be created with a log scale on the y-axis.
- ignore_na: bool, default True
+ ignore_na: bool, default False
For categorical columns only. If True, ignores nones for categorical drift. If False, considers none as a
separate category. For numerical columns we always ignore nones.
+ min_samples : int , default: 10
+ Minimum number of samples required to calculate the drift score. If there are not enough samples for either
+ train or test, the check will raise a ``NotEnoughSamplesError`` exception.
n_samples : int , default: 100_000
Number of samples to use for drift computation and plot.
+ random_state : int , default: 42
+ Random seed for sampling.
"""
def __init__(
@@ -83,29 +79,30 @@ def __init__(
min_category_size_ratio: float = 0.01,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'largest_difference',
- numerical_drift_method: str = 'KS',
categorical_drift_method: str = 'cramers_v',
balance_classes: bool = False,
- ignore_na: bool = True,
+ ignore_na: bool = False,
+ min_samples: int = 10,
n_samples: int = 100_000,
+ random_state: int = 42,
**kwargs
):
- if show_categories_by not in ('train_largest', 'test_largest', 'largest_difference'):
- raise DeepchecksValueError(
- 'show_categories_by must be one of "train_largest", "test_largest", "largest_difference"')
super().__init__(**kwargs)
self.max_num_categories_for_drift = max_num_categories_for_drift
self.min_category_size_ratio = min_category_size_ratio
self.max_num_categories_for_display = max_num_categories_for_display
self.show_categories_by = show_categories_by
- self.numerical_drift_method = numerical_drift_method
self.categorical_drift_method = categorical_drift_method
self.balance_classes = balance_classes
self.ignore_na = ignore_na
+ self.min_samples = min_samples
self.n_samples = n_samples
+ self.random_state = random_state
def run_logic(self, context: Context) -> CheckResult:
- """Calculate drift for the label.
+ """Calculate drift for all columns.
Returns
-------
@@ -113,74 +110,8 @@ def run_logic(self, context: Context) -> CheckResult:
value: drift score.
display: label distribution graph, comparing the train and test distributions.
"""
- context.raise_if_token_classification_task(self)
- context.raise_if_multi_label_task(self)
-
- train_dataset = context.train.sample(self.n_samples, random_state=context.random_state)
- test_dataset = context.test.sample(self.n_samples, random_state=context.random_state)
-
- drift_score, method, display = calc_drift_and_plot(
- train_column=pd.Series(train_dataset.label),
- test_column=pd.Series(test_dataset.label),
- value_name='Label',
- column_type='categorical',
- max_num_categories_for_drift=self.max_num_categories_for_drift,
- min_category_size_ratio=self.min_category_size_ratio,
- max_num_categories_for_display=self.max_num_categories_for_display,
- show_categories_by=self.show_categories_by,
- numerical_drift_method=self.numerical_drift_method,
- categorical_drift_method=self.categorical_drift_method,
- balance_classes=self.balance_classes,
- ignore_na=self.ignore_na,
- with_display=context.with_display,
- dataset_names=(train_dataset.name, test_dataset.name)
- )
-
- values_dict = {'Drift score': drift_score, 'Method': method}
-
- if context.with_display:
- displays = ["""
- The Drift score is a measure for the difference between two distributions, in this check - the test
- and train distributions.
-            The check shows the drift score and distributions for the label.
- """, get_drift_plot_sidenote(self.max_num_categories_for_display, self.show_categories_by), display]
- else:
- displays = None
-
- return CheckResult(value=values_dict, display=displays, header='Train Test Label Drift')
-
- def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
- max_allowed_numeric_score: float = 0.15):
- """
- Add condition - require drift score to be less than the threshold.
-
- The industry standard for PSI limit is above 0.2.
- There are no common industry standards for other drift methods, such as Cramer's V,
- Kolmogorov-Smirnov and Earth Mover's Distance.
- The threshold was lowered by 25% compared to property drift defaults due to the higher importance of prediction
- drift.
-
- Parameters
- ----------
- max_allowed_categorical_score: float , default: 0.2
- the max threshold for the categorical variable drift score
- max_allowed_numeric_score: float , default: 0.15
- the max threshold for the numeric variable drift score
- Returns
- -------
- ConditionResult
- False if any column has passed the max threshold, True otherwise
- """
-
- def condition(result: Dict) -> ConditionResult:
- drift_score = result['Drift score']
- method = result['Method']
- has_failed = (drift_score > max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
- (drift_score > max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)
-
- details = f'Label\'s drift score {method} is {format_number(drift_score)}'
- category = ConditionCategory.FAIL if has_failed else ConditionCategory.PASS
- return ConditionResult(category, details)
+ train_dataset = context.train.sample(self.n_samples, random_state=self.random_state)
+ test_dataset = context.test.sample(self.n_samples, random_state=self.random_state)
- return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
- f'numerical drift score < {max_allowed_numeric_score} for label drift',
- condition)
+ return self._calculate_label_drift(train_dataset.label.flatten(), test_dataset.label.flatten(), 'Label',
+ 'categorical', context.with_display, (train_dataset.name, test_dataset.name))
diff --git a/deepchecks/tabular/checks/model_evaluation/prediction_drift.py b/deepchecks/tabular/checks/model_evaluation/prediction_drift.py
index f8670c440a..c13e8bbf58 100644
--- a/deepchecks/tabular/checks/model_evaluation/prediction_drift.py
+++ b/deepchecks/tabular/checks/model_evaluation/prediction_drift.py
@@ -15,14 +15,12 @@
import numpy as np
-from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
+from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.core.reduce_classes import ReduceMixin
from deepchecks.tabular import Context, TrainTestCheck
from deepchecks.tabular.utils.task_type import TaskType
from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract
-from deepchecks.utils.distribution.drift import SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS
-from deepchecks.utils.strings import format_number
__all__ = ['PredictionDrift']
@@ -193,8 +191,8 @@ def run_logic(self, context: Context) -> CheckResult:
train_pred = np.array(model.predict(train_dataset.features_columns)).reshape((-1, 1))
test_pred = np.array(model.predict(test_dataset.features_columns)).reshape((-1, 1))
- return self.prediction_drift(train_pred, test_pred, context.model_classes, context.with_display, proba_drift,
- (context.task_type != TaskType.REGRESSION) and (not proba_drift))
+ return self._prediction_drift(train_pred, test_pred, context.model_classes, context.with_display, proba_drift,
+ (context.task_type != TaskType.REGRESSION) and (not proba_drift))
def reduce_output(self, check_result: CheckResult) -> t.Dict[str, float]:
"""Return prediction drift score."""
@@ -217,51 +215,3 @@ def reduce_output(self, check_result: CheckResult) -> t.Dict[str, float]:
def greater_is_better(self):
"""Return True if the check reduce_output is better when it is greater."""
return False
-
- def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
- max_allowed_numeric_score: float = 0.15):
- """
- Add condition - require drift score to be less than a certain threshold.
-
- The industry standard for PSI limit is above 0.2.
- There are no common industry standards for other drift methods, such as Cramer's V,
- Kolmogorov-Smirnov and Earth Mover's Distance.
- The threshold was lowered by 25% compared to feature drift defaults due to the higher importance of prediction
- drift.
-
- Parameters
- ----------
- max_allowed_categorical_score: float , default: 0.15
- the max threshold for the categorical variable drift score
- max_allowed_numeric_score: float , default: 0.15
- the max threshold for the numeric variable drift score
- Returns
- -------
- ConditionResult
- False if any column has passed the max threshold, True otherwise
- """
-
- def condition(result: t.Dict) -> ConditionResult:
- drift_score_dict = result['Drift score']
- # Move to dict for easier looping
- if not isinstance(drift_score_dict, dict):
- drift_score_dict = {0: drift_score_dict}
- method = result['Method']
- has_failed = {}
- drift_score = 0
- for class_name, drift_score in drift_score_dict.items():
- has_failed[class_name] = \
- (drift_score >= max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
- (drift_score >= max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)
-
- if len(has_failed) == 1:
- details = f'Found model prediction {method} drift score of {format_number(drift_score)}'
- else:
- details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \
- f' score above threshold: {max_allowed_numeric_score}.'
- category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS
- return ConditionResult(category, details)
-
- return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
- f'numerical drift score < {max_allowed_numeric_score}',
- condition)
diff --git a/deepchecks/tabular/checks/train_test_validation/label_drift.py b/deepchecks/tabular/checks/train_test_validation/label_drift.py
index b89de4e02c..0f475b8907 100644
--- a/deepchecks/tabular/checks/train_test_validation/label_drift.py
+++ b/deepchecks/tabular/checks/train_test_validation/label_drift.py
@@ -13,19 +13,16 @@
from typing import Dict
-from deepchecks.core import CheckResult, ConditionResult
-from deepchecks.core.condition import ConditionCategory
+from deepchecks.core import CheckResult
from deepchecks.core.reduce_classes import ReduceLabelMixin
from deepchecks.tabular import Context, TrainTestCheck
from deepchecks.tabular.utils.task_type import TaskType
-from deepchecks.utils.distribution.drift import (SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS,
- calc_drift_and_plot, get_drift_plot_sidenote)
-from deepchecks.utils.strings import format_number
+from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract
__all__ = ['LabelDrift']
-class LabelDrift(TrainTestCheck, ReduceLabelMixin):
+class LabelDrift(TrainTestCheck, LabelDriftAbstract, ReduceLabelMixin):
"""
Calculate label drift between train dataset and test dataset, using statistical measures.
@@ -134,37 +131,10 @@ def run_logic(self, context: Context) -> CheckResult:
train_dataset = context.train.sample(self.n_samples, random_state=self.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=self.random_state)
- drift_score, method, display = calc_drift_and_plot(
- train_column=train_dataset.label_col,
- test_column=test_dataset.label_col,
- value_name=train_dataset.label_name,
- column_type='categorical' if context.task_type != TaskType.REGRESSION else 'numerical',
- margin_quantile_filter=self.margin_quantile_filter,
- max_num_categories_for_drift=self.max_num_categories_for_drift,
- min_category_size_ratio=self.min_category_size_ratio,
- max_num_categories_for_display=self.max_num_categories_for_display,
- show_categories_by=self.show_categories_by,
- numerical_drift_method=self.numerical_drift_method,
- categorical_drift_method=self.categorical_drift_method,
- balance_classes=self.balance_classes,
- ignore_na=self.ignore_na,
- min_samples=self.min_samples,
- raise_min_samples_error=True,
- with_display=context.with_display,
- dataset_names=(train_dataset.name, test_dataset.name)
- )
-
- values_dict = {'Drift score': drift_score, 'Method': method}
-
- if context.with_display:
- displays = ["""
- The Drift score is a measure for the difference between two distributions, in this check - the test
- and train distributions.
-            The check shows the drift score and distributions for the label.
- """, get_drift_plot_sidenote(self.max_num_categories_for_display, self.show_categories_by), display]
- else:
- displays = None
-
- return CheckResult(value=values_dict, display=displays, header='Label Drift')
+ column_type = 'categorical' if context.task_type != TaskType.REGRESSION else 'numerical'
+
+ return self._calculate_label_drift(train_dataset.label_col, test_dataset.label_col, train_dataset.label_name,
+ column_type, context.with_display, (train_dataset.name, test_dataset.name))
def reduce_output(self, check_result: CheckResult) -> Dict[str, float]:
"""Return label drift score."""
@@ -173,37 +143,3 @@ def reduce_output(self, check_result: CheckResult) -> Dict[str, float]:
def greater_is_better(self):
"""Return True if the check reduce_output is better when it is greater."""
return False
-
- def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
- max_allowed_numeric_score: float = 0.15):
- """
- Add condition - require drift score to be less than the threshold.
-
- The industry standard for PSI limit is above 0.2.
- There are no common industry standards for other drift methods, such as Cramer's V,
- Kolmogorov-Smirnov and Earth Mover's Distance.
-
- Parameters
- ----------
- max_allowed_categorical_score: float , default: 0.15
- the max threshold for the categorical variable drift score
- max_allowed_numeric_score: float , default: 0.15
- the max threshold for the numeric variable drift score
- Returns
- -------
- ConditionResult
- False if any column has passed the max threshold, True otherwise
- """
- def condition(result: Dict) -> ConditionResult:
- drift_score = result['Drift score']
- method = result['Method']
- has_failed = (drift_score > max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
- (drift_score > max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)
-
- details = f'Label\'s drift score {method} is {format_number(drift_score)}'
- category = ConditionCategory.FAIL if has_failed else ConditionCategory.PASS
- return ConditionResult(category, details)
-
- return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
- f'numerical drift score < {max_allowed_numeric_score} for label drift',
- condition)
diff --git a/deepchecks/utils/abstracts/label_drift.py b/deepchecks/utils/abstracts/label_drift.py
new file mode 100644
index 0000000000..c362dd3f5f
--- /dev/null
+++ b/deepchecks/utils/abstracts/label_drift.py
@@ -0,0 +1,104 @@
+# ----------------------------------------------------------------------------
+# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
+#
+# This file is part of Deepchecks.
+# Deepchecks is distributed under the terms of the GNU Affero General
+# Public License (version 3 or later).
+# You should have received a copy of the GNU Affero General Public License
+# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
+# ----------------------------------------------------------------------------
+#
+"""The base abstract functionality for label drift checks."""
+import abc
+import typing as t
+
+import pandas as pd
+
+from deepchecks import CheckResult, ConditionCategory, ConditionResult
+from deepchecks.utils.distribution.drift import calc_drift_and_plot, get_drift_plot_sidenote
+from deepchecks.utils.strings import format_number
+
+__all__ = ['LabelDriftAbstract']
+
+
+class LabelDriftAbstract(abc.ABC):
+ """Base class for label drift checks."""
+
+ margin_quantile_filter: float = 0.025
+ max_num_categories_for_drift: t.Optional[int]
+ min_category_size_ratio: float
+ max_num_categories_for_display: t.Optional[int]
+ show_categories_by: str
+ numerical_drift_method: str = 'KS'
+ categorical_drift_method: str
+ balance_classes: bool
+ ignore_na: bool
+ min_samples: int
+ n_samples: t.Optional[int]
+ random_state: int
+ add_condition: t.Callable[..., t.Any]
+
+ def _calculate_label_drift(self, train_column, test_column, label_name: str, column_type: str, with_display: bool,
+ dataset_names: t.Optional[t.Tuple[str, str]]) -> CheckResult:
+
+ drift_score, method, display = calc_drift_and_plot(
+ train_column=pd.Series(train_column),
+ test_column=pd.Series(test_column),
+ value_name=label_name,
+ column_type=column_type,
+ margin_quantile_filter=self.margin_quantile_filter,
+ max_num_categories_for_drift=self.max_num_categories_for_drift,
+ min_category_size_ratio=self.min_category_size_ratio,
+ max_num_categories_for_display=self.max_num_categories_for_display,
+ show_categories_by=self.show_categories_by,
+ numerical_drift_method=self.numerical_drift_method,
+ categorical_drift_method=self.categorical_drift_method,
+ balance_classes=self.balance_classes,
+ ignore_na=self.ignore_na,
+ min_samples=self.min_samples,
+ raise_min_samples_error=True,
+ with_display=with_display,
+ dataset_names=dataset_names
+ )
+
+ values_dict = {'Drift score': drift_score, 'Method': method}
+
+ if with_display:
+ displays = ["""
+                The Drift score is a measure for the difference between two distributions, in this check -
+                the test and train distributions. The check shows the drift score
+                and distributions for the label. """,
+ get_drift_plot_sidenote(self.max_num_categories_for_display, self.show_categories_by),
+ display]
+ else:
+ displays = None
+
+ return CheckResult(value=values_dict, display=displays, header='Label Drift')
+
+ def add_condition_drift_score_less_than(self, max_allowed_drift_score: float = 0.15):
+ """
+ Add condition - require drift score to be less than the threshold.
+
+        The industry standard for the PSI limit is 0.2; scores above it are generally considered to indicate drift.
+ There are no common industry standards for other drift methods, such as Cramer's V,
+ Kolmogorov-Smirnov and Earth Mover's Distance.
+
+ Parameters
+ ----------
+ max_allowed_drift_score: float , default: 0.15
+            the max threshold for the drift score
+ Returns
+ -------
+ ConditionResult
+            False if the drift score is above the max threshold, True otherwise
+ """
+
+ def condition(result: t.Dict) -> ConditionResult:
+ drift_score = result['Drift score']
+ method = result['Method']
+
+ details = f'Label\'s drift score {method} is {format_number(drift_score)}'
+ category = ConditionCategory.FAIL if drift_score > max_allowed_drift_score else ConditionCategory.PASS
+ return ConditionResult(category, details)
+
+ return self.add_condition(f'Label drift score < {max_allowed_drift_score}', condition)
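For reviewers, a schematic of how a concrete check plugs into the new mixin. It mirrors the run_logic bodies changed above; MyLabelDrift is a hypothetical example, not part of this PR, and its __init__ would still need to set the attributes the mixin declares:

    from deepchecks import CheckResult
    from deepchecks.nlp import Context, TrainTestCheck
    from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract


    class MyLabelDrift(TrainTestCheck, LabelDriftAbstract):
        """Hypothetical check: subclasses only fetch and sample the labels."""

        def run_logic(self, context: Context) -> CheckResult:
            train = context.train.sample(self.n_samples, random_state=self.random_state)
            test = context.test.sample(self.n_samples, random_state=self.random_state)
            # Drift scoring, display building and the condition all live in the mixin.
            return self._calculate_label_drift(train.label, test.label, 'Label', 'categorical',
                                               context.with_display, (train.name, test.name))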
diff --git a/deepchecks/utils/abstracts/prediction_drift.py b/deepchecks/utils/abstracts/prediction_drift.py
index b97e77fbdc..8e2a7eded2 100644
--- a/deepchecks/utils/abstracts/prediction_drift.py
+++ b/deepchecks/utils/abstracts/prediction_drift.py
@@ -8,19 +8,42 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
-"""Module contains the Abstract cass for Prediction Drift checks."""
+"""The base abstract functionality for prediction drift checks."""
+import abc
+import typing as t
+
import numpy as np
import pandas as pd
-from deepchecks import CheckResult
+from deepchecks import CheckResult, ConditionCategory, ConditionResult
from deepchecks.utils.distribution.drift import calc_drift_and_plot, get_drift_plot_sidenote
+from deepchecks.utils.strings import format_number
+
+__all__ = ['PredictionDriftAbstract']
-class PredictionDriftAbstract:
- """Abstract class for prediction drift checks."""
+class PredictionDriftAbstract(abc.ABC):
+ """Base class for prediction drift checks."""
- def prediction_drift(self, train_prediction, test_prediction, model_classes, with_display,
- proba_drift, cat_plot) -> CheckResult:
+ drift_mode: str = 'auto'
+ margin_quantile_filter: float = 0.025
+    max_num_categories_for_drift: t.Optional[int] = None
+ min_category_size_ratio: float = 0.01
+ max_num_categories_for_display: int = 10
+ show_categories_by: str = 'largest_difference'
+ numerical_drift_method: str = 'KS'
+ categorical_drift_method: str = 'cramers_v'
+ balance_classes: bool = False
+ ignore_na: bool = True
+ aggregation_method: t.Optional[str] = 'max'
+ max_classes_to_display: int = 3
+ min_samples: t.Optional[int] = 10
+ n_samples: int = 100_000
+ random_state: int = 42
+ add_condition: t.Callable[..., t.Any]
+
+ def _prediction_drift(self, train_prediction, test_prediction, model_classes, with_display,
+ proba_drift, cat_plot) -> CheckResult:
"""Calculate prediction drift.
Args:
@@ -116,3 +139,43 @@ def prediction_drift(self, train_prediction, test_prediction, model_classes, wit
'Method': method, 'Samples per class': samples_per_class}
return CheckResult(value=values_dict, display=displays, header='Prediction Drift')
+
+ def add_condition_drift_score_less_than(self, max_allowed_drift_score: float = 0.15):
+ """
+ Add condition - require drift score to be less than the threshold.
+
+        The industry standard for the PSI limit is 0.2; scores above it are generally considered to indicate drift.
+ There are no common industry standards for other drift methods, such as Cramer's V,
+ Kolmogorov-Smirnov and Earth Mover's Distance.
+
+ Parameters
+ ----------
+ max_allowed_drift_score: float , default: 0.15
+            the max threshold for the drift score
+ Returns
+ -------
+ ConditionResult
+            False if any class has passed the max threshold, True otherwise
+ """
+
+ def condition(result: t.Dict) -> ConditionResult:
+ drift_score_dict = result['Drift score']
+ # Move to dict for easier looping
+ if not isinstance(drift_score_dict, dict):
+ drift_score_dict = {0: drift_score_dict}
+ method = result['Method']
+ has_failed = {}
+ drift_score = 0
+ for class_name, drift_score in drift_score_dict.items():
+ has_failed[class_name] = drift_score > max_allowed_drift_score
+
+ if len(has_failed) == 1:
+ details = f'Found model prediction {method} drift score of {format_number(drift_score)}'
+ else:
+ details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \
+ f' score above threshold: {max_allowed_drift_score}.'
+
+ category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS
+ return ConditionResult(category, details)
+
+ return self.add_condition(f'Prediction drift score < {max_allowed_drift_score}', condition)
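A hedged before/after of the condition API this PR unifies (train_ds, test_ds and model are placeholder fixtures; the single-threshold form is the one the updated tests below assert):

    from deepchecks.tabular.checks import PredictionDrift

    # Before: two thresholds chosen by method family:
    #   .add_condition_drift_score_less_than(max_allowed_categorical_score=0.15,
    #                                        max_allowed_numeric_score=0.15)
    # After: one threshold; for probability drift it is applied per class and
    # the condition fails if any class exceeds it.
    check = PredictionDrift(drift_mode='proba') \
        .add_condition_drift_score_less_than(max_allowed_drift_score=0.15)
    result = check.run(train_ds, test_ds, model)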
diff --git a/tests/conftest.py b/tests/conftest.py
index e8013966fe..c61bb17eab 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,10 +10,12 @@
#
"""Represents fixtures for unit testing using pytest."""
import logging
+import random
# pylint: skip-file
from typing import Tuple
import matplotlib.pyplot as plt
+import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import load_diabetes, load_iris
@@ -30,6 +32,12 @@
set_verbosity(logging.WARNING)
+@pytest.fixture(scope='function')
+def set_numpy_seed():
+ np.random.seed(42)
+ random.seed(42)
+
+
@pytest.fixture(scope='session')
def multi_index_dataframe():
"""Return a multi-indexed DataFrame."""
diff --git a/tests/nlp/checks/model_evaluation/prediction_drift_test.py b/tests/nlp/checks/model_evaluation/prediction_drift_test.py
index 1e8a24369f..7c28711bb1 100644
--- a/tests/nlp/checks/model_evaluation/prediction_drift_test.py
+++ b/tests/nlp/checks/model_evaluation/prediction_drift_test.py
@@ -31,7 +31,7 @@ def test_tweet_emotion(tweet_emotion_train_test_textdata, tweet_emotion_train_te
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details="Found model prediction Cramer's V drift score of 0.04",
- name='categorical drift score < 0.01 and numerical drift score < 0.15')
+ name='Prediction drift score < 0.01')
))
assert_that(result.value['Drift score'], close_to(0.04, 0.01))
@@ -50,7 +50,7 @@ def test_tweet_emotion_no_drift(tweet_emotion_train_test_textdata, tweet_emotion
assert_that(condition_result, has_items(
equal_condition_result(is_pass=True,
details="Found model prediction Cramer's V drift score of 0",
- name='categorical drift score < 0.15 and numerical drift score < 0.15')
+ name='Prediction drift score < 0.15')
))
assert_that(result.value['Drift score'], equal_to(0))
@@ -71,7 +71,7 @@ def test_tweet_emotion_no_drift_no_label(tweet_emotion_train_test_textdata, twee
assert_that(condition_result, has_items(
equal_condition_result(is_pass=True,
details="Found model prediction Cramer's V drift score of 0",
- name='categorical drift score < 0.15 and numerical drift score < 0.15')
+ name='Prediction drift score < 0.15')
))
assert_that(result.value['Drift score'], equal_to(0))
diff --git a/tests/nlp/checks/train_test_validation/label_drift_test.py b/tests/nlp/checks/train_test_validation/label_drift_test.py
index aaa6a006a9..29e6acf9d5 100644
--- a/tests/nlp/checks/train_test_validation/label_drift_test.py
+++ b/tests/nlp/checks/train_test_validation/label_drift_test.py
@@ -28,7 +28,7 @@ def test_tweet_emotion(tweet_emotion_train_test_textdata):
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details="Label's drift score Cramer's V is 0.22",
- name='categorical drift score < 0.1 and numerical drift score < 0.15 for label drift')
+ name='Label drift score < 0.1')
))
assert_that(result.value['Drift score'], close_to(0.23, 0.01))
@@ -46,7 +46,26 @@ def test_tweet_emotion_no_drift(tweet_emotion_train_test_textdata):
assert_that(condition_result, has_items(
equal_condition_result(is_pass=True,
details="Label's drift score Cramer's V is 0",
- name='categorical drift score < 0.15 and numerical drift score < 0.15 for label drift')
+ name='Label drift score < 0.15')
))
assert_that(result.value['Drift score'], close_to(0, 0.01))
+
+
+def test_multi_label_without_drift(dummy_multilabel_textdata_train_test):
+ # Arrange
+ train, test = dummy_multilabel_textdata_train_test
+ check = LabelDrift(min_samples=20).add_condition_drift_score_less_than()
+ # Act
+ result = check.run(train_dataset=train, test_dataset=test)
+ condition_result = check.conditions_decision(result)
+
+ # Assert
+ assert_that(condition_result, has_items(
+ equal_condition_result(is_pass=True,
+ details="Label's drift score Cramer's V is 0",
+ name='Label drift score < 0.15')
+ ))
+ assert_that(result.value['Drift score'], close_to(0, 0.01))
diff --git a/tests/nlp/checks/train_test_validation/property_drift_test.py b/tests/nlp/checks/train_test_validation/property_drift_test.py
index fdffa13f4a..8d820b77bb 100644
--- a/tests/nlp/checks/train_test_validation/property_drift_test.py
+++ b/tests/nlp/checks/train_test_validation/property_drift_test.py
@@ -15,6 +15,7 @@
from deepchecks.nlp.checks import PropertyDrift
from deepchecks.nlp.text_data import TextData
+from tests.base.utils import equal_condition_result
class TestTextClassification:
@@ -31,10 +32,10 @@ def test_without_drift(self, tweet_emotion_train_test_textdata):
assert condition_results[0].is_pass() is True
assert_that(result.value, has_entries({
- "Formality": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov","Importance": None},
+ "Formality": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None},
"Language": {"Drift score": 0.0, "Method": "Cramer's V", "Importance": None},
"Subjectivity": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None},
- "Average Word Length": {"Drift score": 0.0,"Method": "Kolmogorov-Smirnov", "Importance": None},
+ "Average Word Length": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None},
"Text Length": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None},
"Max Word Length": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None},
"Toxicity": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None},
@@ -43,7 +44,6 @@ def test_without_drift(self, tweet_emotion_train_test_textdata):
"Fluency": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None},
})) # type: ignore
-
def test_with_drift(self, tweet_emotion_train_test_textdata):
# Arrange
train, test = tweet_emotion_train_test_textdata
@@ -147,9 +147,9 @@ def test_with_drift(self, small_wikiann: t.Tuple[TextData, TextData]):
class TestMultiLabelClassification:
- def test_without_drift(self, dummy_multilabel_dataset: TextData):
+ def test_without_drift(self, dummy_multilabel_textdata_train_test):
# Arrange
- train = dummy_multilabel_dataset
+ train, _ = dummy_multilabel_textdata_train_test
train.calculate_default_properties()
check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than()
# Act
@@ -166,4 +166,23 @@ def test_without_drift(self, dummy_multilabel_dataset: TextData):
'Average Word Length': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}),
'Subjectivity': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}),
'Max Word Length': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'})
- })) # type: ignore
\ No newline at end of file
+ })) # type: ignore
+
+ def test_with_drift(self, dummy_multilabel_textdata_train_test):
+ # Arrange
+ train, test = dummy_multilabel_textdata_train_test
+ train.calculate_default_properties()
+ test.calculate_default_properties()
+ check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3,
+ max_allowed_categorical_score=0.3)
+ # Act
+ result = check.run(train_dataset=train, test_dataset=test)
+ condition_results = check.conditions_decision(result)
+
+ assert_that(condition_results, has_items(
+ equal_condition_result(is_pass=False,
+ details="Failed for 1 out of 6 columns.\nFound 1 "
+ "numeric columns with Kolmogorov-Smirnov above threshold: "
+ "{'Text Length': '0.33'}",
+ name='categorical drift score < 0.3 and numerical drift score < 0.3')
+ ))
diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py
index e663faf0f8..319b74fb34 100644
--- a/tests/nlp/conftest.py
+++ b/tests/nlp/conftest.py
@@ -49,7 +49,6 @@ def tweet_emotion_train_test_probabilities():
return tweet_emotion.load_precalculated_predictions(pred_format='probabilities', as_train_test=True)
-
@pytest.fixture(scope='function')
def text_classification_string_class_dataset_mock():
"""Mock for a text classification dataset with string labels"""
@@ -67,18 +66,13 @@ def text_multilabel_classification_dataset_mock():
@pytest.fixture(scope='function')
-def dummy_multilabel_dataset():
- return TextData(
- raw_text=[
- random.choice(['I think therefore I am', 'I am therefore I think', 'I am'])
- for _ in range(20)
- ],
- label=[
- random.choice([[0, 0, 1], [1, 1, 0], [0, 1, 0]])
- for _ in range(20)
- ],
- task_type='text_classification'
- )
+def dummy_multilabel_textdata_train_test(set_numpy_seed):
+ """Dummy multilabel text classification dataset"""
+ raw_text = [random.choice(['I think therefore I am', 'I am therefore I think', 'I am']) for _ in range(40)] + \
+ ['bla'] * 10
+ label = [random.choice([[0, 0, 1], [1, 1, 0], [0, 1, 0]]) for _ in range(40)] + [[1, 0, 0]] * 10
+ text_data = TextData(raw_text=raw_text, label=label, task_type='text_classification')
+ return text_data.copy(rows_to_use=list(range(20))), text_data.copy(rows_to_use=list(range(20, 50)))
def download_nltk_resources():
@@ -118,15 +112,17 @@ def movie_reviews_data_negative():
neg_data = TextData(random.choices(neg_sentences, k=1000), name='Negative')
return neg_data
+
def _tokenize_raw_text(raw_text):
"""Tokenize raw text"""
return [x.split() for x in raw_text]
+
@pytest.fixture(scope='session')
def text_token_classification_dataset_mock():
"""Mock for a token classification dataset"""
return TextData(tokenized_text=_tokenize_raw_text(['Mary had a little lamb', 'Mary lives in London and Paris',
- 'How much wood can a wood chuck chuck?']),
+ 'How much wood can a wood chuck chuck?']),
label=[['B-PER', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'],
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']],
task_type='token_classification')
@@ -179,5 +175,3 @@ def _wikiann_to_text_data(wikiann):
],
task_type='token_classification'
)
-
-
diff --git a/tests/tabular/checks/model_evaluation/prediction_drift_test.py b/tests/tabular/checks/model_evaluation/prediction_drift_test.py
index 5a3144fe45..add68eedb6 100644
--- a/tests/tabular/checks/model_evaluation/prediction_drift_test.py
+++ b/tests/tabular/checks/model_evaluation/prediction_drift_test.py
@@ -161,7 +161,7 @@ def test_drift_max_drift_score_condition_fail_psi(drifted_data_and_model):
# Assert
assert_that(condition_result, equal_condition_result(
is_pass=False,
- name='categorical drift score < 0.15 and numerical drift score < 0.15',
+ name='Prediction drift score < 0.15',
details='Found model prediction PSI drift score of 0.79'
))
@@ -215,8 +215,7 @@ def test_drift_max_drift_score_condition_pass_threshold(drifted_data_and_model):
train = remove_label(train)
test = remove_label(test)
check = PredictionDrift(categorical_drift_method='PSI', drift_mode='prediction') \
- .add_condition_drift_score_less_than(max_allowed_categorical_score=1,
- max_allowed_numeric_score=1)
+ .add_condition_drift_score_less_than(max_allowed_drift_score=1)
# Act
result = check.run(train, test, model)
@@ -226,7 +225,7 @@ def test_drift_max_drift_score_condition_pass_threshold(drifted_data_and_model):
assert_that(condition_result, equal_condition_result(
is_pass=True,
details='Found model prediction PSI drift score of 0.79',
- name='categorical drift score < 1 and numerical drift score < 1'
+ name='Prediction drift score < 1'
))
@@ -271,7 +270,7 @@ def test_binary_proba_condition_fail_threshold(drifted_data_and_model):
assert_that(condition_result, equal_condition_result(
is_pass=False,
- name='categorical drift score < 0.15 and numerical drift score < 0.15',
+ name='Prediction drift score < 0.15',
details='Found model prediction Earth Mover\'s Distance drift score of 0.23'
))
@@ -284,7 +283,7 @@ def test_multiclass_proba_reduce_aggregations(iris_split_dataset_and_model_rf):
check = PredictionDrift(categorical_drift_method='PSI', numerical_drift_method='EMD',
max_num_categories=10, min_category_size_ratio=0,
drift_mode='proba', aggregation_method='weighted'
- ).add_condition_drift_score_less_than(max_allowed_numeric_score=0.05)
+ ).add_condition_drift_score_less_than(max_allowed_drift_score=0.05)
# Act
result = check.run(train, test, model)
@@ -315,7 +314,7 @@ def test_multiclass_proba_reduce_aggregations(iris_split_dataset_and_model_rf):
assert_that(condition_result, equal_condition_result(
is_pass=False,
- name='categorical drift score < 0.15 and numerical drift score < 0.05',
+ name='Prediction drift score < 0.05',
details='Found 2 classes with model predicted probability Earth Mover\'s '
'Distance drift score above threshold: 0.05.'
))
diff --git a/tests/tabular/checks/train_test_validation/label_drift_test.py b/tests/tabular/checks/train_test_validation/label_drift_test.py
index 882aa1f017..3e48352a74 100644
--- a/tests/tabular/checks/train_test_validation/label_drift_test.py
+++ b/tests/tabular/checks/train_test_validation/label_drift_test.py
@@ -166,7 +166,7 @@ def test_drift_max_drift_score_condition_fail_psi(drifted_classification_label):
# Assert
assert_that(condition_result, equal_condition_result(
is_pass=False,
- name='categorical drift score < 0.15 and numerical drift score < 0.15 for label drift',
+ name='Label drift score < 0.15',
details='Label\'s drift score PSI is 0.24'
))
@@ -184,7 +184,7 @@ def test_drift_max_drift_score_condition_fail_emd(drifted_regression_label):
assert_that(condition_result, equal_condition_result(
is_pass=False,
category=ConditionCategory.FAIL,
- name='categorical drift score < 0.15 and numerical drift score < 0.15 for label drift',
+ name='Label drift score < 0.15',
details='Label\'s drift score Earth Mover\'s Distance is 0.34'
))
@@ -193,8 +193,7 @@ def test_drift_max_drift_score_condition_pass_threshold(non_drifted_classificati
# Arrange
train, test = non_drifted_classification_label
check = LabelDrift(categorical_drift_method='PSI') \
- .add_condition_drift_score_less_than(max_allowed_categorical_score=1,
- max_allowed_numeric_score=1)
+ .add_condition_drift_score_less_than(max_allowed_drift_score=1)
# Act
result = check.run(train, test)
@@ -204,5 +203,5 @@ def test_drift_max_drift_score_condition_pass_threshold(non_drifted_classificati
assert_that(condition_result, equal_condition_result(
is_pass=True,
details='Label\'s drift score PSI is 3.37E-3',
- name='categorical drift score < 1 and numerical drift score < 1 for label drift'
+ name='Label drift score < 1'
))