diff --git a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py index c14bc2bca1..4c797ea56e 100644 --- a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py +++ b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py @@ -10,16 +10,12 @@ # """Module contains Prediction Drift check.""" -from typing import Dict - import numpy as np -from deepchecks.core import CheckResult, ConditionCategory, ConditionResult +from deepchecks.core import CheckResult from deepchecks.core.errors import DeepchecksValueError from deepchecks.nlp import Context, TrainTestCheck from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract -from deepchecks.utils.distribution.drift import SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS -from deepchecks.utils.strings import format_number __all__ = ['PredictionDrift'] @@ -162,53 +158,5 @@ def run_logic(self, context: Context) -> CheckResult: train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1)) test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1)) - return self.prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display, - proba_drift, not proba_drift) - - def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15, - max_allowed_numeric_score: float = 0.15): - """ - Add condition - require drift score to be less than a certain threshold. - - The industry standard for PSI limit is above 0.2. - There are no common industry standards for other drift methods, such as Cramer's V, - Kolmogorov-Smirnov and Earth Mover's Distance. - The threshold was lowered by 25% compared to property drift defaults due to the higher importance of prediction - drift. - - Parameters - ---------- - max_allowed_categorical_score: float , default: 0.15 - the max threshold for the categorical variable drift score - max_allowed_numeric_score: float , default: 0.15 - the max threshold for the numeric variable drift score - Returns - ------- - ConditionResult - False if any distribution has passed the max threshold, True otherwise - """ - - def condition(result: Dict) -> ConditionResult: - drift_score_dict = result['Drift score'] - # Move to dict for easier looping - if not isinstance(drift_score_dict, dict): - drift_score_dict = {0: drift_score_dict} - method = result['Method'] - has_failed = {} - drift_score = 0 - for class_name, drift_score in drift_score_dict.items(): - has_failed[class_name] = \ - (drift_score >= max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \ - (drift_score >= max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS) - - if len(has_failed) == 1: - details = f'Found model prediction {method} drift score of {format_number(drift_score)}' - else: - details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \ - f' score above threshold: {max_allowed_numeric_score}.' 
- category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS - return ConditionResult(category, details) - - return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and ' - f'numerical drift score < {max_allowed_numeric_score}', - condition) + return self._prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display, + proba_drift, not proba_drift) diff --git a/deepchecks/nlp/checks/train_test_validation/label_drift.py b/deepchecks/nlp/checks/train_test_validation/label_drift.py index b9f062b37e..98ed014215 100644 --- a/deepchecks/nlp/checks/train_test_validation/label_drift.py +++ b/deepchecks/nlp/checks/train_test_validation/label_drift.py @@ -8,24 +8,17 @@ # along with Deepchecks. If not, see . # ---------------------------------------------------------------------------- # -"""Module contains Label Drift check.""" - -from typing import Dict -import pandas as pd +"""Module contains Label Drift check.""" -from deepchecks.core import CheckResult, ConditionResult -from deepchecks.core.condition import ConditionCategory -from deepchecks.core.errors import DeepchecksValueError +from deepchecks.core import CheckResult from deepchecks.nlp import Context, TrainTestCheck -from deepchecks.utils.distribution.drift import (SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS, - calc_drift_and_plot, get_drift_plot_sidenote) -from deepchecks.utils.strings import format_number +from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract __all__ = ['LabelDrift'] -class LabelDrift(TrainTestCheck): +class LabelDrift(TrainTestCheck, LabelDriftAbstract): """ Calculate label drift between train dataset and test dataset, using statistical measures. @@ -41,15 +34,17 @@ class LabelDrift(TrainTestCheck): small number of samples (common practice is categories with less than 5 samples). However, in cases of a variable with many categories with few samples, it is still recommended to use Cramer's V. + **Note:** In case of highly imbalanced classes, it is recommended to use Cramer's V, together with setting + the ``balance_classes`` parameter to ``True``. Parameters ---------- min_category_size_ratio: float, default 0.01 minimum size ratio for categories. Categories with size ratio lower than this number are binned - into an "Other" category. + into an "Other" category. Ignored if balance_classes=True. max_num_categories_for_drift: int, default: None - Max number of allowed categories. If there are more, - they are binned into an "Other" category. This limit applies for both drift calculation and distribution plots + Only for classification. Max number of allowed categories. If there are more, + they are binned into an "Other" category. max_num_categories_for_display: int, default: 10 Max number of categories to show in plot. show_categories_by: str, default: 'largest_difference' @@ -58,23 +53,24 @@ class LabelDrift(TrainTestCheck): - 'train_largest': Show the largest train categories. - 'test_largest': Show the largest test categories. - 'largest_difference': Show the largest difference between categories. - numerical_drift_method: str, default: "KS" - decides which method to use on numerical variables. Possible values are: - "EMD" for Earth Mover's Distance (EMD), "KS" for Kolmogorov-Smirnov (KS). categorical_drift_method: str, default: "cramers_v" decides which method to use on categorical variables. Possible values are: "cramers_v" for Cramer's V, "PSI" for Population Stability Index (PSI). 
balance_classes: bool, default: False If True, all categories will have an equal weight in the Cramer's V score. This is useful when the categorical variable is highly imbalanced, and we want to be alerted on changes in proportion to the category size, - and not only to the entire dataset. Must have categorical_drift_method = "cramers_v" and - drift_mode = "auto" or "prediction". + and not only to the entire dataset. Must have categorical_drift_method = "cramers_v". If True, the variable frequency plot will be created with a log scale in the y-axis. - ignore_na: bool, default True + ignore_na: bool, default False For categorical columns only. If True, ignores nones for categorical drift. If False, considers none as a separate category. For numerical columns we always ignore nones. + min_samples : int , default: 10 + Minimum number of samples required to calculate the drift score. If there are not enough samples for either + train or test, the check will raise a ``NotEnoughSamplesError`` exception. n_samples : int , default: 100_000 Number of samples to use for drift computation and plot. + random_state : int , default: 42 + Random seed for sampling. """ def __init__( @@ -83,29 +79,30 @@ def __init__( min_category_size_ratio: float = 0.01, max_num_categories_for_display: int = 10, show_categories_by: str = 'largest_difference', - numerical_drift_method: str = 'KS', categorical_drift_method: str = 'cramers_v', balance_classes: bool = False, - ignore_na: bool = True, + ignore_na: bool = False, + min_samples: int = 10, n_samples: int = 100_000, + random_state: int = 42, **kwargs ): - if show_categories_by not in ('train_largest', 'test_largest', 'largest_difference'): - raise DeepchecksValueError( - 'show_categories_by must be one of "train_largest", "test_largest", "largest_difference"') super().__init__(**kwargs) + # self.margin_quantile_filter = margin_quantile_filter self.max_num_categories_for_drift = max_num_categories_for_drift self.min_category_size_ratio = min_category_size_ratio self.max_num_categories_for_display = max_num_categories_for_display self.show_categories_by = show_categories_by - self.numerical_drift_method = numerical_drift_method + # self.numerical_drift_method = numerical_drift_method self.categorical_drift_method = categorical_drift_method self.balance_classes = balance_classes self.ignore_na = ignore_na + self.min_samples = min_samples self.n_samples = n_samples + self.random_state = random_state def run_logic(self, context: Context) -> CheckResult: - """Calculate drift for the label. + """Calculate drift for all columns. Returns ------- @@ -113,74 +110,8 @@ def run_logic(self, context: Context) -> CheckResult: value: drift score. display: label distribution graph, comparing the train and test distributions. 
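For orientation, here is a minimal sketch of how the refactored NLP LabelDrift is exercised with the new single-threshold condition; train_data and test_data stand for any TextData pair (as in the fixtures further down), and the printed value is illustrative only:

from deepchecks.nlp.checks import LabelDrift

check = LabelDrift(
    categorical_drift_method='cramers_v',
    balance_classes=True,   # per the note above, recommended when classes are highly imbalanced
    min_samples=10,
    random_state=42,
).add_condition_drift_score_less_than(max_allowed_drift_score=0.15)

result = check.run(train_dataset=train_data, test_dataset=test_data)
print(result.value)  # e.g. {'Drift score': 0.22, 'Method': "Cramer's V"}
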
""" - context.raise_if_token_classification_task(self) - context.raise_if_multi_label_task(self) - - train_dataset = context.train.sample(self.n_samples, random_state=context.random_state) - test_dataset = context.test.sample(self.n_samples, random_state=context.random_state) - - drift_score, method, display = calc_drift_and_plot( - train_column=pd.Series(train_dataset.label), - test_column=pd.Series(test_dataset.label), - value_name='Label', - column_type='categorical', - max_num_categories_for_drift=self.max_num_categories_for_drift, - min_category_size_ratio=self.min_category_size_ratio, - max_num_categories_for_display=self.max_num_categories_for_display, - show_categories_by=self.show_categories_by, - numerical_drift_method=self.numerical_drift_method, - categorical_drift_method=self.categorical_drift_method, - balance_classes=self.balance_classes, - ignore_na=self.ignore_na, - with_display=context.with_display, - dataset_names=(train_dataset.name, test_dataset.name) - ) - - values_dict = {'Drift score': drift_score, 'Method': method} - - if context.with_display: - displays = [""" - The Drift score is a measure for the difference between two distributions, in this check - the test - and train distributions.
The check shows the drift score and distributions for the label.<br> -
""", get_drift_plot_sidenote(self.max_num_categories_for_display, self.show_categories_by), display] - else: - displays = None - - return CheckResult(value=values_dict, display=displays, header='Train Test Label Drift') - - def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15, - max_allowed_numeric_score: float = 0.15): - """ - Add condition - require drift score to be less than the threshold. - - The industry standard for PSI limit is above 0.2. - There are no common industry standards for other drift methods, such as Cramer's V, - Kolmogorov-Smirnov and Earth Mover's Distance. - The threshold was lowered by 25% compared to property drift defaults due to the higher importance of prediction - drift. - - Parameters - ---------- - max_allowed_categorical_score: float , default: 0.2 - the max threshold for the categorical variable drift score - max_allowed_numeric_score: float , default: 0.15 - the max threshold for the numeric variable drift score - Returns - ------- - ConditionResult - False if any column has passed the max threshold, True otherwise - """ - - def condition(result: Dict) -> ConditionResult: - drift_score = result['Drift score'] - method = result['Method'] - has_failed = (drift_score > max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \ - (drift_score > max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS) - - details = f'Label\'s drift score {method} is {format_number(drift_score)}' - category = ConditionCategory.FAIL if has_failed else ConditionCategory.PASS - return ConditionResult(category, details) + train_dataset = context.train.sample(self.n_samples, random_state=self.random_state) + test_dataset = context.test.sample(self.n_samples, random_state=self.random_state) - return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and ' - f'numerical drift score < {max_allowed_numeric_score} for label drift', - condition) + return self._calculate_label_drift(train_dataset.label.flatten(), test_dataset.label.flatten(), 'Label', + 'categorical', context.with_display, (train_dataset.name, test_dataset.name)) diff --git a/deepchecks/tabular/checks/model_evaluation/prediction_drift.py b/deepchecks/tabular/checks/model_evaluation/prediction_drift.py index f8670c440a..c13e8bbf58 100644 --- a/deepchecks/tabular/checks/model_evaluation/prediction_drift.py +++ b/deepchecks/tabular/checks/model_evaluation/prediction_drift.py @@ -15,14 +15,12 @@ import numpy as np -from deepchecks.core import CheckResult, ConditionCategory, ConditionResult +from deepchecks.core import CheckResult from deepchecks.core.errors import DeepchecksValueError from deepchecks.core.reduce_classes import ReduceMixin from deepchecks.tabular import Context, TrainTestCheck from deepchecks.tabular.utils.task_type import TaskType from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract -from deepchecks.utils.distribution.drift import SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS -from deepchecks.utils.strings import format_number __all__ = ['PredictionDrift'] @@ -193,8 +191,8 @@ def run_logic(self, context: Context) -> CheckResult: train_pred = np.array(model.predict(train_dataset.features_columns)).reshape((-1, 1)) test_pred = np.array(model.predict(test_dataset.features_columns)).reshape((-1, 1)) - return self.prediction_drift(train_pred, test_pred, context.model_classes, context.with_display, proba_drift, - (context.task_type != TaskType.REGRESSION) and (not proba_drift)) + 
return self._prediction_drift(train_pred, test_pred, context.model_classes, context.with_display, proba_drift, + (context.task_type != TaskType.REGRESSION) and (not proba_drift)) def reduce_output(self, check_result: CheckResult) -> t.Dict[str, float]: """Return prediction drift score.""" @@ -217,51 +215,3 @@ def reduce_output(self, check_result: CheckResult) -> t.Dict[str, float]: def greater_is_better(self): """Return True if the check reduce_output is better when it is greater.""" return False - - def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15, - max_allowed_numeric_score: float = 0.15): - """ - Add condition - require drift score to be less than a certain threshold. - - The industry standard for PSI limit is above 0.2. - There are no common industry standards for other drift methods, such as Cramer's V, - Kolmogorov-Smirnov and Earth Mover's Distance. - The threshold was lowered by 25% compared to feature drift defaults due to the higher importance of prediction - drift. - - Parameters - ---------- - max_allowed_categorical_score: float , default: 0.15 - the max threshold for the categorical variable drift score - max_allowed_numeric_score: float , default: 0.15 - the max threshold for the numeric variable drift score - Returns - ------- - ConditionResult - False if any column has passed the max threshold, True otherwise - """ - - def condition(result: t.Dict) -> ConditionResult: - drift_score_dict = result['Drift score'] - # Move to dict for easier looping - if not isinstance(drift_score_dict, dict): - drift_score_dict = {0: drift_score_dict} - method = result['Method'] - has_failed = {} - drift_score = 0 - for class_name, drift_score in drift_score_dict.items(): - has_failed[class_name] = \ - (drift_score >= max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \ - (drift_score >= max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS) - - if len(has_failed) == 1: - details = f'Found model prediction {method} drift score of {format_number(drift_score)}' - else: - details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \ - f' score above threshold: {max_allowed_numeric_score}.' 
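For reference, a sketch of the tabular PredictionDrift after this change, run in probability-drift mode with the unified threshold; train_dataset, test_dataset and model are placeholders for the objects used in the tests below:

from deepchecks.tabular.checks import PredictionDrift

check = PredictionDrift(drift_mode='proba').add_condition_drift_score_less_than(max_allowed_drift_score=0.05)
result = check.run(train_dataset, test_dataset, model)
per_class_scores = result.value['Drift score']  # in proba mode this is a dict mapping class -> drift score
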
- category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS - return ConditionResult(category, details) - - return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and ' - f'numerical drift score < {max_allowed_numeric_score}', - condition) diff --git a/deepchecks/tabular/checks/train_test_validation/label_drift.py b/deepchecks/tabular/checks/train_test_validation/label_drift.py index b89de4e02c..0f475b8907 100644 --- a/deepchecks/tabular/checks/train_test_validation/label_drift.py +++ b/deepchecks/tabular/checks/train_test_validation/label_drift.py @@ -13,19 +13,16 @@ from typing import Dict -from deepchecks.core import CheckResult, ConditionResult -from deepchecks.core.condition import ConditionCategory +from deepchecks.core import CheckResult from deepchecks.core.reduce_classes import ReduceLabelMixin from deepchecks.tabular import Context, TrainTestCheck from deepchecks.tabular.utils.task_type import TaskType -from deepchecks.utils.distribution.drift import (SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS, - calc_drift_and_plot, get_drift_plot_sidenote) -from deepchecks.utils.strings import format_number +from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract __all__ = ['LabelDrift'] -class LabelDrift(TrainTestCheck, ReduceLabelMixin): +class LabelDrift(TrainTestCheck, LabelDriftAbstract, ReduceLabelMixin): """ Calculate label drift between train dataset and test dataset, using statistical measures. @@ -134,37 +131,10 @@ def run_logic(self, context: Context) -> CheckResult: train_dataset = context.train.sample(self.n_samples, random_state=self.random_state) test_dataset = context.test.sample(self.n_samples, random_state=self.random_state) - drift_score, method, display = calc_drift_and_plot( - train_column=train_dataset.label_col, - test_column=test_dataset.label_col, - value_name=train_dataset.label_name, - column_type='categorical' if context.task_type != TaskType.REGRESSION else 'numerical', - margin_quantile_filter=self.margin_quantile_filter, - max_num_categories_for_drift=self.max_num_categories_for_drift, - min_category_size_ratio=self.min_category_size_ratio, - max_num_categories_for_display=self.max_num_categories_for_display, - show_categories_by=self.show_categories_by, - numerical_drift_method=self.numerical_drift_method, - categorical_drift_method=self.categorical_drift_method, - balance_classes=self.balance_classes, - ignore_na=self.ignore_na, - min_samples=self.min_samples, - raise_min_samples_error=True, - with_display=context.with_display, - dataset_names=(train_dataset.name, test_dataset.name) - ) - - values_dict = {'Drift score': drift_score, 'Method': method} - - if context.with_display: - displays = [""" - The Drift score is a measure for the difference between two distributions, in this check - the test - and train distributions.
The check shows the drift score and distributions for the label.<br> -
""", get_drift_plot_sidenote(self.max_num_categories_for_display, self.show_categories_by), display] - else: - displays = None - - return CheckResult(value=values_dict, display=displays, header='Label Drift') + column_type = 'categorical' if context.task_type != TaskType.REGRESSION else 'numerical' + + return self._calculate_label_drift(train_dataset.label_col, test_dataset.label_col, train_dataset.label_name, + column_type, context.with_display, (train_dataset.name, test_dataset.name)) def reduce_output(self, check_result: CheckResult) -> Dict[str, float]: """Return label drift score.""" @@ -173,37 +143,3 @@ def reduce_output(self, check_result: CheckResult) -> Dict[str, float]: def greater_is_better(self): """Return True if the check reduce_output is better when it is greater.""" return False - - def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15, - max_allowed_numeric_score: float = 0.15): - """ - Add condition - require drift score to be less than the threshold. - - The industry standard for PSI limit is above 0.2. - There are no common industry standards for other drift methods, such as Cramer's V, - Kolmogorov-Smirnov and Earth Mover's Distance. - - Parameters - ---------- - max_allowed_categorical_score: float , default: 0.15 - the max threshold for the categorical variable drift score - max_allowed_numeric_score: float , default: 0.15 - the max threshold for the numeric variable drift score - Returns - ------- - ConditionResult - False if any column has passed the max threshold, True otherwise - """ - def condition(result: Dict) -> ConditionResult: - drift_score = result['Drift score'] - method = result['Method'] - has_failed = (drift_score > max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \ - (drift_score > max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS) - - details = f'Label\'s drift score {method} is {format_number(drift_score)}' - category = ConditionCategory.FAIL if has_failed else ConditionCategory.PASS - return ConditionResult(category, details) - - return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and ' - f'numerical drift score < {max_allowed_numeric_score} for label drift', - condition) diff --git a/deepchecks/utils/abstracts/label_drift.py b/deepchecks/utils/abstracts/label_drift.py new file mode 100644 index 0000000000..c362dd3f5f --- /dev/null +++ b/deepchecks/utils/abstracts/label_drift.py @@ -0,0 +1,104 @@ +# ---------------------------------------------------------------------------- +# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) +# +# This file is part of Deepchecks. +# Deepchecks is distributed under the terms of the GNU Affero General +# Public License (version 3 or later). +# You should have received a copy of the GNU Affero General Public License +# along with Deepchecks. If not, see . 
+# ---------------------------------------------------------------------------- +# +"""The base abstract functionality for label drift checks.""" +import abc +import typing as t + +import pandas as pd + +from deepchecks import CheckResult, ConditionCategory, ConditionResult +from deepchecks.utils.distribution.drift import calc_drift_and_plot, get_drift_plot_sidenote +from deepchecks.utils.strings import format_number + +__all__ = ['LabelDriftAbstract'] + + +class LabelDriftAbstract(abc.ABC): + """Base class for label drift checks.""" + + margin_quantile_filter: float = 0.025 + max_num_categories_for_drift: t.Optional[int] + min_category_size_ratio: float + max_num_categories_for_display: t.Optional[int] + show_categories_by: str + numerical_drift_method: str = 'KS' + categorical_drift_method: str + balance_classes: bool + ignore_na: bool + min_samples: int + n_samples: t.Optional[int] + random_state: int + add_condition: t.Callable[..., t.Any] + + def _calculate_label_drift(self, train_column, test_column, label_name: str, column_type: str, with_display: bool, + dataset_names: t.Optional[t.Tuple[str, str]]) -> CheckResult: + + drift_score, method, display = calc_drift_and_plot( + train_column=pd.Series(train_column), + test_column=pd.Series(test_column), + value_name=label_name, + column_type=column_type, + margin_quantile_filter=self.margin_quantile_filter, + max_num_categories_for_drift=self.max_num_categories_for_drift, + min_category_size_ratio=self.min_category_size_ratio, + max_num_categories_for_display=self.max_num_categories_for_display, + show_categories_by=self.show_categories_by, + numerical_drift_method=self.numerical_drift_method, + categorical_drift_method=self.categorical_drift_method, + balance_classes=self.balance_classes, + ignore_na=self.ignore_na, + min_samples=self.min_samples, + raise_min_samples_error=True, + with_display=with_display, + dataset_names=dataset_names + ) + + values_dict = {'Drift score': drift_score, 'Method': method} + + if with_display: + displays = [""" + The Drift score is a measure for the difference between two distributions, in this check - + the test and train distributions.
The check shows the drift score + and distributions for the label.<br>
""", + get_drift_plot_sidenote(self.max_num_categories_for_display, self.show_categories_by), + display] + else: + displays = None + + return CheckResult(value=values_dict, display=displays, header='Label Drift') + + def add_condition_drift_score_less_than(self, max_allowed_drift_score: float = 0.15): + """ + Add condition - require drift score to be less than the threshold. + + The industry standard for PSI limit is above 0.2. + There are no common industry standards for other drift methods, such as Cramer's V, + Kolmogorov-Smirnov and Earth Mover's Distance. + + Parameters + ---------- + max_allowed_drift_score: float , default: 0.15 + the max threshold for the categorical variable drift score + Returns + ------- + ConditionResult + False if any column has passed the max threshold, True otherwise + """ + + def condition(result: t.Dict) -> ConditionResult: + drift_score = result['Drift score'] + method = result['Method'] + + details = f'Label\'s drift score {method} is {format_number(drift_score)}' + category = ConditionCategory.FAIL if drift_score > max_allowed_drift_score else ConditionCategory.PASS + return ConditionResult(category, details) + + return self.add_condition(f'Label drift score < {max_allowed_drift_score}', condition) diff --git a/deepchecks/utils/abstracts/prediction_drift.py b/deepchecks/utils/abstracts/prediction_drift.py index b97e77fbdc..8e2a7eded2 100644 --- a/deepchecks/utils/abstracts/prediction_drift.py +++ b/deepchecks/utils/abstracts/prediction_drift.py @@ -8,19 +8,42 @@ # along with Deepchecks. If not, see . # ---------------------------------------------------------------------------- # -"""Module contains the Abstract cass for Prediction Drift checks.""" +"""The base abstract functionality for prediction drift checks.""" +import abc +import typing as t + import numpy as np import pandas as pd -from deepchecks import CheckResult +from deepchecks import CheckResult, ConditionCategory, ConditionResult from deepchecks.utils.distribution.drift import calc_drift_and_plot, get_drift_plot_sidenote +from deepchecks.utils.strings import format_number + +__all__ = ['PredictionDriftAbstract'] -class PredictionDriftAbstract: - """Abstract class for prediction drift checks.""" +class PredictionDriftAbstract(abc.ABC): + """Base class for prediction drift checks.""" - def prediction_drift(self, train_prediction, test_prediction, model_classes, with_display, - proba_drift, cat_plot) -> CheckResult: + drift_mode: str = 'auto' + margin_quantile_filter: float = 0.025 + max_num_categories_for_drift: int = None + min_category_size_ratio: float = 0.01 + max_num_categories_for_display: int = 10 + show_categories_by: str = 'largest_difference' + numerical_drift_method: str = 'KS' + categorical_drift_method: str = 'cramers_v' + balance_classes: bool = False + ignore_na: bool = True + aggregation_method: t.Optional[str] = 'max' + max_classes_to_display: int = 3 + min_samples: t.Optional[int] = 10 + n_samples: int = 100_000 + random_state: int = 42 + add_condition: t.Callable[..., t.Any] + + def _prediction_drift(self, train_prediction, test_prediction, model_classes, with_display, + proba_drift, cat_plot) -> CheckResult: """Calculate prediction drift. 
Args: @@ -116,3 +139,43 @@ def prediction_drift(self, train_prediction, test_prediction, model_classes, wit 'Method': method, 'Samples per class': samples_per_class} return CheckResult(value=values_dict, display=displays, header='Prediction Drift') + + def add_condition_drift_score_less_than(self, max_allowed_drift_score: float = 0.15): + """ + Add condition - require drift score to be less than the threshold. + + The industry standard for PSI limit is above 0.2. + There are no common industry standards for other drift methods, such as Cramer's V, + Kolmogorov-Smirnov and Earth Mover's Distance. + + Parameters + ---------- + max_allowed_drift_score: float , default: 0.15 + the max threshold for the categorical variable drift score + Returns + ------- + ConditionResult + False if any column has passed the max threshold, True otherwise + """ + + def condition(result: t.Dict) -> ConditionResult: + drift_score_dict = result['Drift score'] + # Move to dict for easier looping + if not isinstance(drift_score_dict, dict): + drift_score_dict = {0: drift_score_dict} + method = result['Method'] + has_failed = {} + drift_score = 0 + for class_name, drift_score in drift_score_dict.items(): + has_failed[class_name] = drift_score > max_allowed_drift_score + + if len(has_failed) == 1: + details = f'Found model prediction {method} drift score of {format_number(drift_score)}' + else: + details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \ + f' score above threshold: {max_allowed_drift_score}.' + + category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS + return ConditionResult(category, details) + + return self.add_condition(f'Prediction drift score < {max_allowed_drift_score}', condition) diff --git a/tests/conftest.py b/tests/conftest.py index e8013966fe..c61bb17eab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,10 +10,12 @@ # """Represents fixtures for unit testing using pytest.""" import logging +import random # pylint: skip-file from typing import Tuple import matplotlib.pyplot as plt +import numpy as np import pandas as pd import pytest from sklearn.datasets import load_diabetes, load_iris @@ -30,6 +32,12 @@ set_verbosity(logging.WARNING) +@pytest.fixture(scope='function') +def set_numpy_seed(): + np.random.seed(42) + random.seed(42) + + @pytest.fixture(scope='session') def multi_index_dataframe(): """Return a multi-indexed DataFrame.""" diff --git a/tests/nlp/checks/model_evaluation/prediction_drift_test.py b/tests/nlp/checks/model_evaluation/prediction_drift_test.py index 1e8a24369f..7c28711bb1 100644 --- a/tests/nlp/checks/model_evaluation/prediction_drift_test.py +++ b/tests/nlp/checks/model_evaluation/prediction_drift_test.py @@ -31,7 +31,7 @@ def test_tweet_emotion(tweet_emotion_train_test_textdata, tweet_emotion_train_te assert_that(condition_result, has_items( equal_condition_result(is_pass=False, details="Found model prediction Cramer's V drift score of 0.04", - name='categorical drift score < 0.01 and numerical drift score < 0.15') + name='Prediction drift score < 0.01') )) assert_that(result.value['Drift score'], close_to(0.04, 0.01)) @@ -50,7 +50,7 @@ def test_tweet_emotion_no_drift(tweet_emotion_train_test_textdata, tweet_emotion assert_that(condition_result, has_items( equal_condition_result(is_pass=True, details="Found model prediction Cramer's V drift score of 0", - name='categorical drift score < 0.15 and numerical drift score < 0.15') + name='Prediction drift score < 0.15') )) 
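To make the unified condition above concrete, a small worked example with hypothetical per-class scores (in proba mode the check value holds one score per class; otherwise a single float is wrapped the same way):

result_value = {'Drift score': {'cat': 0.02, 'dog': 0.31}, 'Method': "Earth Mover's Distance"}
max_allowed_drift_score = 0.15
has_failed = {cls: score > max_allowed_drift_score for cls, score in result_value['Drift score'].items()}
assert has_failed == {'cat': False, 'dog': True}  # any(...) is True, so the condition resolves to FAIL
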
assert_that(result.value['Drift score'], equal_to(0)) @@ -71,7 +71,7 @@ def test_tweet_emotion_no_drift_no_label(tweet_emotion_train_test_textdata, twee assert_that(condition_result, has_items( equal_condition_result(is_pass=True, details="Found model prediction Cramer's V drift score of 0", - name='categorical drift score < 0.15 and numerical drift score < 0.15') + name='Prediction drift score < 0.15') )) assert_that(result.value['Drift score'], equal_to(0)) diff --git a/tests/nlp/checks/train_test_validation/label_drift_test.py b/tests/nlp/checks/train_test_validation/label_drift_test.py index aaa6a006a9..29e6acf9d5 100644 --- a/tests/nlp/checks/train_test_validation/label_drift_test.py +++ b/tests/nlp/checks/train_test_validation/label_drift_test.py @@ -28,7 +28,7 @@ def test_tweet_emotion(tweet_emotion_train_test_textdata): assert_that(condition_result, has_items( equal_condition_result(is_pass=False, details="Label's drift score Cramer's V is 0.22", - name='categorical drift score < 0.1 and numerical drift score < 0.15 for label drift') + name='Label drift score < 0.1') )) assert_that(result.value['Drift score'], close_to(0.23, 0.01)) @@ -46,7 +46,26 @@ def test_tweet_emotion_no_drift(tweet_emotion_train_test_textdata): assert_that(condition_result, has_items( equal_condition_result(is_pass=True, details="Label's drift score Cramer's V is 0", - name='categorical drift score < 0.15 and numerical drift score < 0.15 for label drift') + name='Label drift score < 0.15') )) assert_that(result.value['Drift score'], close_to(0, 0.01)) + + +def test_multi_label_without_drift(dummy_multilabel_textdata_train_test): + # Arrange + train, test = dummy_multilabel_textdata_train_test + check = LabelDrift(min_samples=20).add_condition_drift_score_less_than() + # Act + result = check.run(train_dataset=train, test_dataset=test) + condition_result = check.conditions_decision(result) + + # Assert + assert_that(condition_result, has_items( + equal_condition_result(is_pass=True, + details="Label's drift score Cramer's V is 0", + name='Label drift score < 0.15') + )) + assert_that(result.value['Drift score'], close_to(0, 0.01)) + + diff --git a/tests/nlp/checks/train_test_validation/property_drift_test.py b/tests/nlp/checks/train_test_validation/property_drift_test.py index fdffa13f4a..8d820b77bb 100644 --- a/tests/nlp/checks/train_test_validation/property_drift_test.py +++ b/tests/nlp/checks/train_test_validation/property_drift_test.py @@ -15,6 +15,7 @@ from deepchecks.nlp.checks import PropertyDrift from deepchecks.nlp.text_data import TextData +from tests.base.utils import equal_condition_result class TestTextClassification: @@ -31,10 +32,10 @@ def test_without_drift(self, tweet_emotion_train_test_textdata): assert condition_results[0].is_pass() is True assert_that(result.value, has_entries({ - "Formality": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov","Importance": None}, + "Formality": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None}, "Language": {"Drift score": 0.0, "Method": "Cramer's V", "Importance": None}, "Subjectivity": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None}, - "Average Word Length": {"Drift score": 0.0,"Method": "Kolmogorov-Smirnov", "Importance": None}, + "Average Word Length": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None}, "Text Length": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None}, "Max Word Length": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None}, 
"Toxicity": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None}, @@ -43,7 +44,6 @@ def test_without_drift(self, tweet_emotion_train_test_textdata): "Fluency": {"Drift score": 0.0, "Method": "Kolmogorov-Smirnov", "Importance": None}, })) # type: ignore - def test_with_drift(self, tweet_emotion_train_test_textdata): # Arrange train, test = tweet_emotion_train_test_textdata @@ -147,9 +147,9 @@ def test_with_drift(self, small_wikiann: t.Tuple[TextData, TextData]): class TestMultiLabelClassification: - def test_without_drift(self, dummy_multilabel_dataset: TextData): + def test_without_drift(self, dummy_multilabel_textdata_train_test): # Arrange - train = dummy_multilabel_dataset + train, _ = dummy_multilabel_textdata_train_test train.calculate_default_properties() check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than() # Act @@ -166,4 +166,23 @@ def test_without_drift(self, dummy_multilabel_dataset: TextData): 'Average Word Length': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}), 'Subjectivity': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}), 'Max Word Length': has_entries({'Drift score': 0.0, 'Method': 'Kolmogorov-Smirnov'}) - })) # type: ignore \ No newline at end of file + })) # type: ignore + + def test_with_drift(self, dummy_multilabel_textdata_train_test): + # Arrange + train, test = dummy_multilabel_textdata_train_test + train.calculate_default_properties() + test.calculate_default_properties() + check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3, + max_allowed_categorical_score=0.3) + # Act + result = check.run(train_dataset=train, test_dataset=test) + condition_results = check.conditions_decision(result) + + assert_that(condition_results, has_items( + equal_condition_result(is_pass=False, + details="Failed for 1 out of 6 columns.\nFound 1 " + "numeric columns with Kolmogorov-Smirnov above threshold: " + "{'Text Length': '0.33'}", + name='categorical drift score < 0.3 and numerical drift score < 0.3') + )) diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py index e663faf0f8..319b74fb34 100644 --- a/tests/nlp/conftest.py +++ b/tests/nlp/conftest.py @@ -49,7 +49,6 @@ def tweet_emotion_train_test_probabilities(): return tweet_emotion.load_precalculated_predictions(pred_format='probabilities', as_train_test=True) - @pytest.fixture(scope='function') def text_classification_string_class_dataset_mock(): """Mock for a text classification dataset with string labels""" @@ -67,18 +66,13 @@ def text_multilabel_classification_dataset_mock(): @pytest.fixture(scope='function') -def dummy_multilabel_dataset(): - return TextData( - raw_text=[ - random.choice(['I think therefore I am', 'I am therefore I think', 'I am']) - for _ in range(20) - ], - label=[ - random.choice([[0, 0, 1], [1, 1, 0], [0, 1, 0]]) - for _ in range(20) - ], - task_type='text_classification' - ) +def dummy_multilabel_textdata_train_test(set_numpy_seed): + """Dummy multilabel text classification dataset""" + raw_text = [random.choice(['I think therefore I am', 'I am therefore I think', 'I am']) for _ in range(40)] + \ + ['bla'] * 10 + label = [random.choice([[0, 0, 1], [1, 1, 0], [0, 1, 0]]) for _ in range(40)] + [[1, 0, 0]] * 10 + text_data = TextData(raw_text=raw_text, label=label, task_type='text_classification') + return text_data.copy(rows_to_use=list(range(20))), text_data.copy(rows_to_use=list(range(20, 50))) def download_nltk_resources(): @@ -118,15 +112,17 @@ def 
movie_reviews_data_negative(): neg_data = TextData(random.choices(neg_sentences, k=1000), name='Negative') return neg_data + def _tokenize_raw_text(raw_text): """Tokenize raw text""" return [x.split() for x in raw_text] + @pytest.fixture(scope='session') def text_token_classification_dataset_mock(): """Mock for a token classification dataset""" return TextData(tokenized_text=_tokenize_raw_text(['Mary had a little lamb', 'Mary lives in London and Paris', - 'How much wood can a wood chuck chuck?']), + 'How much wood can a wood chuck chuck?']), label=[['B-PER', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']], task_type='token_classification') @@ -179,5 +175,3 @@ def _wikiann_to_text_data(wikiann): ], task_type='token_classification' ) - - diff --git a/tests/tabular/checks/model_evaluation/prediction_drift_test.py b/tests/tabular/checks/model_evaluation/prediction_drift_test.py index 5a3144fe45..add68eedb6 100644 --- a/tests/tabular/checks/model_evaluation/prediction_drift_test.py +++ b/tests/tabular/checks/model_evaluation/prediction_drift_test.py @@ -161,7 +161,7 @@ def test_drift_max_drift_score_condition_fail_psi(drifted_data_and_model): # Assert assert_that(condition_result, equal_condition_result( is_pass=False, - name='categorical drift score < 0.15 and numerical drift score < 0.15', + name='Prediction drift score < 0.15', details='Found model prediction PSI drift score of 0.79' )) @@ -215,8 +215,7 @@ def test_drift_max_drift_score_condition_pass_threshold(drifted_data_and_model): train = remove_label(train) test = remove_label(test) check = PredictionDrift(categorical_drift_method='PSI', drift_mode='prediction') \ - .add_condition_drift_score_less_than(max_allowed_categorical_score=1, - max_allowed_numeric_score=1) + .add_condition_drift_score_less_than(max_allowed_drift_score=1) # Act result = check.run(train, test, model) @@ -226,7 +225,7 @@ def test_drift_max_drift_score_condition_pass_threshold(drifted_data_and_model): assert_that(condition_result, equal_condition_result( is_pass=True, details='Found model prediction PSI drift score of 0.79', - name='categorical drift score < 1 and numerical drift score < 1' + name='Prediction drift score < 1' )) @@ -271,7 +270,7 @@ def test_binary_proba_condition_fail_threshold(drifted_data_and_model): assert_that(condition_result, equal_condition_result( is_pass=False, - name='categorical drift score < 0.15 and numerical drift score < 0.15', + name='Prediction drift score < 0.15', details='Found model prediction Earth Mover\'s Distance drift score of 0.23' )) @@ -284,7 +283,7 @@ def test_multiclass_proba_reduce_aggregations(iris_split_dataset_and_model_rf): check = PredictionDrift(categorical_drift_method='PSI', numerical_drift_method='EMD', max_num_categories=10, min_category_size_ratio=0, drift_mode='proba', aggregation_method='weighted' - ).add_condition_drift_score_less_than(max_allowed_numeric_score=0.05) + ).add_condition_drift_score_less_than(max_allowed_drift_score=0.05) # Act result = check.run(train, test, model) @@ -315,7 +314,7 @@ def test_multiclass_proba_reduce_aggregations(iris_split_dataset_and_model_rf): assert_that(condition_result, equal_condition_result( is_pass=False, - name='categorical drift score < 0.15 and numerical drift score < 0.05', + name='Prediction drift score < 0.05', details='Found 2 classes with model predicted probability Earth Mover\'s ' 'Distance drift score above threshold: 0.05.' 
)) diff --git a/tests/tabular/checks/train_test_validation/label_drift_test.py b/tests/tabular/checks/train_test_validation/label_drift_test.py index 882aa1f017..3e48352a74 100644 --- a/tests/tabular/checks/train_test_validation/label_drift_test.py +++ b/tests/tabular/checks/train_test_validation/label_drift_test.py @@ -166,7 +166,7 @@ def test_drift_max_drift_score_condition_fail_psi(drifted_classification_label): # Assert assert_that(condition_result, equal_condition_result( is_pass=False, - name='categorical drift score < 0.15 and numerical drift score < 0.15 for label drift', + name='Label drift score < 0.15', details='Label\'s drift score PSI is 0.24' )) @@ -184,7 +184,7 @@ def test_drift_max_drift_score_condition_fail_emd(drifted_regression_label): assert_that(condition_result, equal_condition_result( is_pass=False, category=ConditionCategory.FAIL, - name='categorical drift score < 0.15 and numerical drift score < 0.15 for label drift', + name='Label drift score < 0.15', details='Label\'s drift score Earth Mover\'s Distance is 0.34' )) @@ -193,8 +193,7 @@ def test_drift_max_drift_score_condition_pass_threshold(non_drifted_classificati # Arrange train, test = non_drifted_classification_label check = LabelDrift(categorical_drift_method='PSI') \ - .add_condition_drift_score_less_than(max_allowed_categorical_score=1, - max_allowed_numeric_score=1) + .add_condition_drift_score_less_than(max_allowed_drift_score=1) # Act result = check.run(train, test) @@ -204,5 +203,5 @@ def test_drift_max_drift_score_condition_pass_threshold(non_drifted_classificati assert_that(condition_result, equal_condition_result( is_pass=True, details='Label\'s drift score PSI is 3.37E-3', - name='categorical drift score < 1 and numerical drift score < 1 for label drift' + name='Label drift score < 1' ))
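Finally, a sketch of the split pattern used by the new dummy_multilabel_textdata_train_test fixture: build one TextData and slice it into train/test views with copy(rows_to_use=...); the texts and labels here are simplified stand-ins for the randomized ones in the fixture.

from deepchecks.nlp.text_data import TextData

raw_text = ['I think therefore I am'] * 40 + ['bla'] * 10
label = [[0, 0, 1]] * 40 + [[1, 0, 0]] * 10
data = TextData(raw_text=raw_text, label=label, task_type='text_classification')
train, test = data.copy(rows_to_use=list(range(20))), data.copy(rows_to_use=list(range(20, 50)))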