Nb/feat/multi label (#2467)
Support multi label prediction and label drift + abstraction
Nadav-Barak committed Apr 23, 2023
1 parent 53c8d87 commit 1bec6b5
Showing 13 changed files with 291 additions and 321 deletions.
58 changes: 3 additions & 55 deletions deepchecks/nlp/checks/model_evaluation/prediction_drift.py
@@ -10,16 +10,12 @@
#
"""Module contains Prediction Drift check."""

from typing import Dict

import numpy as np

from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract
from deepchecks.utils.distribution.drift import SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS
from deepchecks.utils.strings import format_number

__all__ = ['PredictionDrift']

@@ -162,53 +158,5 @@ def run_logic(self, context: Context) -> CheckResult:
train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1))
test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1))

return self.prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display,
proba_drift, not proba_drift)

def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
max_allowed_numeric_score: float = 0.15):
"""
Add condition - require drift score to be less than a certain threshold.
The industry-standard limit for PSI is 0.2; scores above it are generally taken to indicate significant drift.
There are no common industry standards for other drift methods, such as Cramer's V,
Kolmogorov-Smirnov and Earth Mover's Distance.
The threshold was lowered by 25% compared to property drift defaults due to the higher importance of prediction
drift.
Parameters
----------
max_allowed_categorical_score: float , default: 0.15
the max threshold for the categorical variable drift score
max_allowed_numeric_score: float , default: 0.15
the max threshold for the numeric variable drift score
Returns
-------
ConditionResult
False if any distribution has passed the max threshold, True otherwise
"""

def condition(result: Dict) -> ConditionResult:
drift_score_dict = result['Drift score']
# Move to dict for easier looping
if not isinstance(drift_score_dict, dict):
drift_score_dict = {0: drift_score_dict}
method = result['Method']
has_failed = {}
drift_score = 0
for class_name, drift_score in drift_score_dict.items():
has_failed[class_name] = \
(drift_score >= max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
(drift_score >= max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)

if len(has_failed) == 1:
details = f'Found model prediction {method} drift score of {format_number(drift_score)}'
else:
details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \
f' score above threshold: {max_allowed_numeric_score}.'
category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS
return ConditionResult(category, details)

return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
f'numerical drift score < {max_allowed_numeric_score}',
condition)
return self._prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display,
proba_drift, not proba_drift)
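
The per-class threshold logic removed above is not gone; per the commit message it moves into the shared PredictionDriftAbstract. A minimal standalone sketch of that rule, where the method-name sets are placeholders for SUPPORTED_CATEGORICAL_METHODS and SUPPORTED_NUMERIC_METHODS and the function name is illustrative:

from typing import Dict, Union

# Placeholder sets; the real constants live in deepchecks.utils.distribution.drift.
CATEGORICAL_METHODS = {"Cramer's V", 'PSI'}
NUMERIC_METHODS = {'Kolmogorov-Smirnov', "Earth Mover's Distance"}

def drift_condition_passes(result: Dict, max_categorical: float = 0.15,
                           max_numeric: float = 0.15) -> bool:
    """Pass only when every (per-class) drift score is under its method's threshold."""
    scores: Union[float, Dict] = result['Drift score']
    if not isinstance(scores, dict):
        scores = {0: scores}  # wrap a single score for uniform looping
    method = result['Method']
    return not any(
        (score >= max_categorical and method in CATEGORICAL_METHODS) or
        (score >= max_numeric and method in NUMERIC_METHODS)
        for score in scores.values()
    )
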
125 changes: 28 additions & 97 deletions deepchecks/nlp/checks/train_test_validation/label_drift.py
@@ -8,24 +8,17 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module contains Label Drift check."""

from typing import Dict

import pandas as pd
"""Module contains Label Drift check."""

from deepchecks.core import CheckResult, ConditionResult
from deepchecks.core.condition import ConditionCategory
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.core import CheckResult
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.utils.distribution.drift import (SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS,
calc_drift_and_plot, get_drift_plot_sidenote)
from deepchecks.utils.strings import format_number
from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract

__all__ = ['LabelDrift']


class LabelDrift(TrainTestCheck):
class LabelDrift(TrainTestCheck, LabelDriftAbstract):
"""
Calculate label drift between train dataset and test dataset, using statistical measures.
@@ -41,15 +34,17 @@ class LabelDrift(TrainTestCheck):
small number of samples (common practice is categories with less than 5 samples).
However, in cases of a variable with many categories with few samples, it is still recommended to use Cramer's V.
**Note:** In case of highly imbalanced classes, it is recommended to use Cramer's V, together with setting
the ``balance_classes`` parameter to ``True``.
Parameters
----------
min_category_size_ratio: float, default 0.01
minimum size ratio for categories. Categories with size ratio lower than this number are binned
into an "Other" category.
into an "Other" category. Ignored if balance_classes=True.
max_num_categories_for_drift: int, default: None
Max number of allowed categories. If there are more,
they are binned into an "Other" category. This limit applies for both drift calculation and distribution plots
Only for classification. Max number of allowed categories. If there are more,
they are binned into an "Other" category.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'largest_difference'
@@ -58,23 +53,24 @@
- 'train_largest': Show the largest train categories.
- 'test_largest': Show the largest test categories.
- 'largest_difference': Show the largest difference between categories.
numerical_drift_method: str, default: "KS"
decides which method to use on numerical variables. Possible values are:
"EMD" for Earth Mover's Distance (EMD), "KS" for Kolmogorov-Smirnov (KS).
categorical_drift_method: str, default: "cramers_v"
decides which method to use on categorical variables. Possible values are:
"cramers_v" for Cramer's V, "PSI" for Population Stability Index (PSI).
balance_classes: bool, default: False
If True, all categories will have an equal weight in the Cramer's V score. This is useful when the categorical
variable is highly imbalanced, and we want to be alerted on changes in proportion to the category size,
and not only to the entire dataset. Must have categorical_drift_method = "cramers_v" and
drift_mode = "auto" or "prediction".
and not only to the entire dataset. Must have categorical_drift_method = "cramers_v".
If True, the variable frequency plot will be created with a log scale on the y-axis.
ignore_na: bool, default True
ignore_na: bool, default False
For categorical columns only. If True, ignores None values when computing categorical drift. If False, considers
None a separate category. For numerical columns, None values are always ignored.
min_samples : int , default: 10
Minimum number of samples required to calculate the drift score. If there are not enough samples for either
train or test, the check will raise a ``NotEnoughSamplesError`` exception.
n_samples : int , default: 100_000
Number of samples to use for drift computation and plot.
random_state : int , default: 42
Random seed for sampling.
"""

def __init__(
@@ -83,104 +79,39 @@ def __init__(
min_category_size_ratio: float = 0.01,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'largest_difference',
numerical_drift_method: str = 'KS',
categorical_drift_method: str = 'cramers_v',
balance_classes: bool = False,
ignore_na: bool = True,
ignore_na: bool = False,
min_samples: int = 10,
n_samples: int = 100_000,
random_state: int = 42,
**kwargs
):
if show_categories_by not in ('train_largest', 'test_largest', 'largest_difference'):
raise DeepchecksValueError(
'show_categories_by must be one of "train_largest", "test_largest", "largest_difference"')
super().__init__(**kwargs)
# self.margin_quantile_filter = margin_quantile_filter
self.max_num_categories_for_drift = max_num_categories_for_drift
self.min_category_size_ratio = min_category_size_ratio
self.max_num_categories_for_display = max_num_categories_for_display
self.show_categories_by = show_categories_by
self.numerical_drift_method = numerical_drift_method
# self.numerical_drift_method = numerical_drift_method
self.categorical_drift_method = categorical_drift_method
self.balance_classes = balance_classes
self.ignore_na = ignore_na
self.min_samples = min_samples
self.n_samples = n_samples
self.random_state = random_state

def run_logic(self, context: Context) -> CheckResult:
"""Calculate drift for the label.
"""Calculate drift for all columns.
Returns
-------
CheckResult
value: drift score.
display: label distribution graph, comparing the train and test distributions.
"""
context.raise_if_token_classification_task(self)
context.raise_if_multi_label_task(self)

train_dataset = context.train.sample(self.n_samples, random_state=context.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=context.random_state)

drift_score, method, display = calc_drift_and_plot(
train_column=pd.Series(train_dataset.label),
test_column=pd.Series(test_dataset.label),
value_name='Label',
column_type='categorical',
max_num_categories_for_drift=self.max_num_categories_for_drift,
min_category_size_ratio=self.min_category_size_ratio,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by,
numerical_drift_method=self.numerical_drift_method,
categorical_drift_method=self.categorical_drift_method,
balance_classes=self.balance_classes,
ignore_na=self.ignore_na,
with_display=context.with_display,
dataset_names=(train_dataset.name, test_dataset.name)
)

values_dict = {'Drift score': drift_score, 'Method': method}

if context.with_display:
displays = ["""<span>
The Drift score is a measure for the difference between two distributions, in this check - the test
and train distributions.<br> The check shows the drift score and distributions for the label.
</span>""", get_drift_plot_sidenote(self.max_num_categories_for_display, self.show_categories_by), display]
else:
displays = None

return CheckResult(value=values_dict, display=displays, header='Train Test Label Drift')

def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
max_allowed_numeric_score: float = 0.15):
"""
Add condition - require drift score to be less than the threshold.
The industry-standard limit for PSI is 0.2; scores above it are generally taken to indicate significant drift.
There are no common industry standards for other drift methods, such as Cramer's V,
Kolmogorov-Smirnov and Earth Mover's Distance.
The threshold was lowered by 25% compared to property drift defaults due to the higher importance of label drift.
Parameters
----------
max_allowed_categorical_score: float , default: 0.15
the max threshold for the categorical variable drift score
max_allowed_numeric_score: float , default: 0.15
the max threshold for the numeric variable drift score
Returns
-------
ConditionResult
False if the label's drift score has passed the max threshold, True otherwise
"""

def condition(result: Dict) -> ConditionResult:
drift_score = result['Drift score']
method = result['Method']
has_failed = (drift_score > max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
(drift_score > max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)

details = f'Label\'s drift score {method} is {format_number(drift_score)}'
category = ConditionCategory.FAIL if has_failed else ConditionCategory.PASS
return ConditionResult(category, details)
train_dataset = context.train.sample(self.n_samples, random_state=self.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=self.random_state)

return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
f'numerical drift score < {max_allowed_numeric_score} for label drift',
condition)
return self._calculate_label_drift(train_dataset.label.flatten(), test_dataset.label.flatten(), 'Label',
'categorical', context.with_display, (train_dataset.name, test_dataset.name))
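
The docstring above weighs Cramer's V against PSI for exactly this kind of label comparison. As a rough illustration only (deepchecks' internal versions add category binning, optional class balancing and minimum-sample handling), the two measures over a pair of label arrays could be sketched as:

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def psi(train_labels, test_labels, eps: float = 1e-6) -> float:
    """Population Stability Index between two categorical distributions."""
    expected = pd.Series(train_labels).value_counts(normalize=True)
    actual = pd.Series(test_labels).value_counts(normalize=True)
    idx = expected.index.union(actual.index)
    e = expected.reindex(idx, fill_value=0) + eps  # eps guards categories absent on one side
    a = actual.reindex(idx, fill_value=0) + eps
    return float(np.sum((a - e) * np.log(a / e)))

def cramers_v(train_labels, test_labels) -> float:
    """Cramer's V on a label-by-dataset contingency table (no bias correction)."""
    labels = np.concatenate([np.asarray(train_labels), np.asarray(test_labels)])
    source = np.array(['train'] * len(train_labels) + ['test'] * len(test_labels))
    table = pd.crosstab(labels, source)
    chi2 = chi2_contingency(table)[0]
    n = table.to_numpy().sum()
    k = min(table.shape) - 1
    return float(np.sqrt(chi2 / (n * k))) if k > 0 else 0.0

This also hints at why the docstring recommends Cramer's V for rare categories: PSI's log-ratio term blows up for near-empty bins, while the chi-squared statistic behind Cramer's V degrades more gracefully.
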
56 changes: 3 additions & 53 deletions deepchecks/tabular/checks/model_evaluation/prediction_drift.py
@@ -15,14 +15,12 @@

import numpy as np

from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.core.reduce_classes import ReduceMixin
from deepchecks.tabular import Context, TrainTestCheck
from deepchecks.tabular.utils.task_type import TaskType
from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract
from deepchecks.utils.distribution.drift import SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS
from deepchecks.utils.strings import format_number

__all__ = ['PredictionDrift']

@@ -193,8 +191,8 @@ def run_logic(self, context: Context) -> CheckResult:
train_pred = np.array(model.predict(train_dataset.features_columns)).reshape((-1, 1))
test_pred = np.array(model.predict(test_dataset.features_columns)).reshape((-1, 1))

return self.prediction_drift(train_pred, test_pred, context.model_classes, context.with_display, proba_drift,
(context.task_type != TaskType.REGRESSION) and (not proba_drift))
return self._prediction_drift(train_pred, test_pred, context.model_classes, context.with_display, proba_drift,
(context.task_type != TaskType.REGRESSION) and (not proba_drift))

def reduce_output(self, check_result: CheckResult) -> t.Dict[str, float]:
"""Return prediction drift score."""
@@ -217,51 +215,3 @@ def reduce_output(self, check_result: CheckResult) -> t.Dict[str, float]:
def greater_is_better(self):
"""Return True if the check reduce_output is better when it is greater."""
return False

def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
max_allowed_numeric_score: float = 0.15):
"""
Add condition - require drift score to be less than a certain threshold.
The industry-standard limit for PSI is 0.2; scores above it are generally taken to indicate significant drift.
There are no common industry standards for other drift methods, such as Cramer's V,
Kolmogorov-Smirnov and Earth Mover's Distance.
The threshold was lowered by 25% compared to feature drift defaults due to the higher importance of prediction
drift.
Parameters
----------
max_allowed_categorical_score: float , default: 0.15
the max threshold for the categorical variable drift score
max_allowed_numeric_score: float , default: 0.15
the max threshold for the numeric variable drift score
Returns
-------
ConditionResult
False if any distribution has passed the max threshold, True otherwise
"""

def condition(result: t.Dict) -> ConditionResult:
drift_score_dict = result['Drift score']
# Move to dict for easier looping
if not isinstance(drift_score_dict, dict):
drift_score_dict = {0: drift_score_dict}
method = result['Method']
has_failed = {}
drift_score = 0
for class_name, drift_score in drift_score_dict.items():
has_failed[class_name] = \
(drift_score >= max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
(drift_score >= max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)

if len(has_failed) == 1:
details = f'Found model prediction {method} drift score of {format_number(drift_score)}'
else:
details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \
f' score above threshold: {max_allowed_numeric_score}.'
category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS
return ConditionResult(category, details)

return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
f'numerical drift score < {max_allowed_numeric_score}',
condition)
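
For context, a hypothetical end-to-end use of the tabular check after this refactor, assuming the condition method keeps its name on the shared abstract as the commit message's "abstraction" suggests; the frames, feature names and model below are placeholders, not taken from the diff:

from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import PredictionDrift

# train_df, test_df and fitted_model are stand-ins for your own data and model.
train_ds = Dataset(train_df, label='target', cat_features=['city'])
test_ds = Dataset(test_df, label='target', cat_features=['city'])

check = PredictionDrift(drift_mode='proba').add_condition_drift_score_less_than(
    max_allowed_categorical_score=0.15, max_allowed_numeric_score=0.15)
result = check.run(train_ds, test_ds, model=fitted_model)
print(result.value)  # e.g. {'Drift score': {...}, 'Method': ...}
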
