New categorical drift method 1138 (#1288)

* Separated plotting max_num_categories from drift max_num_categories. Added parameter show_categories_by to train_test_label_drift checks to see that it works * fixed pylint * Added the new params to all relevant checks * Added new code to some test * fixed pylint * Added deprecation warnings * Fixed bug * Fixed pylint * pylint last fix * Removed redundant code * Changed "percentage" to "frequency" in drift plots
deepchecks · Apr 21, 2022 · 0e0827c · 0e0827c
1 parent 06fb730
commit 0e0827c
Show file tree

Hide file tree

Showing 17 changed files with 389 additions and 99 deletions.
diff --git a/deepchecks/tabular/base_checks.py b/deepchecks/tabular/base_checks.py
@@ -13,6 +13,7 @@
 from functools import wraps
 from typing import Union, Mapping, List, Any
 
+from deepchecks.tabular import deprecation_warnings  # pylint: disable=unused-import # noqa: F401
 from deepchecks.tabular.dataset import Dataset
 from deepchecks.tabular.context import Context
 from deepchecks.core.check_result import (

diff --git a/deepchecks/tabular/checks/distribution/train_test_feature_drift.py b/deepchecks/tabular/checks/distribution/train_test_feature_drift.py
@@ -12,6 +12,7 @@
 
 from collections import OrderedDict
 from typing import Union, List, Dict
+import warnings
 
 from deepchecks.core import ConditionResult, CheckResult
 from deepchecks.core.condition import ConditionCategory
@@ -49,31 +50,53 @@ class TrainTestFeatureDrift(TrainTestCheck):
     sort_feature_by : str , default: feature importance
         Indicates how features will be sorted. Can be either "feature importance"
         or "drift score"
-    max_num_categories : int , default: 10
+    max_num_categories_for_drift: int, default: 10
         Only for categorical columns. Max number of allowed categories. If there are more,
-        they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies
-        for both drift calculation and for distribution plots.
+        they are binned into an "Other" category. If None, there is no limit.
+    max_num_categories_for_display: int, default: 10
+        Max number of categories to show in plot.
+    show_categories_by: str, default: 'train_largest'
+        Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
+        by max_num_categories_for_display. Possible values:
+        - 'train_largest': Show the largest train categories.
+        - 'test_largest': Show the largest test categories.
+        - 'largest_difference': Show the largest difference between categories.
     n_samples : int , default: 100_000
         Number of samples to use for drift computation and plot.
     random_state : int , default: 42
         Random seed for sampling.
+    max_num_categories: int, default: None
+        Deprecated. Please use max_num_categories_for_drift and max_num_categories_for_display instead
     """
 
     def __init__(
-        self,
-        columns: Union[Hashable, List[Hashable], None] = None,
-        ignore_columns: Union[Hashable, List[Hashable], None] = None,
-        n_top_columns: int = 5,
-        sort_feature_by: str = 'feature importance',
-        max_num_categories: int = 10,
-        n_samples: int = 100_000,
-        random_state: int = 42,
-        **kwargs
+            self,
+            columns: Union[Hashable, List[Hashable], None] = None,
+            ignore_columns: Union[Hashable, List[Hashable], None] = None,
+            n_top_columns: int = 5,
+            sort_feature_by: str = 'feature importance',
+            max_num_categories_for_drift: int = 10,
+            max_num_categories_for_display: int = 10,
+            show_categories_by: str = 'train_largest',
+            n_samples: int = 100_000,
+            random_state: int = 42,
+            max_num_categories: int = None,  # Deprecated
+            **kwargs
     ):
         super().__init__(**kwargs)
         self.columns = columns
         self.ignore_columns = ignore_columns
-        self.max_num_categories = max_num_categories
+        if max_num_categories is not None:
+            warnings.warn(
+                f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
+                'and max_num_categories_for_display instead',
+                DeprecationWarning
+            )
+            max_num_categories_for_drift = max_num_categories_for_drift or max_num_categories
+            max_num_categories_for_display = max_num_categories_for_display or max_num_categories
+        self.max_num_categories_for_drift = max_num_categories_for_drift
+        self.max_num_categories_for_display = max_num_categories_for_display
+        self.show_categories_by = show_categories_by
         if sort_feature_by in {'feature importance', 'drift score'}:
             self.sort_feature_by = sort_feature_by
         else:
@@ -137,7 +160,9 @@ def run_logic(self, context: Context) -> CheckResult:
                 value_name=column,
                 column_type=column_type,
                 plot_title=plot_title,
-                max_num_categories=self.max_num_categories
+                max_num_categories_for_drift=self.max_num_categories_for_drift,
+                max_num_categories_for_display=self.max_num_categories_for_display,
+                show_categories_by=self.show_categories_by
             )
             values_dict[column] = {
                 'Drift score': value,

diff --git a/deepchecks/tabular/checks/distribution/train_test_label_drift.py b/deepchecks/tabular/checks/distribution/train_test_label_drift.py
@@ -9,7 +9,7 @@
 # ----------------------------------------------------------------------------
 #
 """Module contains Train Test label Drift check."""
-
+import warnings
 from typing import Dict
 
 from deepchecks.core.condition import ConditionCategory
@@ -35,19 +35,41 @@ class TrainTestLabelDrift(TrainTestCheck):
 
     Parameters
     ----------
-    max_num_categories : int , default: 10
+    max_num_categories_for_drift: int, default: 10
         Only for categorical columns. Max number of allowed categories. If there are more,
-        they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies
-        for both drift calculation and for distribution plots.
+        they are binned into an "Other" category. If None, there is no limit.
+    max_num_categories_for_display: int, default: 10
+        Max number of categories to show in plot.
+    show_categories_by: str, default: 'train_largest'
+        Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
+        by max_num_categories_for_display. Possible values:
+        - 'train_largest': Show the largest train categories.
+        - 'test_largest': Show the largest test categories.
+        - 'largest_difference': Show the largest difference between categories.
+    max_num_categories: int, default: None
+        Deprecated. Please use max_num_categories_for_drift and max_num_categories_for_display instead
     """
 
     def __init__(
-        self,
-        max_num_categories: int = 10,
-        **kwargs
+            self,
+            max_num_categories_for_drift: int = 10,
+            max_num_categories_for_display: int = 10,
+            show_categories_by: str = 'train_largest',
+            max_num_categories: int = None,
+            **kwargs
     ):
         super().__init__(**kwargs)
-        self.max_num_categories = max_num_categories
+        if max_num_categories is not None:
+            warnings.warn(
+                'max_num_categories is deprecated. please use max_num_categories_for_drift and '
+                'max_num_categories_for_display instead',
+                DeprecationWarning
+            )
+            max_num_categories_for_drift = max_num_categories_for_drift or max_num_categories
+            max_num_categories_for_display = max_num_categories_for_display or max_num_categories
+        self.max_num_categories_for_drift = max_num_categories_for_drift
+        self.max_num_categories_for_display = max_num_categories_for_display
+        self.show_categories_by = show_categories_by
 
     def run_logic(self, context: Context) -> CheckResult:
         """Calculate drift for all columns.
@@ -66,7 +88,10 @@ def run_logic(self, context: Context) -> CheckResult:
             test_column=test_dataset.label_col,
             value_name=train_dataset.label_name,
             column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical',
-            max_num_categories=self.max_num_categories
+            max_num_categories_for_drift=self.max_num_categories_for_drift,
+            max_num_categories_for_display=self.max_num_categories_for_display,
+            show_categories_by=self.show_categories_by
+
         )
 
         headnote = """<span>

diff --git a/deepchecks/tabular/checks/distribution/train_test_prediction_drift.py b/deepchecks/tabular/checks/distribution/train_test_prediction_drift.py
@@ -11,7 +11,7 @@
 """Module contains Train Test label Drift check."""
 
 from typing import Dict
-
+import warnings
 import pandas as pd
 
 from deepchecks import ConditionCategory
@@ -37,19 +37,41 @@ class TrainTestPredictionDrift(TrainTestCheck):
 
     Parameters
     ----------
-    max_num_categories : int , default: 10
+    max_num_categories_for_drift: int, default: 10
         Only for categorical columns. Max number of allowed categories. If there are more,
-        they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies
-        for both drift calculation and for distribution plots.
+        they are binned into an "Other" category. If None, there is no limit.
+    max_num_categories_for_display: int, default: 10
+        Max number of categories to show in plot.
+    show_categories_by: str, default: 'train_largest'
+        Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
+        by max_num_categories_for_display. Possible values:
+        - 'train_largest': Show the largest train categories.
+        - 'test_largest': Show the largest test categories.
+        - 'largest_difference': Show the largest difference between categories.
+    max_num_categories: int, default: None
+        Deprecated. Please use max_num_categories_for_drift and max_num_categories_for_display instead
     """
 
     def __init__(
-        self,
-        max_num_categories: int = 10,
-        **kwargs
+            self,
+            max_num_categories_for_drift: int = 10,
+            max_num_categories_for_display: int = 10,
+            show_categories_by: str = 'train_largest',
+            max_num_categories: int = None,  # Deprecated
+            **kwargs
     ):
         super().__init__(**kwargs)
-        self.max_num_categories = max_num_categories
+        if max_num_categories is not None:
+            warnings.warn(
+                f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
+                'and max_num_categories_for_display instead',
+                DeprecationWarning
+            )
+            max_num_categories_for_drift = max_num_categories_for_drift or max_num_categories
+            max_num_categories_for_display = max_num_categories_for_display or max_num_categories
+        self.max_num_categories_for_drift = max_num_categories_for_drift
+        self.max_num_categories_for_display = max_num_categories_for_display
+        self.show_categories_by = show_categories_by
 
     def run_logic(self, context: Context) -> CheckResult:
         """Calculate drift for all columns.
@@ -72,7 +94,9 @@ def run_logic(self, context: Context) -> CheckResult:
             test_column=pd.Series(test_prediction),
             value_name='model predictions',
             column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical',
-            max_num_categories=self.max_num_categories
+            max_num_categories_for_drift=self.max_num_categories_for_drift,
+            max_num_categories_for_display=self.max_num_categories_for_display,
+            show_categories_by=self.show_categories_by
         )
 
         headnote = """<span>

diff --git a/deepchecks/tabular/deprecation_warnings.py b/deepchecks/tabular/deprecation_warnings.py
@@ -0,0 +1,20 @@
+# ----------------------------------------------------------------------------
+# Copyright (C) 2021-2022 Deepchecks (https://www.deepchecks.com)
+#
+# This file is part of Deepchecks.
+# Deepchecks is distributed under the terms of the GNU Affero General
+# Public License (version 3 or later).
+# You should have received a copy of the GNU Affero General Public License
+# along with Deepchecks.  If not, see <http://www.gnu.org/licenses/>.
+# ----------------------------------------------------------------------------
+#
+"""This file changes default 'ignore' action of DeprecationWarnings for specific deprecation messages."""
+import warnings
+
+# Added in version 0.6.2, deprecates max_num_categories in all drift checks
+warnings.filterwarnings(
+    action='always',
+    message=r'.*max_num_categories.*',
+    category=DeprecationWarning,
+    module=r'deepchecks.*'
+)
diff --git a/deepchecks/utils/distribution/drift.py b/deepchecks/utils/distribution/drift.py
@@ -22,10 +22,10 @@
 from deepchecks.utils.distribution.plot import drift_score_bar_traces, feature_distribution_traces
 from deepchecks.utils.distribution.preprocessing import preprocess_2_cat_cols_to_same_bins
 from deepchecks.core.errors import DeepchecksValueError, NotEnoughSamplesError
+from deepchecks.utils.strings import format_percent
 
 PSI_MIN_PERCENTAGE = 0.01
 
-
 __all__ = ['calc_drift_and_plot']
 
 
@@ -100,26 +100,36 @@ def calc_drift_and_plot(train_column: pd.Series,
                         value_name: Hashable,
                         column_type: str,
                         plot_title: Optional[str] = None,
-                        max_num_categories: int = 10,
+                        max_num_categories_for_drift: int = 10,
+                        max_num_categories_for_display: int = 10,
+                        show_categories_by: str = 'train_largest',
                         min_samples: int = 10) -> Tuple[float, str, Callable]:
     """
     Calculate drift score per column.
 
     Parameters
     ----------
-    train_column : pd.Series
+    train_column: pd.Series
         column from train dataset
-    test_column : pd.Series
+    test_column: pd.Series
         same column from test dataset
-    value_name : Hashable
+    value_name: Hashable
         title of the x axis, if plot_title is None then also the title of the whole plot.
-    column_type : str
+    column_type: str
         type of column (either "numerical" or "categorical")
-    plot_title : str or None
+    plot_title: str or None
         if None use value_name as title otherwise use this.
-    max_num_categories : int , default: 10
+    max_num_categories_for_drift: int, default: 10
         Max number of allowed categories. If there are more, they are binned into an "Other" category.
-    min_samples : int, default: 10
+    max_num_categories_for_display: int, default: 10
+        Max number of categories to show in plot.
+    show_categories_by: str, default: 'train_largest'
+        Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
+        by max_num_categories_for_display. Possible values:
+        - 'train_largest': Show the largest train categories.
+        - 'test_largest': Show the largest test categories.
+        - 'largest_difference': Show the largest difference between categories.
+    min_samples: int, default: 10
         Minimum number of samples for each column in order to calculate drift
     Returns
     -------
@@ -147,25 +157,53 @@ def calc_drift_and_plot(train_column: pd.Series,
         dist_traces, dist_x_axis, dist_y_axis = feature_distribution_traces(train_dist, test_dist, value_name)
     elif column_type == 'categorical':
         scorer_name = 'PSI'
-        expected_percents, actual_percents, _ = \
-            preprocess_2_cat_cols_to_same_bins(dist1=train_dist, dist2=test_dist, max_num_categories=max_num_categories)
+        expected, actual, _ = \
+            preprocess_2_cat_cols_to_same_bins(dist1=train_column, dist2=test_column,
+                                               max_num_categories=max_num_categories_for_drift)
+        expected_percents, actual_percents = expected / len(train_column), actual / len(test_column)
         score = psi(expected_percents=expected_percents, actual_percents=actual_percents)
 
         bar_traces, bar_x_axis, bar_y_axis = drift_score_bar_traces(score, bar_max=1)
         dist_traces, dist_x_axis, dist_y_axis = feature_distribution_traces(
-            train_dist, test_dist, value_name, is_categorical=True, max_num_categories=max_num_categories
+            train_dist, test_dist, value_name, is_categorical=True, max_num_categories=max_num_categories_for_display,
+            show_categories_by=show_categories_by
         )
     else:
         # Should never reach here
         raise DeepchecksValueError(f'Unsupported column type for drift: {column_type}')
 
-    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, shared_yaxes=False, shared_xaxes=False,
-                        row_heights=[0.1, 0.9],
-                        subplot_titles=[f'Drift Score ({scorer_name})', 'Distribution Plot'])
+    all_categories = list(set(train_column).union(set(test_column)))
+    add_footnote = column_type == 'categorical' and len(all_categories) > max_num_categories_for_drift
+
+    if not add_footnote:
+        fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, shared_yaxes=False, shared_xaxes=False,
+                            row_heights=[0.1, 0.9],
+                            subplot_titles=[f'Drift Score ({scorer_name})', 'Distribution Plot'])
+    else:
+        fig = make_subplots(rows=3, cols=1, vertical_spacing=0.2, shared_yaxes=False, shared_xaxes=False,
+                            row_heights=[0.1, 0.8, 0.1],
+                            subplot_titles=[f'Drift Score ({scorer_name})', 'Distribution Plot'])
 
     fig.add_traces(bar_traces, rows=[1] * len(bar_traces), cols=[1] * len(bar_traces))
     fig.add_traces(dist_traces, rows=[2] * len(dist_traces), cols=[1] * len(dist_traces))
 
+    if add_footnote:
+        param_to_print_dict = {
+            'train_largest': 'largest categories (by train)',
+            'test_largest': 'largest categories (by test)',
+            'largest_difference': 'largest difference between categories'
+        }
+        train_data_percents = dist_traces[0].y.sum()
+        test_data_percents = dist_traces[1].y.sum()
+
+        fig.add_annotation(
+            x=0, y=-0.2, showarrow=False, xref='paper', yref='paper', xanchor='left',
+            text=f'* Showing the top {max_num_categories_for_drift} {param_to_print_dict[show_categories_by]} out of '
+                 f'total {len(all_categories)} categories.'
+                 f'<br>Shown data is {format_percent(train_data_percents)} of train data and '
+                 f'{format_percent(test_data_percents)} of test data.'
+        )
+
     if not plot_title:
         plot_title = value_name