Skip to content

Commit

Permalink
New categorical drift method 1138 (#1288)
Browse files Browse the repository at this point in the history
* Separated plotting max_num_categories from drift max_num_categories
Added parameter show_categories_by
added to train_test_label_drift checks to see that it works

* fixed pylint

* Added the new params to all relevant checks

* Added new code to some test

* fixed pylint

* Added deprecation warnings

* Fixed bug

* Fixed pylint

* pylint last fix

* Removed redundant code

* Changed "percentage" to "frequency" in drift plots
  • Loading branch information
nirhutnik committed Apr 21, 2022
1 parent 06fb730 commit 0e0827c
Show file tree
Hide file tree
Showing 17 changed files with 389 additions and 99 deletions.
1 change: 1 addition & 0 deletions deepchecks/tabular/base_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from functools import wraps
from typing import Union, Mapping, List, Any

from deepchecks.tabular import deprecation_warnings # pylint: disable=unused-import # noqa: F401
from deepchecks.tabular.dataset import Dataset
from deepchecks.tabular.context import Context
from deepchecks.core.check_result import (
Expand Down
53 changes: 39 additions & 14 deletions deepchecks/tabular/checks/distribution/train_test_feature_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from collections import OrderedDict
from typing import Union, List, Dict
import warnings

from deepchecks.core import ConditionResult, CheckResult
from deepchecks.core.condition import ConditionCategory
Expand Down Expand Up @@ -49,31 +50,53 @@ class TrainTestFeatureDrift(TrainTestCheck):
sort_feature_by : str , default: feature importance
Indicates how features will be sorted. Can be either "feature importance"
or "drift score"
max_num_categories : int , default: 10
max_num_categories_for_drift: int, default: 10
Only for categorical columns. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies
for both drift calculation and for distribution plots.
they are binned into an "Other" category. If None, there is no limit.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'train_largest'
Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
by max_num_categories_for_display. Possible values:
- 'train_largest': Show the largest train categories.
- 'test_largest': Show the largest test categories.
- 'largest_difference': Show the largest difference between categories.
n_samples : int , default: 100_000
Number of samples to use for drift computation and plot.
random_state : int , default: 42
Random seed for sampling.
max_num_categories: int, default: None
Deprecated. Please use max_num_categories_for_drift and max_num_categories_for_display instead
"""

def __init__(
self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: int = 5,
sort_feature_by: str = 'feature importance',
max_num_categories: int = 10,
n_samples: int = 100_000,
random_state: int = 42,
**kwargs
self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: int = 5,
sort_feature_by: str = 'feature importance',
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
n_samples: int = 100_000,
random_state: int = 42,
max_num_categories: int = None, # Deprecated
**kwargs
):
super().__init__(**kwargs)
self.columns = columns
self.ignore_columns = ignore_columns
self.max_num_categories = max_num_categories
if max_num_categories is not None:
warnings.warn(
f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
'and max_num_categories_for_display instead',
DeprecationWarning
)
max_num_categories_for_drift = max_num_categories_for_drift or max_num_categories
max_num_categories_for_display = max_num_categories_for_display or max_num_categories
self.max_num_categories_for_drift = max_num_categories_for_drift
self.max_num_categories_for_display = max_num_categories_for_display
self.show_categories_by = show_categories_by
if sort_feature_by in {'feature importance', 'drift score'}:
self.sort_feature_by = sort_feature_by
else:
Expand Down Expand Up @@ -137,7 +160,9 @@ def run_logic(self, context: Context) -> CheckResult:
value_name=column,
column_type=column_type,
plot_title=plot_title,
max_num_categories=self.max_num_categories
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by
)
values_dict[column] = {
'Drift score': value,
Expand Down
43 changes: 34 additions & 9 deletions deepchecks/tabular/checks/distribution/train_test_label_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Module contains Train Test label Drift check."""

import warnings
from typing import Dict

from deepchecks.core.condition import ConditionCategory
Expand All @@ -35,19 +35,41 @@ class TrainTestLabelDrift(TrainTestCheck):
Parameters
----------
max_num_categories : int , default: 10
max_num_categories_for_drift: int, default: 10
Only for categorical columns. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies
for both drift calculation and for distribution plots.
they are binned into an "Other" category. If None, there is no limit.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'train_largest'
Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
by max_num_categories_for_display. Possible values:
- 'train_largest': Show the largest train categories.
- 'test_largest': Show the largest test categories.
- 'largest_difference': Show the largest difference between categories.
max_num_categories: int, default: None
Deprecated. Please use max_num_categories_for_drift and max_num_categories_for_display instead
"""

def __init__(
    self,
    max_num_categories_for_drift: int = 10,
    max_num_categories_for_display: int = 10,
    show_categories_by: str = 'train_largest',
    max_num_categories: int = None,  # Deprecated
    **kwargs
):
    """Store the category limits and display mode used by the label drift check.

    Parameters
    ----------
    max_num_categories_for_drift: int, default: 10
        Category limit forwarded to the drift calculation.
    max_num_categories_for_display: int, default: 10
        Category limit forwarded to the distribution plot.
    show_categories_by: str, default: 'train_largest'
        Which categories to show when the plot is limited.
    max_num_categories: int, default: None
        Deprecated. When given, a DeprecationWarning is emitted and the value is used for
        every new limit that was left unset (None/0) or at its default of 10.
    **kwargs
        Forwarded unchanged to the parent class initializer.
    """
    super().__init__(**kwargs)
    if max_num_categories is not None:
        # No `stacklevel` here on purpose: the filter registered in
        # deepchecks/tabular/deprecation_warnings.py matches on modules named `deepchecks.*`,
        # so the warning has to be attributed to this module to be shown.
        warnings.warn(
            'max_num_categories is deprecated. please use max_num_categories_for_drift and '
            'max_num_categories_for_display instead',
            DeprecationWarning
        )
        # The previous `new_value or max_num_categories` fallback could never trigger with the
        # default arguments, because the new parameters default to 10 (truthy); the deprecated
        # value was therefore silently dropped. Honour it whenever the new parameter is unset
        # or still at its default, so callers of the old API keep their configured limit.
        default_limit = 10  # must mirror the signature defaults above
        if not max_num_categories_for_drift or max_num_categories_for_drift == default_limit:
            max_num_categories_for_drift = max_num_categories
        if not max_num_categories_for_display or max_num_categories_for_display == default_limit:
            max_num_categories_for_display = max_num_categories
    self.max_num_categories_for_drift = max_num_categories_for_drift
    self.max_num_categories_for_display = max_num_categories_for_display
    self.show_categories_by = show_categories_by

def run_logic(self, context: Context) -> CheckResult:
"""Calculate drift for all columns.
Expand All @@ -66,7 +88,10 @@ def run_logic(self, context: Context) -> CheckResult:
test_column=test_dataset.label_col,
value_name=train_dataset.label_name,
column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical',
max_num_categories=self.max_num_categories
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by

)

headnote = """<span>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"""Module contains Train Test label Drift check."""

from typing import Dict

import warnings
import pandas as pd

from deepchecks import ConditionCategory
Expand All @@ -37,19 +37,41 @@ class TrainTestPredictionDrift(TrainTestCheck):
Parameters
----------
max_num_categories : int , default: 10
max_num_categories_for_drift: int, default: 10
Only for categorical columns. Max number of allowed categories. If there are more,
they are binned into an "Other" category. If max_num_categories=None, there is no limit. This limit applies
for both drift calculation and for distribution plots.
they are binned into an "Other" category. If None, there is no limit.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'train_largest'
Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
by max_num_categories_for_display. Possible values:
- 'train_largest': Show the largest train categories.
- 'test_largest': Show the largest test categories.
- 'largest_difference': Show the largest difference between categories.
max_num_categories: int, default: None
Deprecated. Please use max_num_categories_for_drift and max_num_categories_for_display instead
"""

def __init__(
    self,
    max_num_categories_for_drift: int = 10,
    max_num_categories_for_display: int = 10,
    show_categories_by: str = 'train_largest',
    max_num_categories: int = None,  # Deprecated
    **kwargs
):
    """Store the category limits and display mode used by the prediction drift check.

    Parameters
    ----------
    max_num_categories_for_drift: int, default: 10
        Category limit forwarded to the drift calculation.
    max_num_categories_for_display: int, default: 10
        Category limit forwarded to the distribution plot.
    show_categories_by: str, default: 'train_largest'
        Which categories to show when the plot is limited.
    max_num_categories: int, default: None
        Deprecated. When given, a DeprecationWarning is emitted and the value is used for
        every new limit that was left unset (None/0) or at its default of 10.
    **kwargs
        Forwarded unchanged to the parent class initializer.
    """
    super().__init__(**kwargs)
    if max_num_categories is not None:
        # No `stacklevel` here on purpose: the filter registered in
        # deepchecks/tabular/deprecation_warnings.py matches on modules named `deepchecks.*`,
        # so the warning has to be attributed to this module to be shown.
        warnings.warn(
            f'{self.__class__.__name__}: max_num_categories is deprecated. please use max_num_categories_for_drift '
            'and max_num_categories_for_display instead',
            DeprecationWarning
        )
        # The previous `new_value or max_num_categories` fallback could never trigger with the
        # default arguments, because the new parameters default to 10 (truthy); the deprecated
        # value was therefore silently dropped. Honour it whenever the new parameter is unset
        # or still at its default, so callers of the old API keep their configured limit.
        default_limit = 10  # must mirror the signature defaults above
        if not max_num_categories_for_drift or max_num_categories_for_drift == default_limit:
            max_num_categories_for_drift = max_num_categories
        if not max_num_categories_for_display or max_num_categories_for_display == default_limit:
            max_num_categories_for_display = max_num_categories
    self.max_num_categories_for_drift = max_num_categories_for_drift
    self.max_num_categories_for_display = max_num_categories_for_display
    self.show_categories_by = show_categories_by

def run_logic(self, context: Context) -> CheckResult:
"""Calculate drift for all columns.
Expand All @@ -72,7 +94,9 @@ def run_logic(self, context: Context) -> CheckResult:
test_column=pd.Series(test_prediction),
value_name='model predictions',
column_type='categorical' if train_dataset.label_type == 'classification_label' else 'numerical',
max_num_categories=self.max_num_categories
max_num_categories_for_drift=self.max_num_categories_for_drift,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by
)

headnote = """<span>
Expand Down
20 changes: 20 additions & 0 deletions deepchecks/tabular/deprecation_warnings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2022 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""This file changes default 'ignore' action of DeprecationWarnings for specific deprecation messages."""
import warnings

# Since version 0.6.2 `max_num_categories` is deprecated in all drift checks.
# Register an 'always' filter so that any DeprecationWarning whose message mentions
# max_num_categories, and which is issued from a module named deepchecks*, is shown
# on every occurrence.
warnings.filterwarnings(
    'always',
    r'.*max_num_categories.*',
    DeprecationWarning,
    r'deepchecks.*',
)
68 changes: 53 additions & 15 deletions deepchecks/utils/distribution/drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
from deepchecks.utils.distribution.plot import drift_score_bar_traces, feature_distribution_traces
from deepchecks.utils.distribution.preprocessing import preprocess_2_cat_cols_to_same_bins
from deepchecks.core.errors import DeepchecksValueError, NotEnoughSamplesError
from deepchecks.utils.strings import format_percent

PSI_MIN_PERCENTAGE = 0.01


__all__ = ['calc_drift_and_plot']


Expand Down Expand Up @@ -100,26 +100,36 @@ def calc_drift_and_plot(train_column: pd.Series,
value_name: Hashable,
column_type: str,
plot_title: Optional[str] = None,
max_num_categories: int = 10,
max_num_categories_for_drift: int = 10,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'train_largest',
min_samples: int = 10) -> Tuple[float, str, Callable]:
"""
Calculate drift score per column.
Parameters
----------
train_column : pd.Series
train_column: pd.Series
column from train dataset
test_column : pd.Series
test_column: pd.Series
same column from test dataset
value_name : Hashable
value_name: Hashable
title of the x axis, if plot_title is None then also the title of the whole plot.
column_type : str
column_type: str
type of column (either "numerical" or "categorical")
plot_title : str or None
plot_title: str or None
if None use value_name as title otherwise use this.
max_num_categories : int , default: 10
max_num_categories_for_drift: int, default: 10
Max number of allowed categories. If there are more, they are binned into an "Other" category.
min_samples : int, default: 10
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'train_largest'
Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
by max_num_categories_for_display. Possible values:
- 'train_largest': Show the largest train categories.
- 'test_largest': Show the largest test categories.
- 'largest_difference': Show the largest difference between categories.
min_samples: int, default: 10
Minimum number of samples for each column in order to calculate drift
Returns
-------
Expand Down Expand Up @@ -147,25 +157,53 @@ def calc_drift_and_plot(train_column: pd.Series,
dist_traces, dist_x_axis, dist_y_axis = feature_distribution_traces(train_dist, test_dist, value_name)
elif column_type == 'categorical':
scorer_name = 'PSI'
expected_percents, actual_percents, _ = \
preprocess_2_cat_cols_to_same_bins(dist1=train_dist, dist2=test_dist, max_num_categories=max_num_categories)
expected, actual, _ = \
preprocess_2_cat_cols_to_same_bins(dist1=train_column, dist2=test_column,
max_num_categories=max_num_categories_for_drift)
expected_percents, actual_percents = expected / len(train_column), actual / len(test_column)
score = psi(expected_percents=expected_percents, actual_percents=actual_percents)

bar_traces, bar_x_axis, bar_y_axis = drift_score_bar_traces(score, bar_max=1)
dist_traces, dist_x_axis, dist_y_axis = feature_distribution_traces(
train_dist, test_dist, value_name, is_categorical=True, max_num_categories=max_num_categories
train_dist, test_dist, value_name, is_categorical=True, max_num_categories=max_num_categories_for_display,
show_categories_by=show_categories_by
)
else:
# Should never reach here
raise DeepchecksValueError(f'Unsupported column type for drift: {column_type}')

fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, shared_yaxes=False, shared_xaxes=False,
row_heights=[0.1, 0.9],
subplot_titles=[f'Drift Score ({scorer_name})', 'Distribution Plot'])
all_categories = list(set(train_column).union(set(test_column)))
add_footnote = column_type == 'categorical' and len(all_categories) > max_num_categories_for_drift

if not add_footnote:
fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, shared_yaxes=False, shared_xaxes=False,
row_heights=[0.1, 0.9],
subplot_titles=[f'Drift Score ({scorer_name})', 'Distribution Plot'])
else:
fig = make_subplots(rows=3, cols=1, vertical_spacing=0.2, shared_yaxes=False, shared_xaxes=False,
row_heights=[0.1, 0.8, 0.1],
subplot_titles=[f'Drift Score ({scorer_name})', 'Distribution Plot'])

fig.add_traces(bar_traces, rows=[1] * len(bar_traces), cols=[1] * len(bar_traces))
fig.add_traces(dist_traces, rows=[2] * len(dist_traces), cols=[1] * len(dist_traces))

if add_footnote:
param_to_print_dict = {
'train_largest': 'largest categories (by train)',
'test_largest': 'largest categories (by test)',
'largest_difference': 'largest difference between categories'
}
train_data_percents = dist_traces[0].y.sum()
test_data_percents = dist_traces[1].y.sum()

fig.add_annotation(
x=0, y=-0.2, showarrow=False, xref='paper', yref='paper', xanchor='left',
text=f'* Showing the top {max_num_categories_for_drift} {param_to_print_dict[show_categories_by]} out of '
f'total {len(all_categories)} categories.'
f'<br>Shown data is {format_percent(train_data_percents)} of train data and '
f'{format_percent(test_data_percents)} of test data.'
)

if not plot_title:
plot_title = value_name

Expand Down

0 comments on commit 0e0827c

Please sign in to comment.