Nb/feat/multi label (#2467)
Support multi label prediction and label drift + abstraction
Nadav-Barak committed Apr 23, 2023
1 parent 53c8d87 commit 1bec6b5
Showing 13 changed files with 291 additions and 321 deletions.
58 changes: 3 additions & 55 deletions deepchecks/nlp/checks/model_evaluation/prediction_drift.py
@@ -10,16 +10,12 @@
#
"""Module contains Prediction Drift check."""

from typing import Dict

import numpy as np

from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract
from deepchecks.utils.distribution.drift import SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS
from deepchecks.utils.strings import format_number

__all__ = ['PredictionDrift']

@@ -162,53 +158,5 @@ def run_logic(self, context: Context) -> CheckResult:
train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1))
test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1))

return self.prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display,
proba_drift, not proba_drift)

def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
max_allowed_numeric_score: float = 0.15):
"""
Add condition - require drift score to be less than a certain threshold.
The industry-standard limit for PSI is 0.2; scores above it are generally taken to indicate significant drift.
There are no common industry standards for other drift methods, such as Cramer's V,
Kolmogorov-Smirnov and Earth Mover's Distance.
The threshold was lowered by 25% compared to property drift defaults due to the higher importance of prediction
drift.
Parameters
----------
max_allowed_categorical_score: float , default: 0.15
the max threshold for the categorical variable drift score
max_allowed_numeric_score: float , default: 0.15
the max threshold for the numeric variable drift score
Returns
-------
ConditionResult
False if any distribution has passed the max threshold, True otherwise
"""

def condition(result: Dict) -> ConditionResult:
drift_score_dict = result['Drift score']
# Move to dict for easier looping
if not isinstance(drift_score_dict, dict):
drift_score_dict = {0: drift_score_dict}
method = result['Method']
has_failed = {}
drift_score = 0
for class_name, drift_score in drift_score_dict.items():
has_failed[class_name] = \
(drift_score >= max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
(drift_score >= max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)

if len(has_failed) == 1:
details = f'Found model prediction {method} drift score of {format_number(drift_score)}'
else:
details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \
f' score above threshold: {max_allowed_numeric_score}.'
category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS
return ConditionResult(category, details)

return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
f'numerical drift score < {max_allowed_numeric_score}',
condition)
return self._prediction_drift(train_prediction, test_prediction, context.model_classes, context.with_display,
proba_drift, not proba_drift)
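
The per-class threshold logic removed above is not gone; per the commit message it moves into the shared PredictionDriftAbstract. A minimal standalone sketch of that rule, where the method-name sets are placeholders for SUPPORTED_CATEGORICAL_METHODS and SUPPORTED_NUMERIC_METHODS and the function name is illustrative:

from typing import Dict, Union

# Placeholder sets; the real constants live in deepchecks.utils.distribution.drift.
CATEGORICAL_METHODS = {"Cramer's V", 'PSI'}
NUMERIC_METHODS = {'Kolmogorov-Smirnov', "Earth Mover's Distance"}

def drift_condition_passes(result: Dict, max_categorical: float = 0.15,
                           max_numeric: float = 0.15) -> bool:
    """Pass only when every (per-class) drift score is under its method's threshold."""
    scores: Union[float, Dict] = result['Drift score']
    if not isinstance(scores, dict):
        scores = {0: scores}  # wrap a single score for uniform looping
    method = result['Method']
    return not any(
        (score >= max_categorical and method in CATEGORICAL_METHODS) or
        (score >= max_numeric and method in NUMERIC_METHODS)
        for score in scores.values()
    )
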
125 changes: 28 additions & 97 deletions deepchecks/nlp/checks/train_test_validation/label_drift.py
@@ -8,24 +8,17 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module contains Label Drift check."""

from typing import Dict

import pandas as pd
"""Module contains Label Drift check."""

from deepchecks.core import CheckResult, ConditionResult
from deepchecks.core.condition import ConditionCategory
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.core import CheckResult
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.utils.distribution.drift import (SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS,
calc_drift_and_plot, get_drift_plot_sidenote)
from deepchecks.utils.strings import format_number
from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract

__all__ = ['LabelDrift']


class LabelDrift(TrainTestCheck):
class LabelDrift(TrainTestCheck, LabelDriftAbstract):
"""
Calculate label drift between train dataset and test dataset, using statistical measures.
@@ -41,15 +34,17 @@ class LabelDrift(TrainTestCheck):
small number of samples (common practice is categories with less than 5 samples).
However, in cases of a variable with many categories with few samples, it is still recommended to use Cramer's V.
**Note:** In case of highly imbalanced classes, it is recommended to use Cramer's V, together with setting
the ``balance_classes`` parameter to ``True``.
Parameters
----------
min_category_size_ratio: float, default 0.01
minimum size ratio for categories. Categories with size ratio lower than this number are binned
into an "Other" category.
into an "Other" category. Ignored if balance_classes=True.
max_num_categories_for_drift: int, default: None
Max number of allowed categories. If there are more,
they are binned into an "Other" category. This limit applies for both drift calculation and distribution plots
Only for classification. Max number of allowed categories. If there are more,
they are binned into an "Other" category.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'largest_difference'
@@ -58,23 +53,24 @@
- 'train_largest': Show the largest train categories.
- 'test_largest': Show the largest test categories.
- 'largest_difference': Show the largest difference between categories.
numerical_drift_method: str, default: "KS"
decides which method to use on numerical variables. Possible values are:
"EMD" for Earth Mover's Distance (EMD), "KS" for Kolmogorov-Smirnov (KS).
categorical_drift_method: str, default: "cramers_v"
decides which method to use on categorical variables. Possible values are:
"cramers_v" for Cramer's V, "PSI" for Population Stability Index (PSI).
balance_classes: bool, default: False
If True, all categories will have an equal weight in the Cramer's V score. This is useful when the categorical
variable is highly imbalanced, and we want to be alerted on changes in proportion to the category size,
and not only to the entire dataset. Must have categorical_drift_method = "cramers_v" and
drift_mode = "auto" or "prediction".
and not only to the entire dataset. Must have categorical_drift_method = "cramers_v".
If True, the variable frequency plot will be created with a log scale on the y-axis.
ignore_na: bool, default True
ignore_na: bool, default False
For categorical columns only. If True, ignores None values when computing categorical drift. If False, considers
None a separate category. For numerical columns, None values are always ignored.
min_samples : int , default: 10
Minimum number of samples required to calculate the drift score. If there are not enough samples for either
train or test, the check will raise a ``NotEnoughSamplesError`` exception.
n_samples : int , default: 100_000
Number of samples to use for drift computation and plot.
random_state : int , default: 42
Random seed for sampling.
"""

def __init__(
@@ -83,104 +79,39 @@ def __init__(
min_category_size_ratio: float = 0.01,
max_num_categories_for_display: int = 10,
show_categories_by: str = 'largest_difference',
numerical_drift_method: str = 'KS',
categorical_drift_method: str = 'cramers_v',
balance_classes: bool = False,
ignore_na: bool = True,
ignore_na: bool = False,
min_samples: int = 10,
n_samples: int = 100_000,
random_state: int = 42,
**kwargs
):
if show_categories_by not in ('train_largest', 'test_largest', 'largest_difference'):
raise DeepchecksValueError(
'show_categories_by must be one of "train_largest", "test_largest", "largest_difference"')
super().__init__(**kwargs)
# self.margin_quantile_filter = margin_quantile_filter
self.max_num_categories_for_drift = max_num_categories_for_drift
self.min_category_size_ratio = min_category_size_ratio
self.max_num_categories_for_display = max_num_categories_for_display
self.show_categories_by = show_categories_by
self.numerical_drift_method = numerical_drift_method
# self.numerical_drift_method = numerical_drift_method
self.categorical_drift_method = categorical_drift_method
self.balance_classes = balance_classes
self.ignore_na = ignore_na
self.min_samples = min_samples
self.n_samples = n_samples
self.random_state = random_state

def run_logic(self, context: Context) -> CheckResult:
"""Calculate drift for the label.
"""Calculate drift for all columns.
Returns
-------
CheckResult
value: drift score.
display: label distribution graph, comparing the train and test distributions.
"""
context.raise_if_token_classification_task(self)
context.raise_if_multi_label_task(self)

train_dataset = context.train.sample(self.n_samples, random_state=context.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=context.random_state)

drift_score, method, display = calc_drift_and_plot(
train_column=pd.Series(train_dataset.label),
test_column=pd.Series(test_dataset.label),
value_name='Label',
column_type='categorical',
max_num_categories_for_drift=self.max_num_categories_for_drift,
min_category_size_ratio=self.min_category_size_ratio,
max_num_categories_for_display=self.max_num_categories_for_display,
show_categories_by=self.show_categories_by,
numerical_drift_method=self.numerical_drift_method,
categorical_drift_method=self.categorical_drift_method,
balance_classes=self.balance_classes,
ignore_na=self.ignore_na,
with_display=context.with_display,
dataset_names=(train_dataset.name, test_dataset.name)
)

values_dict = {'Drift score': drift_score, 'Method': method}

if context.with_display:
displays = ["""<span>
The Drift score is a measure for the difference between two distributions, in this check - the test
and train distributions.<br> The check shows the drift score and distributions for the label.
</span>""", get_drift_plot_sidenote(self.max_num_categories_for_display, self.show_categories_by), display]
else:
displays = None

return CheckResult(value=values_dict, display=displays, header='Train Test Label Drift')

def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
max_allowed_numeric_score: float = 0.15):
"""
Add condition - require drift score to be less than the threshold.
The industry-standard limit for PSI is 0.2; scores above it are generally taken to indicate significant drift.
There are no common industry standards for other drift methods, such as Cramer's V,
Kolmogorov-Smirnov and Earth Mover's Distance.
The threshold was lowered by 25% compared to property drift defaults due to the higher importance of label drift.
Parameters
----------
max_allowed_categorical_score: float , default: 0.15
the max threshold for the categorical variable drift score
max_allowed_numeric_score: float , default: 0.15
the max threshold for the numeric variable drift score
Returns
-------
ConditionResult
False if the label's drift score has passed the max threshold, True otherwise
"""

def condition(result: Dict) -> ConditionResult:
drift_score = result['Drift score']
method = result['Method']
has_failed = (drift_score > max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
(drift_score > max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)

details = f'Label\'s drift score {method} is {format_number(drift_score)}'
category = ConditionCategory.FAIL if has_failed else ConditionCategory.PASS
return ConditionResult(category, details)
train_dataset = context.train.sample(self.n_samples, random_state=self.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=self.random_state)

return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
f'numerical drift score < {max_allowed_numeric_score} for label drift',
condition)
return self._calculate_label_drift(train_dataset.label.flatten(), test_dataset.label.flatten(), 'Label',
'categorical', context.with_display, (train_dataset.name, test_dataset.name))
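
The docstring above weighs Cramer's V against PSI for exactly this kind of label comparison. As a rough illustration only (deepchecks' internal versions add category binning, optional class balancing and minimum-sample handling), the two measures over a pair of label arrays could be sketched as:

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def psi(train_labels, test_labels, eps: float = 1e-6) -> float:
    """Population Stability Index between two categorical distributions."""
    expected = pd.Series(train_labels).value_counts(normalize=True)
    actual = pd.Series(test_labels).value_counts(normalize=True)
    idx = expected.index.union(actual.index)
    e = expected.reindex(idx, fill_value=0) + eps  # eps guards categories absent on one side
    a = actual.reindex(idx, fill_value=0) + eps
    return float(np.sum((a - e) * np.log(a / e)))

def cramers_v(train_labels, test_labels) -> float:
    """Cramer's V on a label-by-dataset contingency table (no bias correction)."""
    labels = np.concatenate([np.asarray(train_labels), np.asarray(test_labels)])
    source = np.array(['train'] * len(train_labels) + ['test'] * len(test_labels))
    table = pd.crosstab(labels, source)
    chi2 = chi2_contingency(table)[0]
    n = table.to_numpy().sum()
    k = min(table.shape) - 1
    return float(np.sqrt(chi2 / (n * k))) if k > 0 else 0.0

This also hints at why the docstring recommends Cramer's V for rare categories: PSI's log-ratio term blows up for near-empty bins, while the chi-squared statistic behind Cramer's V degrades more gracefully.
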
56 changes: 3 additions & 53 deletions deepchecks/tabular/checks/model_evaluation/prediction_drift.py
@@ -15,14 +15,12 @@

import numpy as np

from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.core.reduce_classes import ReduceMixin
from deepchecks.tabular import Context, TrainTestCheck
from deepchecks.tabular.utils.task_type import TaskType
from deepchecks.utils.abstracts.prediction_drift import PredictionDriftAbstract
from deepchecks.utils.distribution.drift import SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS
from deepchecks.utils.strings import format_number

__all__ = ['PredictionDrift']

@@ -193,8 +191,8 @@ def run_logic(self, context: Context) -> CheckResult:
train_pred = np.array(model.predict(train_dataset.features_columns)).reshape((-1, 1))
test_pred = np.array(model.predict(test_dataset.features_columns)).reshape((-1, 1))

return self.prediction_drift(train_pred, test_pred, context.model_classes, context.with_display, proba_drift,
(context.task_type != TaskType.REGRESSION) and (not proba_drift))
return self._prediction_drift(train_pred, test_pred, context.model_classes, context.with_display, proba_drift,
(context.task_type != TaskType.REGRESSION) and (not proba_drift))

def reduce_output(self, check_result: CheckResult) -> t.Dict[str, float]:
"""Return prediction drift score."""
@@ -217,51 +215,3 @@ def reduce_output(self, check_result: CheckResult) -> t.Dict[str, float]:
def greater_is_better(self):
"""Return True if the check reduce_output is better when it is greater."""
return False

def add_condition_drift_score_less_than(self, max_allowed_categorical_score: float = 0.15,
max_allowed_numeric_score: float = 0.15):
"""
Add condition - require drift score to be less than a certain threshold.
The industry-standard limit for PSI is 0.2; scores above it are generally taken to indicate significant drift.
There are no common industry standards for other drift methods, such as Cramer's V,
Kolmogorov-Smirnov and Earth Mover's Distance.
The threshold was lowered by 25% compared to feature drift defaults due to the higher importance of prediction
drift.
Parameters
----------
max_allowed_categorical_score: float , default: 0.15
the max threshold for the categorical variable drift score
max_allowed_numeric_score: float , default: 0.15
the max threshold for the numeric variable drift score
Returns
-------
ConditionResult
False if any distribution has passed the max threshold, True otherwise
"""

def condition(result: t.Dict) -> ConditionResult:
drift_score_dict = result['Drift score']
# Move to dict for easier looping
if not isinstance(drift_score_dict, dict):
drift_score_dict = {0: drift_score_dict}
method = result['Method']
has_failed = {}
drift_score = 0
for class_name, drift_score in drift_score_dict.items():
has_failed[class_name] = \
(drift_score >= max_allowed_categorical_score and method in SUPPORTED_CATEGORICAL_METHODS) or \
(drift_score >= max_allowed_numeric_score and method in SUPPORTED_NUMERIC_METHODS)

if len(has_failed) == 1:
details = f'Found model prediction {method} drift score of {format_number(drift_score)}'
else:
details = f'Found {sum(has_failed.values())} classes with model predicted probability {method} drift' \
f' score above threshold: {max_allowed_numeric_score}.'
category = ConditionCategory.FAIL if any(has_failed.values()) else ConditionCategory.PASS
return ConditionResult(category, details)

return self.add_condition(f'categorical drift score < {max_allowed_categorical_score} and '
f'numerical drift score < {max_allowed_numeric_score}',
condition)
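
For context, a hypothetical end-to-end use of the tabular check after this refactor, assuming the condition method keeps its name on the shared abstract as the commit message's "abstraction" suggests; the frames, feature names and model below are placeholders, not taken from the diff:

from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import PredictionDrift

# train_df, test_df and fitted_model are stand-ins for your own data and model.
train_ds = Dataset(train_df, label='target', cat_features=['city'])
test_ds = Dataset(test_df, label='target', cat_features=['city'])

check = PredictionDrift(drift_mode='proba').add_condition_drift_score_less_than(
    max_allowed_categorical_score=0.15, max_allowed_numeric_score=0.15)
result = check.run(train_ds, test_ds, model=fitted_model)
print(result.value)  # e.g. {'Drift score': {...}, 'Method': ...}
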
