Commit

Improve sfc 1399 (#1478)
* Added a parameter to the condition in single_feature_contribution_train_test.py

* Added the parameter to the diff condition in the simple feature contribution check, along with tests that were missing

* Changed the plots for single feature contribution train-test and simple feature contribution:
removed the purple line and added the percentage of change in parentheses (made bold).
Also added a parameter to format_percent to optionally show '+' as a prefix

* Fixed embarrassing mistake

* isort

* Fixed missing minus signs
nirhutnik committed May 22, 2022
1 parent 0a7d474 commit 3942d0e
Showing 7 changed files with 263 additions and 47 deletions.
52 changes: 30 additions & 22 deletions deepchecks/core/check_utils/single_feature_contribution_utils.py
@@ -17,6 +17,7 @@

import deepchecks.ppscore as pps
from deepchecks.utils.plot import colors
from deepchecks.utils.strings import format_percent
from deepchecks.utils.typing import Hashable


@@ -40,14 +41,28 @@ def get_pps_figure(per_class: bool):
return fig


def pps_df_to_trace(s_pps: pd.Series, name: str):
"""If name is train/test use our defined colors, else will use plotly defaults."""
def pd_series_to_trace(s_pps: pd.Series, name: str):
"""Create bar plotly bar trace out of pandas Series."""
name = name.capitalize() if name else None
return go.Bar(x=s_pps.index,
y=s_pps,
name=name,
marker_color=colors.get(name),
text=s_pps.round(2),
text='<b>' + s_pps.round(2).astype(str) + '</b>',
textposition='outside'
)


def pd_series_to_trace_with_diff(s_pps: pd.Series, name: str, diffs: pd.Series):
"""Create bar plotly bar trace out of pandas Series, with difference shown in percentages."""
diffs_text = '(' + diffs.apply(format_percent, floating_point=0, add_positive_prefix=True) + ')'
text = diffs_text + '<br>' + s_pps.round(2).astype(str)
name = name.capitalize() if name else None
return go.Bar(x=s_pps.index,
y=s_pps,
name=name,
marker_color=colors.get(name),
text='<b>' + text + '</b>',
textposition='outside'
)
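
For illustration, a minimal sketch of how the new trace text is assembled. The format_percent below is a hypothetical stand-in that only mimics the floating_point=0, add_positive_prefix=True behavior used above, not the real deepchecks implementation:

import pandas as pd

def format_percent(ratio, floating_point=0, add_positive_prefix=True):
    # hypothetical stand-in for deepchecks.utils.strings.format_percent
    prefix = '+' if add_positive_prefix and ratio > 0 else ''
    return f'{prefix}{ratio * 100:.{floating_point}f}%'

s_pps = pd.Series([0.84, 0.12], index=['x2', 'x3'])   # made-up test PPS per feature
diffs = pd.Series([-0.31, 0.05], index=['x2', 'x3'])  # made-up test-minus-train PPS

diffs_text = '(' + diffs.apply(format_percent) + ')'
text = diffs_text + '<br>' + s_pps.round(2).astype(str)
print(text.tolist())  # ['(-31%)<br>0.84', '(+5%)<br>0.12']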

@@ -100,21 +115,14 @@ def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Op
s_pps_test = df_pps_test.set_index('x', drop=True)['ppscore']
s_difference = s_pps_train - s_pps_test

s_difference_to_display = np.abs(s_difference).sort_values(ascending=False).head(n_show_top)

s_pps_train_to_display = s_pps_train[s_difference_to_display.index]
s_pps_test_to_display = s_pps_test[s_difference_to_display.index]
sorted_order_for_display = np.abs(s_difference).sort_values(ascending=False).head(n_show_top).index
s_pps_train_to_display = s_pps_train[sorted_order_for_display]
s_pps_test_to_display = s_pps_test[sorted_order_for_display]
s_difference_to_display = s_difference[sorted_order_for_display]

fig = get_pps_figure(per_class=False)
fig.add_trace(pps_df_to_trace(s_pps_train_to_display, 'train'))
fig.add_trace(pps_df_to_trace(s_pps_test_to_display, 'test'))
fig.add_trace(go.Scatter(x=s_difference_to_display.index,
y=s_difference_to_display,
name='Train-Test Difference (abs)',
marker=dict(symbol='circle', size=15),
line=dict(color='#aa57b5', width=5),
text=s_difference_to_display.round(2)
))
fig.add_trace(pd_series_to_trace(s_pps_train_to_display, 'train'))
fig.add_trace(pd_series_to_trace_with_diff(s_pps_test_to_display, 'test', -s_difference_to_display))

ret_value = {'train': s_pps_train.to_dict(), 'test': s_pps_test.to_dict(),
'train-test difference': s_difference.to_dict()}
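
Note the sign convention in the calls above: s_difference is train PPS minus test PPS, so the test trace receives -s_difference_to_display (test minus train) and each annotated percentage reads as the change from train to test. For example, a train PPS of 0.80 and a test PPS of 0.49 would be annotated '(-31%)' on the test bar.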
@@ -203,16 +211,16 @@ def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_labe

# display only if not all scores are above min_pps_to_show
if any(s_train > min_pps_to_show) or any(s_test > min_pps_to_show):
s_difference_to_display = np.abs(s_difference).apply(lambda x: 0 if x < 0 else x)
s_difference_to_display = s_difference_to_display.sort_values(ascending=False).head(n_show_top)
sorted_order_for_display = np.abs(s_difference).sort_values(ascending=False).head(n_show_top).index

s_train_to_display = s_train[s_difference_to_display.index]
s_test_to_display = s_test[s_difference_to_display.index]
s_train_to_display = s_train[sorted_order_for_display]
s_test_to_display = s_test[sorted_order_for_display]
s_difference_to_display = s_difference[sorted_order_for_display]

fig = get_pps_figure(per_class=True)
fig.update_layout(title=f'{feature}: Predictive Power Score (PPS) Per Class')
fig.add_trace(pps_df_to_trace(s_train_to_display, 'train'))
fig.add_trace(pps_df_to_trace(s_test_to_display, 'test'))
fig.add_trace(pd_series_to_trace(s_train_to_display, 'train'))
fig.add_trace(pd_series_to_trace_with_diff(s_test_to_display, 'test', -s_difference_to_display))
display.append(fig)

return ret_value, display
@@ -14,7 +14,7 @@
import deepchecks.ppscore as pps
from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.core.check_utils.single_feature_contribution_utils import (
get_pps_figure, pps_df_to_trace)
get_pps_figure, pd_series_to_trace)
from deepchecks.tabular import Context, SingleDatasetCheck
from deepchecks.utils.strings import format_number
from deepchecks.utils.typing import Hashable
@@ -91,7 +91,7 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResul
top_to_show = s_ppscore.head(self.n_top_features)

fig = get_pps_figure(per_class=False)
fig.add_trace(pps_df_to_trace(top_to_show, dataset_type))
fig.add_trace(pd_series_to_trace(top_to_show, dataset_type))

text = [
'The Predictive Power Score (PPS) is used to estimate the ability of a feature to predict the '
@@ -10,6 +10,9 @@
#
"""The single_feature_contribution check module."""
import typing as t
from copy import copy

import numpy as np

from deepchecks.core import CheckResult, ConditionResult
from deepchecks.core.check_utils.single_feature_contribution_utils import \
@@ -119,22 +122,34 @@ def run_logic(self, context: Context) -> CheckResult:

return CheckResult(value=ret_value, display=display, header='Single Feature Contribution Train-Test')

def add_condition_feature_pps_difference_not_greater_than(self: FC, threshold: float = 0.2) -> FC:
def add_condition_feature_pps_difference_not_greater_than(self: FC, threshold: float = 0.2,
include_negative_diff: bool = True) -> FC:
"""Add new condition.
Add condition that will check that difference between train
dataset feature pps and test dataset feature pps is not greater than X.
Parameters
----------
threshold : float , default: 0.2
train test ps difference upper bound.
threshold: float, default: 0.2
train test pps difference upper bound.
include_negative_diff: bool, default True
This parameter decides whether the condition checks the absolute value of the difference, or just the
positive value.
The difference is calculated as train PPS minus test PPS. This is because we're interested in the case
where the test dataset is less predictive of the label than the train dataset, as this could indicate
leakage of labels into the train dataset.
"""

def condition(value: t.Dict[Hashable, t.Dict[Hashable, float]]) -> ConditionResult:

diff_dict = copy(value['train-test difference'])
if include_negative_diff is True:
diff_dict = {k: np.abs(v) for k, v in diff_dict.items()}

failed_features = {
feature_name: format_number(pps_diff)
for feature_name, pps_diff in value['train-test difference'].items()
for feature_name, pps_diff in diff_dict.items()
if pps_diff > threshold
}
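
A small sketch of what the condition compares under each setting, using hypothetical difference values:

diffs = {'x1': 0.25, 'x2': -0.30}  # hypothetical train-minus-test PPS per feature
threshold = 0.2

# include_negative_diff=True (the default here): absolute differences are compared
abs_diffs = {f: abs(d) for f, d in diffs.items()}
print({f: d for f, d in abs_diffs.items() if d > threshold})  # {'x1': 0.25, 'x2': 0.3}

# include_negative_diff=False: only a drop from train to test can fail the condition
print({f: d for f, d in diffs.items() if d > threshold})      # {'x1': 0.25}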

12 changes: 9 additions & 3 deletions deepchecks/utils/strings.py
@@ -412,15 +412,21 @@ def truncate_zero_percent(ratio: float, floating_point: int):
return f'{ratio * 100:.{floating_point}f}'.rstrip('0').rstrip('.') + '%'


def format_percent(ratio: float, floating_point: int = 2, scientific_notation_threshold: int = 4) -> str:
def format_percent(ratio: float, floating_point: int = 2, scientific_notation_threshold: int = 4,
add_positive_prefix: bool = False) -> str:
"""Format percent for elegant display.
Parameters
----------
ratio : float
Ratio to be displayed as percent
floating_point : int , default: 2
floating_point: int , default: 2
Number of floating points to display
scientific_notation_threshold: int, default: 4
Max number of floating points for which to show number as float. If number of floating points is larger than
this parameter, scientific notation (e.g. "10E-5%") will be shown.
add_positive_prefix: bool, default: False
Add a plus sign before positive percentages (a minus sign is always added for negative percentages).
Returns
-------
str
Expand All @@ -431,7 +437,7 @@ def format_percent(ratio: float, floating_point: int = 2, scientific_notation_th
ratio = -ratio
prefix = '-'
else:
prefix = ''
prefix = '+' if add_positive_prefix and ratio != 0 else ''

if int(ratio) == ratio:
result = f'{int(ratio) * 100}%'
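
Given the prefix logic above, the expected outputs would be roughly as follows (a sketch; the non-integer formatting path falls outside the visible hunk and is assumed):

format_percent(0.05, add_positive_prefix=True)   # '+5%'
format_percent(-0.05, add_positive_prefix=True)  # '-5%'
format_percent(0, add_positive_prefix=True)      # '0%' (zero gets no prefix)
format_percent(0.05)                             # '5%' (default keeps the old behavior)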
@@ -10,6 +10,7 @@
#
"""Module contains the simple feature distribution check."""
from collections import defaultdict
from copy import copy
from typing import Callable, Dict, Hashable, TypeVar, Union

import numpy as np
@@ -232,7 +233,8 @@ def is_float_column(col: pd.Series) -> bool:

return (col.round() != col).any()

def add_condition_feature_pps_difference_not_greater_than(self: SFC, threshold: float = 0.2) -> SFC:
def add_condition_feature_pps_difference_not_greater_than(self: SFC, threshold: float = 0.2,
include_negative_diff: bool = False) -> SFC:
"""Add new condition.
Add condition that will check that difference between train
@@ -244,27 +246,38 @@ def add_condition_feature_pps_difference_not_greater_than(self: SFC, threshold:
----------
threshold : float , default: 0.2
train test pps difference upper bound.
include_negative_diff: bool, default: False
This parameter decides whether the condition checks the absolute value of the difference, or just the
positive value.
The difference is calculated as train PPS minus test PPS. This is because we're interested in the case
where the test dataset is less predictive of the label than the train dataset, as this could indicate
leakage of labels into the train dataset.
Returns
-------
SFC
"""

def condition(value: Dict[Hashable, Dict[Hashable, float]]) -> ConditionResult:
def condition(value: Union[Dict[Hashable, Dict[Hashable, float]],
Dict[Hashable, Dict[Hashable, Dict[Hashable, float]]]],
) -> ConditionResult:

if self.per_class is True:
failed_features = {
feature_name: format_number(pps_value)
for feature_name, pps_value in
zip(value.keys(), [max(value[f]['train-test difference'].values()) for f in value.keys()])
if np.abs(pps_value) > threshold
}
diff_dict = {f: max(value[f]['train-test difference'].values()) for f in value.keys()}
if include_negative_diff is True:
diff_dict = {f: max(np.abs(value[f]['train-test difference'].values())) for f in value.keys()}

else:
failed_features = {
feature_name: format_number(pps_value)
for feature_name, pps_value in value['train-test difference'].items()
if np.abs(pps_value) > threshold
}
diff_dict = copy(value['train-test difference'])
if include_negative_diff is True:
diff_dict = {k: np.abs(v) for k, v in diff_dict.items()}

failed_features = {
feature_name: format_number(pps_value)
for feature_name, pps_value in diff_dict.items()
if pps_value > threshold
}

if failed_features:
message = f'Features with PPS difference above threshold: {failed_features}'
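
In the per_class branch above, each feature's per-class differences are reduced to a single number before the threshold comparison; a sketch with hypothetical values:

value = {'age': {'train-test difference': {'0': 0.05, '1': -0.40}}}  # hypothetical per-class diffs
per_class_diffs = value['age']['train-test difference']

max(per_class_diffs.values())                  # 0.05 -> used when include_negative_diff=False
max(abs(d) for d in per_class_diffs.values())  # 0.4  -> absolute-value variant

Note that np.abs expects an array-like, so applying it to dict values may require materializing them first (e.g. np.abs(list(...))).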
@@ -177,10 +177,49 @@ def test_all_features_pps_upper_bound_condition_that_should_pass():
))


def test_train_test_condition_pps_difference_pass():
def test_train_test_condition_pps_positive_difference_pass():
# Arrange
df, df2, expected = util_generate_second_similar_dataframe_and_expected()
condition_value = 0.4
check = SingleFeatureContributionTrainTest(random_state=42).\
add_condition_feature_pps_difference_not_greater_than(threshold=condition_value, include_negative_diff=False)

# Act
result = SingleFeatureContributionTrainTest(random_state=42).run(
train_dataset=Dataset(df, label='label'), test_dataset=Dataset(df2, label='label'))
condition_result, *_ = check.conditions_decision(result)

# Assert
assert_that(condition_result, equal_condition_result(
is_pass=True,
name=f'Train-Test features\' Predictive Power Score difference is not greater than {condition_value}'
))


def test_train_test_condition_pps_positive_difference_fail():
# Arrange
df, df2, expected = util_generate_second_similar_dataframe_and_expected()
condition_value = 0.01
check = SingleFeatureContributionTrainTest(random_state=42).\
add_condition_feature_pps_difference_not_greater_than(condition_value, include_negative_diff=False)

# Act
result = SingleFeatureContributionTrainTest(random_state=42).run(train_dataset=Dataset(df, label='label'),
test_dataset=Dataset(df2, label='label'))
condition_result, *_ = check.conditions_decision(result)

# Assert
assert_that(condition_result, equal_condition_result(
is_pass=False,
name=f'Train-Test features\' Predictive Power Score difference is not greater than {condition_value}',
details='Features with PPS difference above threshold: {\'x2\': \'0.31\'}'
))


def test_train_test_condition_pps_difference_pass():
# Arrange
df, df2, expected = util_generate_second_similar_dataframe_and_expected()
condition_value = 0.6
check = SingleFeatureContributionTrainTest(random_state=42
).add_condition_feature_pps_difference_not_greater_than(condition_value)

@@ -199,7 +238,7 @@ def test_train_test_condition_pps_difference_pass():
def test_train_test_condition_pps_difference_fail():
# Arrange
df, df2, expected = util_generate_second_similar_dataframe_and_expected()
condition_value = 0.01
condition_value = 0.4
check = SingleFeatureContributionTrainTest(random_state=42
).add_condition_feature_pps_difference_not_greater_than(condition_value)

@@ -212,7 +251,7 @@ assert_that(condition_result, equal_condition_result(
assert_that(condition_result, equal_condition_result(
is_pass=False,
name=f'Train-Test features\' Predictive Power Score difference is not greater than {condition_value}',
details='Features with PPS difference above threshold: {\'x2\': \'0.31\'}'
details='Features with PPS difference above threshold: {\'x3\': \'0.54\'}'
))

