Skip to content

Commit

Permalink
Rename single feature contribution to feature label correlation 1399 (#1498)
Browse files Browse the repository at this point in the history

* renamed single feature contribution to Feature Label Correlation

* pylint
  • Loading branch information
nirhutnik committed May 24, 2022
1 parent d9029eb commit 067170e
Show file tree
Hide file tree
Showing 26 changed files with 215 additions and 224 deletions.
4 changes: 2 additions & 2 deletions deepchecks/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
# methodology checks
'BoostingOverfit',
'UnusedFeatures',
'SingleFeatureContribution',
'SingleFeatureContributionTrainTest',
'FeatureLabelCorrelation',
'FeatureLabelCorrelationChange',
'IndexTrainTestLeakage',
'TrainTestSamplesMix',
'DateTrainTestLeakageDuplicates',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module containing common SingleFeatureContribution (PPS) utils."""
"""Module containing common feature label correlation (PPS) utils."""
from typing import Optional

import numpy as np
Expand Down Expand Up @@ -67,14 +67,14 @@ def pd_series_to_trace_with_diff(s_pps: pd.Series, name: str, diffs: pd.Series):
)


def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Optional[Hashable],
test_df: pd.DataFrame,
test_label_name: Optional[Hashable], ppscore_params: dict,
n_show_top: int,
min_pps_to_show: float = 0.05,
random_state: int = None):
def get_feature_label_correlation(train_df: pd.DataFrame, train_label_name: Optional[Hashable],
test_df: pd.DataFrame,
test_label_name: Optional[Hashable], ppscore_params: dict,
n_show_top: int,
min_pps_to_show: float = 0.05,
random_state: int = None):
"""
Calculate the PPS for train, test and difference for single feature contribution checks.
Calculate the PPS for train, test and difference for feature label correlation checks.
The PPS represents the ability of a feature to single-handedly predict another feature or label.
This function calculates the PPS per feature for both train and test, and returns the data and display graph.
Expand Down Expand Up @@ -133,14 +133,14 @@ def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Op
return ret_value, display


def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_label_name: Optional[Hashable],
test_df: pd.DataFrame,
test_label_name: Optional[Hashable], ppscore_params: dict,
n_show_top: int,
min_pps_to_show: float = 0.05,
random_state: int = None):
def get_feature_label_correlation_per_class(train_df: pd.DataFrame, train_label_name: Optional[Hashable],
test_df: pd.DataFrame,
test_label_name: Optional[Hashable], ppscore_params: dict,
n_show_top: int,
min_pps_to_show: float = 0.05,
random_state: int = None):
"""
Calculate the PPS for train, test and difference for single feature contribution checks per class.
Calculate the PPS for train, test and difference for feature label correlation checks per class.
The PPS represents the ability of a feature to single-handedly predict another feature or label.
This function calculates the PPS per feature for both train and test, and returns the data and display graph.
Expand Down
12 changes: 6 additions & 6 deletions deepchecks/tabular/checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@
# ----------------------------------------------------------------------------
#
"""Module importing all tabular checks."""
from .data_integrity import (ColumnsInfo, ConflictingLabels, DataDuplicates, IsSingleValue, MixedDataTypes, MixedNulls,
OutlierSampleDetection, SingleFeatureContribution, SpecialCharacters,
from .data_integrity import (ColumnsInfo, ConflictingLabels, DataDuplicates, FeatureLabelCorrelation, IsSingleValue,
MixedDataTypes, MixedNulls, OutlierSampleDetection, SpecialCharacters,
StringLengthOutOfBounds, StringMismatch)
from .model_evaluation import (BoostingOverfit, CalibrationScore, ConfusionMatrixReport, ModelErrorAnalysis,
ModelInferenceTime, ModelInfo, MultiModelPerformanceReport, PerformanceReport,
RegressionErrorDistribution, RegressionSystematicError, RocReport, SegmentPerformance,
SimpleModelComparison, TrainTestPredictionDrift, UnusedFeatures)
from .train_test_validation import (CategoryMismatchTrainTest, DatasetsSizeComparison, DateTrainTestLeakageDuplicates,
DateTrainTestLeakageOverlap, DominantFrequencyChange, IdentifierLeakage,
IndexTrainTestLeakage, NewLabelTrainTest, SingleFeatureContributionTrainTest,
DateTrainTestLeakageOverlap, DominantFrequencyChange, FeatureLabelCorrelationChange,
IdentifierLeakage, IndexTrainTestLeakage, NewLabelTrainTest,
StringMismatchComparison, TrainTestFeatureDrift, TrainTestLabelDrift,
TrainTestSamplesMix, WholeDatasetDrift)

Expand All @@ -41,8 +41,8 @@
# methodology checks
'BoostingOverfit',
'UnusedFeatures',
'SingleFeatureContribution',
'SingleFeatureContributionTrainTest',
'FeatureLabelCorrelation',
'FeatureLabelCorrelationChange',
'IndexTrainTestLeakage',
'TrainTestSamplesMix',
'DateTrainTestLeakageDuplicates',
Expand Down
4 changes: 2 additions & 2 deletions deepchecks/tabular/checks/data_integrity/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
from .columns_info import ColumnsInfo
from .conflicting_labels import ConflictingLabels
from .data_duplicates import DataDuplicates
from .feature_label_correlation import FeatureLabelCorrelation
from .is_single_value import IsSingleValue
from .mixed_data_types import MixedDataTypes
from .mixed_nulls import MixedNulls
from .outlier_sample_detection import OutlierSampleDetection
from .single_feature_contribution import SingleFeatureContribution
from .special_chars import SpecialCharacters
from .string_length_out_of_bounds import StringLengthOutOfBounds
from .string_mismatch import StringMismatch
Expand All @@ -32,5 +32,5 @@
'DataDuplicates',
'ConflictingLabels',
'OutlierSampleDetection',
'SingleFeatureContribution',
'FeatureLabelCorrelation',
]
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,30 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""The single_feature_contribution check module."""
"""The feature label correlation check module."""
import typing as t

import deepchecks.ppscore as pps
from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.core.check_utils.single_feature_contribution_utils import get_pps_figure, pd_series_to_trace
from deepchecks.core.check_utils.feature_label_correlation_utils import get_pps_figure, pd_series_to_trace
from deepchecks.tabular import Context, SingleDatasetCheck
from deepchecks.tabular.utils.messages import get_condition_passed_message
from deepchecks.utils.strings import format_number
from deepchecks.utils.typing import Hashable

__all__ = ['SingleFeatureContribution']
__all__ = ['FeatureLabelCorrelation']


FC = t.TypeVar('FC', bound='SingleFeatureContribution')
FLC = t.TypeVar('FLC', bound='FeatureLabelCorrelation')


pps_url = 'https://docs.deepchecks.com/en/stable/examples/tabular/' \
'checks/methodology/single_feature_contribution_train_test' \
'checks/train_test_validation/feature_label_correlation_change' \
'.html?utm_source=display_output&utm_medium=referral&utm_campaign=check_link'
pps_html = f'<a href={pps_url} target="_blank">Predictive Power Score</a>'


class SingleFeatureContribution(SingleDatasetCheck):
class FeatureLabelCorrelation(SingleDatasetCheck):
"""Return the PPS (Predictive Power Score) of all features in relation to the label.
The PPS represents the ability of a feature to single-handedly predict another feature or label.
Expand Down Expand Up @@ -103,9 +103,9 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResul
# display only if not all scores are 0
display = [fig, *text] if s_ppscore.sum() else None

return CheckResult(value=s_ppscore.to_dict(), display=display, header='Single Feature Contribution')
return CheckResult(value=s_ppscore.to_dict(), display=display, header='Feature Label Correlation')

def add_condition_feature_pps_not_greater_than(self: FC, threshold: float = 0.8) -> FC:
def add_condition_feature_pps_not_greater_than(self: FLC, threshold: float = 0.8) -> FLC:
"""
Add condition that will check that pps of the specified feature(s) is not greater than X.
Expand All @@ -115,7 +115,7 @@ def add_condition_feature_pps_not_greater_than(self: FC, threshold: float = 0.8)
pps upper bound
Returns
-------
FC
FLC
"""
def condition(value: t.Dict[Hashable, float]) -> ConditionResult:
failed_features = {
Expand Down
10 changes: 5 additions & 5 deletions deepchecks/tabular/checks/methodology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@
"""
import warnings

from ..data_integrity import SingleFeatureContribution
from ..data_integrity import FeatureLabelCorrelation
from ..model_evaluation import BoostingOverfit, ModelInferenceTime, UnusedFeatures
from ..train_test_validation import (DatasetsSizeComparison, DateTrainTestLeakageDuplicates,
DateTrainTestLeakageOverlap, IdentifierLeakage, IndexTrainTestLeakage,
SingleFeatureContributionTrainTest, TrainTestSamplesMix)
DateTrainTestLeakageOverlap, FeatureLabelCorrelationChange, IdentifierLeakage,
IndexTrainTestLeakage, TrainTestSamplesMix)

__all__ = [
'BoostingOverfit',
'UnusedFeatures',
'SingleFeatureContribution',
'SingleFeatureContributionTrainTest',
'FeatureLabelCorrelation',
'FeatureLabelCorrelationChange',
'IndexTrainTestLeakage',
'TrainTestSamplesMix',
'DateTrainTestLeakageDuplicates',
Expand Down
4 changes: 2 additions & 2 deletions deepchecks/tabular/checks/train_test_validation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
from .date_train_test_leakage_duplicates import DateTrainTestLeakageDuplicates
from .date_train_test_leakage_overlap import DateTrainTestLeakageOverlap
from .dominant_frequency_change import DominantFrequencyChange
from .feature_label_correlation_change import FeatureLabelCorrelationChange
from .identifier_leakage import IdentifierLeakage
from .index_leakage import IndexTrainTestLeakage
from .new_label_train_test import NewLabelTrainTest
from .single_feature_contribution_train_test import SingleFeatureContributionTrainTest
from .string_mismatch_comparison import StringMismatchComparison
from .train_test_feature_drift import TrainTestFeatureDrift
from .train_test_label_drift import TrainTestLabelDrift
Expand All @@ -34,7 +34,7 @@
'IdentifierLeakage',
'IndexTrainTestLeakage',
'NewLabelTrainTest',
'SingleFeatureContributionTrainTest',
'FeatureLabelCorrelationChange',
'StringMismatchComparison',
'TrainTestFeatureDrift',
'TrainTestLabelDrift',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,31 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""The single_feature_contribution check module."""
"""The feature label correlation change check module."""
import typing as t
from copy import copy

import numpy as np

from deepchecks.core import CheckResult, ConditionResult
from deepchecks.core.check_utils.single_feature_contribution_utils import get_single_feature_contribution
from deepchecks.core.check_utils.feature_label_correlation_utils import get_feature_label_correlation
from deepchecks.core.condition import ConditionCategory
from deepchecks.tabular import Context, TrainTestCheck
from deepchecks.tabular.utils.messages import get_condition_passed_message
from deepchecks.utils.strings import format_number
from deepchecks.utils.typing import Hashable

__all__ = ['SingleFeatureContributionTrainTest']
__all__ = ['FeatureLabelCorrelationChange']

FC = t.TypeVar('FC', bound='SingleFeatureContributionTrainTest')
FLC = t.TypeVar('FLC', bound='FeatureLabelCorrelationChange')

pps_url = 'https://docs.deepchecks.com/en/stable/examples/tabular/' \
'checks/methodology/single_feature_contribution_train_test' \
'checks/train_test_validation/feature_label_correlation_change' \
'.html?utm_source=display_output&utm_medium=referral&utm_campaign=check_link'
pps_html = f'<a href={pps_url} target="_blank">Predictive Power Score</a>'


class SingleFeatureContributionTrainTest(TrainTestCheck):
class FeatureLabelCorrelationChange(TrainTestCheck):
"""
Return the Predictive Power Score of all features, in order to estimate each feature's ability to predict the label.
Expand Down Expand Up @@ -109,21 +109,21 @@ def run_logic(self, context: Context) -> CheckResult:
'the target label.'
]

ret_value, display = get_single_feature_contribution(train_dataset.data[relevant_columns],
train_dataset.label_name,
test_dataset.data[relevant_columns],
test_dataset.label_name, self.ppscore_params,
self.n_top_features,
min_pps_to_show=self.min_pps_to_show,
random_state=self.random_state)
ret_value, display = get_feature_label_correlation(train_dataset.data[relevant_columns],
train_dataset.label_name,
test_dataset.data[relevant_columns],
test_dataset.label_name, self.ppscore_params,
self.n_top_features,
min_pps_to_show=self.min_pps_to_show,
random_state=self.random_state)

if display:
display += text

return CheckResult(value=ret_value, display=display, header='Single Feature Contribution Train-Test')
return CheckResult(value=ret_value, display=display, header='Feature Label Correlation Change')

def add_condition_feature_pps_difference_not_greater_than(self: FC, threshold: float = 0.2,
include_negative_diff: bool = True) -> FC:
def add_condition_feature_pps_difference_not_greater_than(self: FLC, threshold: float = 0.2,
include_negative_diff: bool = True) -> FLC:
"""Add new condition.
Add condition that will check that difference between train
Expand Down Expand Up @@ -163,7 +163,7 @@ def condition(value: t.Dict[Hashable, t.Dict[Hashable, float]]) -> ConditionResu
return self.add_condition(f'Train-Test features\' Predictive Power Score difference is not greater than '
f'{format_number(threshold)}', condition)

def add_condition_feature_pps_in_train_not_greater_than(self: FC, threshold: float = 0.7) -> FC:
def add_condition_feature_pps_in_train_not_greater_than(self: FLC, threshold: float = 0.7) -> FLC:
"""Add new condition.
Add condition that will check that train dataset feature pps is not greater than X.
Expand All @@ -175,7 +175,7 @@ def add_condition_feature_pps_in_train_not_greater_than(self: FC, threshold: flo
Returns
-------
FC
FLC
"""

def condition(value: t.Dict[Hashable, t.Dict[Hashable, float]]) -> ConditionResult:
Expand Down
21 changes: 10 additions & 11 deletions deepchecks/tabular/suites/default_suites.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@
from deepchecks.tabular.checks import (BoostingOverfit, CalibrationScore, CategoryMismatchTrainTest, ConflictingLabels,
ConfusionMatrixReport, DataDuplicates, DatasetsSizeComparison,
DateTrainTestLeakageDuplicates, DateTrainTestLeakageOverlap,
DominantFrequencyChange, IdentifierLeakage, IndexTrainTestLeakage, IsSingleValue,
MixedDataTypes, MixedNulls, ModelErrorAnalysis, ModelInferenceTime,
NewLabelTrainTest, OutlierSampleDetection, PerformanceReport,
RegressionErrorDistribution, RegressionSystematicError, RocReport,
SegmentPerformance, SimpleModelComparison, SingleFeatureContribution,
SingleFeatureContributionTrainTest, SpecialCharacters, StringLengthOutOfBounds,
StringMismatch, StringMismatchComparison, TrainTestFeatureDrift,
TrainTestLabelDrift, TrainTestPredictionDrift, TrainTestSamplesMix,
UnusedFeatures, WholeDatasetDrift)
DominantFrequencyChange, FeatureLabelCorrelation, FeatureLabelCorrelationChange,
IdentifierLeakage, IndexTrainTestLeakage, IsSingleValue, MixedDataTypes,
MixedNulls, ModelErrorAnalysis, ModelInferenceTime, NewLabelTrainTest,
OutlierSampleDetection, PerformanceReport, RegressionErrorDistribution,
RegressionSystematicError, RocReport, SegmentPerformance, SimpleModelComparison,
SpecialCharacters, StringLengthOutOfBounds, StringMismatch,
StringMismatchComparison, TrainTestFeatureDrift, TrainTestLabelDrift,
TrainTestPredictionDrift, TrainTestSamplesMix, UnusedFeatures, WholeDatasetDrift)

__all__ = ['single_dataset_integrity', 'train_test_leakage', 'train_test_validation',
'model_evaluation', 'full_suite']
Expand Down Expand Up @@ -61,7 +60,7 @@ def data_integrity() -> Suite:
StringLengthOutOfBounds().add_condition_ratio_of_outliers_not_greater_than(),
ConflictingLabels().add_condition_ratio_of_conflicting_labels_not_greater_than(),
OutlierSampleDetection(),
SingleFeatureContribution().add_condition_feature_pps_not_greater_than()
FeatureLabelCorrelation().add_condition_feature_pps_not_greater_than()
)


Expand Down Expand Up @@ -95,7 +94,7 @@ def train_test_validation() -> Suite:
IndexTrainTestLeakage().add_condition_ratio_not_greater_than(),
IdentifierLeakage().add_condition_pps_not_greater_than(),
TrainTestSamplesMix().add_condition_duplicates_ratio_not_greater_than(),
SingleFeatureContributionTrainTest().add_condition_feature_pps_difference_not_greater_than()
FeatureLabelCorrelationChange().add_condition_feature_pps_difference_not_greater_than()
.add_condition_feature_pps_in_train_not_greater_than(),
TrainTestFeatureDrift().add_condition_drift_score_not_greater_than(),
TrainTestLabelDrift().add_condition_drift_score_not_greater_than(),
Expand Down

0 comments on commit 067170e

Please sign in to comment.