Rename single feature contribution to feature label correlation 1399 (#1498)

* renamed single feature contribution to Feature Label Correlation
* pylint
deepchecks · May 24, 2022 · 067170e · 067170e
1 parent d9029eb
commit 067170e
Show file tree

Hide file tree

Showing 26 changed files with 215 additions and 224 deletions.
diff --git a/deepchecks/checks.py b/deepchecks/checks.py
@@ -44,8 +44,8 @@
     # methodology checks
     'BoostingOverfit',
     'UnusedFeatures',
-    'SingleFeatureContribution',
-    'SingleFeatureContributionTrainTest',
+    'FeatureLabelCorrelation',
+    'FeatureLabelCorrelationChange',
     'IndexTrainTestLeakage',
     'TrainTestSamplesMix',
     'DateTrainTestLeakageDuplicates',

diff --git a/...tils/single_feature_contribution_utils.py → ..._utils/feature_label_correlation_utils.py b/...tils/single_feature_contribution_utils.py → ..._utils/feature_label_correlation_utils.py
@@ -8,7 +8,7 @@
 # along with Deepchecks.  If not, see <http://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------------
 #
-"""Module containing common SingleFeatureContribution (PPS) utils."""
+"""Module containing common feature label correlation (PPS) utils."""
 from typing import Optional
 
 import numpy as np
@@ -67,14 +67,14 @@ def pd_series_to_trace_with_diff(s_pps: pd.Series, name: str, diffs: pd.Series):
                   )
 
 
-def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Optional[Hashable],
-                                    test_df: pd.DataFrame,
-                                    test_label_name: Optional[Hashable], ppscore_params: dict,
-                                    n_show_top: int,
-                                    min_pps_to_show: float = 0.05,
-                                    random_state: int = None):
+def get_feature_label_correlation(train_df: pd.DataFrame, train_label_name: Optional[Hashable],
+                                  test_df: pd.DataFrame,
+                                  test_label_name: Optional[Hashable], ppscore_params: dict,
+                                  n_show_top: int,
+                                  min_pps_to_show: float = 0.05,
+                                  random_state: int = None):
     """
-    Calculate the PPS for train, test and difference for single feature contribution checks.
+    Calculate the PPS for train, test and difference for feature label correlation checks.
 
     The PPS represents the ability of a feature to single-handedly predict another feature or label.
     This function calculates the PPS per feature for both train and test, and returns the data and display graph.
@@ -133,14 +133,14 @@ def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Op
     return ret_value, display
 
 
-def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_label_name: Optional[Hashable],
-                                              test_df: pd.DataFrame,
-                                              test_label_name: Optional[Hashable], ppscore_params: dict,
-                                              n_show_top: int,
-                                              min_pps_to_show: float = 0.05,
-                                              random_state: int = None):
+def get_feature_label_correlation_per_class(train_df: pd.DataFrame, train_label_name: Optional[Hashable],
+                                            test_df: pd.DataFrame,
+                                            test_label_name: Optional[Hashable], ppscore_params: dict,
+                                            n_show_top: int,
+                                            min_pps_to_show: float = 0.05,
+                                            random_state: int = None):
     """
-    Calculate the PPS for train, test and difference for single feature contribution checks per class.
+    Calculate the PPS for train, test and difference for feature label correlation checks per class.
 
     The PPS represents the ability of a feature to single-handedly predict another feature or label.
     This function calculates the PPS per feature for both train and test, and returns the data and display graph.

diff --git a/deepchecks/tabular/checks/__init__.py b/deepchecks/tabular/checks/__init__.py
@@ -9,16 +9,16 @@
 # ----------------------------------------------------------------------------
 #
 """Module importing all tabular checks."""
-from .data_integrity import (ColumnsInfo, ConflictingLabels, DataDuplicates, IsSingleValue, MixedDataTypes, MixedNulls,
-                             OutlierSampleDetection, SingleFeatureContribution, SpecialCharacters,
+from .data_integrity import (ColumnsInfo, ConflictingLabels, DataDuplicates, FeatureLabelCorrelation, IsSingleValue,
+                             MixedDataTypes, MixedNulls, OutlierSampleDetection, SpecialCharacters,
                              StringLengthOutOfBounds, StringMismatch)
 from .model_evaluation import (BoostingOverfit, CalibrationScore, ConfusionMatrixReport, ModelErrorAnalysis,
                                ModelInferenceTime, ModelInfo, MultiModelPerformanceReport, PerformanceReport,
                                RegressionErrorDistribution, RegressionSystematicError, RocReport, SegmentPerformance,
                                SimpleModelComparison, TrainTestPredictionDrift, UnusedFeatures)
 from .train_test_validation import (CategoryMismatchTrainTest, DatasetsSizeComparison, DateTrainTestLeakageDuplicates,
-                                    DateTrainTestLeakageOverlap, DominantFrequencyChange, IdentifierLeakage,
-                                    IndexTrainTestLeakage, NewLabelTrainTest, SingleFeatureContributionTrainTest,
+                                    DateTrainTestLeakageOverlap, DominantFrequencyChange, FeatureLabelCorrelationChange,
+                                    IdentifierLeakage, IndexTrainTestLeakage, NewLabelTrainTest,
                                     StringMismatchComparison, TrainTestFeatureDrift, TrainTestLabelDrift,
                                     TrainTestSamplesMix, WholeDatasetDrift)
 
@@ -41,8 +41,8 @@
     # methodology checks
     'BoostingOverfit',
     'UnusedFeatures',
-    'SingleFeatureContribution',
-    'SingleFeatureContributionTrainTest',
+    'FeatureLabelCorrelation',
+    'FeatureLabelCorrelationChange',
     'IndexTrainTestLeakage',
     'TrainTestSamplesMix',
     'DateTrainTestLeakageDuplicates',

diff --git a/deepchecks/tabular/checks/data_integrity/__init__.py b/deepchecks/tabular/checks/data_integrity/__init__.py
@@ -12,11 +12,11 @@
 from .columns_info import ColumnsInfo
 from .conflicting_labels import ConflictingLabels
 from .data_duplicates import DataDuplicates
+from .feature_label_correlation import FeatureLabelCorrelation
 from .is_single_value import IsSingleValue
 from .mixed_data_types import MixedDataTypes
 from .mixed_nulls import MixedNulls
 from .outlier_sample_detection import OutlierSampleDetection
-from .single_feature_contribution import SingleFeatureContribution
 from .special_chars import SpecialCharacters
 from .string_length_out_of_bounds import StringLengthOutOfBounds
 from .string_mismatch import StringMismatch
@@ -32,5 +32,5 @@
     'DataDuplicates',
     'ConflictingLabels',
     'OutlierSampleDetection',
-    'SingleFeatureContribution',
+    'FeatureLabelCorrelation',
 ]
diff --git a/..._integrity/single_feature_contribution.py → ...ta_integrity/feature_label_correlation.py b/..._integrity/single_feature_contribution.py → ...ta_integrity/feature_label_correlation.py
@@ -8,30 +8,30 @@
 # along with Deepchecks.  If not, see <http://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------------
 #
-"""The single_feature_contribution check module."""
+"""The feature label correlation check module."""
 import typing as t
 
 import deepchecks.ppscore as pps
 from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
-from deepchecks.core.check_utils.single_feature_contribution_utils import get_pps_figure, pd_series_to_trace
+from deepchecks.core.check_utils.feature_label_correlation_utils import get_pps_figure, pd_series_to_trace
 from deepchecks.tabular import Context, SingleDatasetCheck
 from deepchecks.tabular.utils.messages import get_condition_passed_message
 from deepchecks.utils.strings import format_number
 from deepchecks.utils.typing import Hashable
 
-__all__ = ['SingleFeatureContribution']
+__all__ = ['FeatureLabelCorrelation']
 
 
-FC = t.TypeVar('FC', bound='SingleFeatureContribution')
+FLC = t.TypeVar('FLC', bound='FeatureLabelCorrelation')
 
 
 pps_url = 'https://docs.deepchecks.com/en/stable/examples/tabular/' \
-          'checks/methodology/single_feature_contribution_train_test' \
+          'checks/train_test_validation/feature_label_correlation_change' \
           '.html?utm_source=display_output&utm_medium=referral&utm_campaign=check_link'
 pps_html = f'<a href={pps_url} target="_blank">Predictive Power Score</a>'
 
 
-class SingleFeatureContribution(SingleDatasetCheck):
+class FeatureLabelCorrelation(SingleDatasetCheck):
     """Return the PPS (Predictive Power Score) of all features in relation to the label.
 
     The PPS represents the ability of a feature to single-handedly predict another feature or label.
@@ -103,9 +103,9 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResul
         # display only if not all scores are 0
         display = [fig, *text] if s_ppscore.sum() else None
 
-        return CheckResult(value=s_ppscore.to_dict(), display=display, header='Single Feature Contribution')
+        return CheckResult(value=s_ppscore.to_dict(), display=display, header='Feature Label Correlation')
 
-    def add_condition_feature_pps_not_greater_than(self: FC, threshold: float = 0.8) -> FC:
+    def add_condition_feature_pps_not_greater_than(self: FLC, threshold: float = 0.8) -> FLC:
         """
         Add condition that will check that pps of the specified feature(s) is not greater than X.
 
@@ -115,7 +115,7 @@ def add_condition_feature_pps_not_greater_than(self: FC, threshold: float = 0.8)
             pps upper bound
         Returns
         -------
-        FC
+        FLC
         """
         def condition(value: t.Dict[Hashable, float]) -> ConditionResult:
             failed_features = {

diff --git a/deepchecks/tabular/checks/methodology/__init__.py b/deepchecks/tabular/checks/methodology/__init__.py
@@ -19,17 +19,17 @@
 """
 import warnings
 
-from ..data_integrity import SingleFeatureContribution
+from ..data_integrity import FeatureLabelCorrelation
 from ..model_evaluation import BoostingOverfit, ModelInferenceTime, UnusedFeatures
 from ..train_test_validation import (DatasetsSizeComparison, DateTrainTestLeakageDuplicates,
-                                     DateTrainTestLeakageOverlap, IdentifierLeakage, IndexTrainTestLeakage,
-                                     SingleFeatureContributionTrainTest, TrainTestSamplesMix)
+                                     DateTrainTestLeakageOverlap, FeatureLabelCorrelationChange, IdentifierLeakage,
+                                     IndexTrainTestLeakage, TrainTestSamplesMix)
 
 __all__ = [
     'BoostingOverfit',
     'UnusedFeatures',
-    'SingleFeatureContribution',
-    'SingleFeatureContributionTrainTest',
+    'FeatureLabelCorrelation',
+    'FeatureLabelCorrelationChange',
     'IndexTrainTestLeakage',
     'TrainTestSamplesMix',
     'DateTrainTestLeakageDuplicates',

diff --git a/deepchecks/tabular/checks/train_test_validation/__init__.py b/deepchecks/tabular/checks/train_test_validation/__init__.py
@@ -15,10 +15,10 @@
 from .date_train_test_leakage_duplicates import DateTrainTestLeakageDuplicates
 from .date_train_test_leakage_overlap import DateTrainTestLeakageOverlap
 from .dominant_frequency_change import DominantFrequencyChange
+from .feature_label_correlation_change import FeatureLabelCorrelationChange
 from .identifier_leakage import IdentifierLeakage
 from .index_leakage import IndexTrainTestLeakage
 from .new_label_train_test import NewLabelTrainTest
-from .single_feature_contribution_train_test import SingleFeatureContributionTrainTest
 from .string_mismatch_comparison import StringMismatchComparison
 from .train_test_feature_drift import TrainTestFeatureDrift
 from .train_test_label_drift import TrainTestLabelDrift
@@ -34,7 +34,7 @@
     'IdentifierLeakage',
     'IndexTrainTestLeakage',
     'NewLabelTrainTest',
-    'SingleFeatureContributionTrainTest',
+    'FeatureLabelCorrelationChange',
     'StringMismatchComparison',
     'TrainTestFeatureDrift',
     'TrainTestLabelDrift',

diff --git a/...single_feature_contribution_train_test.py → ...ation/feature_label_correlation_change.py b/...single_feature_contribution_train_test.py → ...ation/feature_label_correlation_change.py
@@ -8,31 +8,31 @@
 # along with Deepchecks.  If not, see <http://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------------
 #
-"""The single_feature_contribution check module."""
+"""The feature label correlation change check module."""
 import typing as t
 from copy import copy
 
 import numpy as np
 
 from deepchecks.core import CheckResult, ConditionResult
-from deepchecks.core.check_utils.single_feature_contribution_utils import get_single_feature_contribution
+from deepchecks.core.check_utils.feature_label_correlation_utils import get_feature_label_correlation
 from deepchecks.core.condition import ConditionCategory
 from deepchecks.tabular import Context, TrainTestCheck
 from deepchecks.tabular.utils.messages import get_condition_passed_message
 from deepchecks.utils.strings import format_number
 from deepchecks.utils.typing import Hashable
 
-__all__ = ['SingleFeatureContributionTrainTest']
+__all__ = ['FeatureLabelCorrelationChange']
 
-FC = t.TypeVar('FC', bound='SingleFeatureContributionTrainTest')
+FLC = t.TypeVar('FLC', bound='FeatureLabelCorrelationChange')
 
 pps_url = 'https://docs.deepchecks.com/en/stable/examples/tabular/' \
-          'checks/methodology/single_feature_contribution_train_test' \
+          'checks/train_test_validation/feature_label_correlation_change' \
           '.html?utm_source=display_output&utm_medium=referral&utm_campaign=check_link'
 pps_html = f'<a href={pps_url} target="_blank">Predictive Power Score</a>'
 
 
-class SingleFeatureContributionTrainTest(TrainTestCheck):
+class FeatureLabelCorrelationChange(TrainTestCheck):
     """
     Return the Predictive Power Score of all features, in order to estimate each feature's ability to predict the label.
 
@@ -109,21 +109,21 @@ def run_logic(self, context: Context) -> CheckResult:
             'the target label.'
         ]
 
-        ret_value, display = get_single_feature_contribution(train_dataset.data[relevant_columns],
-                                                             train_dataset.label_name,
-                                                             test_dataset.data[relevant_columns],
-                                                             test_dataset.label_name, self.ppscore_params,
-                                                             self.n_top_features,
-                                                             min_pps_to_show=self.min_pps_to_show,
-                                                             random_state=self.random_state)
+        ret_value, display = get_feature_label_correlation(train_dataset.data[relevant_columns],
+                                                           train_dataset.label_name,
+                                                           test_dataset.data[relevant_columns],
+                                                           test_dataset.label_name, self.ppscore_params,
+                                                           self.n_top_features,
+                                                           min_pps_to_show=self.min_pps_to_show,
+                                                           random_state=self.random_state)
 
         if display:
             display += text
 
-        return CheckResult(value=ret_value, display=display, header='Single Feature Contribution Train-Test')
+        return CheckResult(value=ret_value, display=display, header='Feature Label Correlation Change')
 
-    def add_condition_feature_pps_difference_not_greater_than(self: FC, threshold: float = 0.2,
-                                                              include_negative_diff: bool = True) -> FC:
+    def add_condition_feature_pps_difference_not_greater_than(self: FLC, threshold: float = 0.2,
+                                                              include_negative_diff: bool = True) -> FLC:
         """Add new condition.
 
         Add condition that will check that difference between train
@@ -163,7 +163,7 @@ def condition(value: t.Dict[Hashable, t.Dict[Hashable, float]]) -> ConditionResu
         return self.add_condition(f'Train-Test features\' Predictive Power Score difference is not greater than '
                                   f'{format_number(threshold)}', condition)
 
-    def add_condition_feature_pps_in_train_not_greater_than(self: FC, threshold: float = 0.7) -> FC:
+    def add_condition_feature_pps_in_train_not_greater_than(self: FLC, threshold: float = 0.7) -> FLC:
         """Add new condition.
 
         Add condition that will check that train dataset feature pps is not greater than X.
@@ -175,7 +175,7 @@ def add_condition_feature_pps_in_train_not_greater_than(self: FC, threshold: flo
 
         Returns
         -------
-        FC
+        FLC
         """
 
         def condition(value: t.Dict[Hashable, t.Dict[Hashable, float]]) -> ConditionResult:

diff --git a/deepchecks/tabular/suites/default_suites.py b/deepchecks/tabular/suites/default_suites.py
@@ -19,15 +19,14 @@
 from deepchecks.tabular.checks import (BoostingOverfit, CalibrationScore, CategoryMismatchTrainTest, ConflictingLabels,
                                        ConfusionMatrixReport, DataDuplicates, DatasetsSizeComparison,
                                        DateTrainTestLeakageDuplicates, DateTrainTestLeakageOverlap,
-                                       DominantFrequencyChange, IdentifierLeakage, IndexTrainTestLeakage, IsSingleValue,
-                                       MixedDataTypes, MixedNulls, ModelErrorAnalysis, ModelInferenceTime,
-                                       NewLabelTrainTest, OutlierSampleDetection, PerformanceReport,
-                                       RegressionErrorDistribution, RegressionSystematicError, RocReport,
-                                       SegmentPerformance, SimpleModelComparison, SingleFeatureContribution,
-                                       SingleFeatureContributionTrainTest, SpecialCharacters, StringLengthOutOfBounds,
-                                       StringMismatch, StringMismatchComparison, TrainTestFeatureDrift,
-                                       TrainTestLabelDrift, TrainTestPredictionDrift, TrainTestSamplesMix,
-                                       UnusedFeatures, WholeDatasetDrift)
+                                       DominantFrequencyChange, FeatureLabelCorrelation, FeatureLabelCorrelationChange,
+                                       IdentifierLeakage, IndexTrainTestLeakage, IsSingleValue, MixedDataTypes,
+                                       MixedNulls, ModelErrorAnalysis, ModelInferenceTime, NewLabelTrainTest,
+                                       OutlierSampleDetection, PerformanceReport, RegressionErrorDistribution,
+                                       RegressionSystematicError, RocReport, SegmentPerformance, SimpleModelComparison,
+                                       SpecialCharacters, StringLengthOutOfBounds, StringMismatch,
+                                       StringMismatchComparison, TrainTestFeatureDrift, TrainTestLabelDrift,
+                                       TrainTestPredictionDrift, TrainTestSamplesMix, UnusedFeatures, WholeDatasetDrift)
 
 __all__ = ['single_dataset_integrity', 'train_test_leakage', 'train_test_validation',
            'model_evaluation', 'full_suite']
@@ -61,7 +60,7 @@ def data_integrity() -> Suite:
         StringLengthOutOfBounds().add_condition_ratio_of_outliers_not_greater_than(),
         ConflictingLabels().add_condition_ratio_of_conflicting_labels_not_greater_than(),
         OutlierSampleDetection(),
-        SingleFeatureContribution().add_condition_feature_pps_not_greater_than()
+        FeatureLabelCorrelation().add_condition_feature_pps_not_greater_than()
     )
 
 
@@ -95,7 +94,7 @@ def train_test_validation() -> Suite:
         IndexTrainTestLeakage().add_condition_ratio_not_greater_than(),
         IdentifierLeakage().add_condition_pps_not_greater_than(),
         TrainTestSamplesMix().add_condition_duplicates_ratio_not_greater_than(),
-        SingleFeatureContributionTrainTest().add_condition_feature_pps_difference_not_greater_than()
+        FeatureLabelCorrelationChange().add_condition_feature_pps_difference_not_greater_than()
         .add_condition_feature_pps_in_train_not_greater_than(),
         TrainTestFeatureDrift().add_condition_drift_score_not_greater_than(),
         TrainTestLabelDrift().add_condition_drift_score_not_greater_than(),