Weak segment - return single segment per feature argument (#2645)
Nadav-Barak committed Jul 26, 2023
1 parent deb16bc commit daad127
Showing 16 changed files with 115 additions and 49 deletions.
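In short, each weak-segment check gains a flag that controls whether a single feature may anchor several of the reported segments. A minimal usage sketch for the tabular check; train_ds and clf are placeholders for an existing deepchecks Dataset and fitted model, not part of this commit:

from deepchecks.tabular.checks import WeakSegmentsPerformance

# False enables the behavior added here: at most one reported segment
# per feature. The tabular default remains True.
check = WeakSegmentsPerformance(multiple_segments_per_feature=False)
result = check.run(train_ds, clf)  # placeholders: Dataset and fitted model
result.show()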
30 changes: 21 additions & 9 deletions deepchecks/nlp/checks/data_integrity/under_annotated_segments.py
@@ -9,7 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Module of the under annotated segments check."""
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
@@ -41,8 +41,8 @@ class UnderAnnotatedSegments(SingleDatasetCheck, WeakSegmentAbstract):

def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], None],
ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int,
-segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
-categorical_aggregation_threshold: float, **kwargs):
+segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
+categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs):
super().__init__(**kwargs)
self.segment_by = segment_by
self.columns = columns
@@ -53,6 +53,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
self.n_to_show = n_to_show
self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD
+self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -81,11 +82,12 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
avg_score = round(score_per_sample.mean(), 3)
weak_segments = self._weak_segments_search(data=encoded_dataset.features_columns,
score_per_sample=score_per_sample,
-scorer_name='Annotation Ratio')
+scorer_name='Annotation Ratio',
+multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
display_msg = 'Check was unable to find under annotated segments. Try ' \
-f'supplying more {self.segment_by}.'
+f'supplying more {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score)
@@ -239,7 +241,7 @@ class UnderAnnotatedPropertySegments(UnderAnnotatedSegments):
Properties to check, if none are given checks all properties except ignored ones.
ignore_properties : Union[Hashable, List[Hashable]] , default: None
Properties to ignore, if none given checks based on properties variable
-n_top_properties : int , default: 10
+n_top_properties : Optional[int] , default: 10
Number of properties to use for segment search. Top properties are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -250,16 +252,20 @@
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
+multiple_segments_per_property : bool , default: False
+If True, will allow the same property to be a segmenting feature in multiple segments,
+otherwise each property can appear in one segment at most.
"""

def __init__(self,
properties: Union[Hashable, List[Hashable], None] = None,
ignore_properties: Union[Hashable, List[Hashable], None] = None,
-n_top_properties: int = 15,
+n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
n_samples: int = 10_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
+multiple_segments_per_property: bool = False,
**kwargs):
super().__init__(segment_by='properties',
columns=properties,
@@ -269,6 +275,7 @@ def __init__(self,
n_samples=n_samples,
n_to_show=n_to_show,
categorical_aggregation_threshold=categorical_aggregation_threshold,
+multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)


@@ -290,7 +297,7 @@ class UnderAnnotatedMetaDataSegments(UnderAnnotatedSegments):
Columns to check, if none are given checks all columns except ignored ones.
ignore_columns : Union[Hashable, List[Hashable]] , default: None
Columns to ignore, if none given checks based on columns variable
-n_top_columns : int , default: 10
+n_top_columns : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -301,16 +308,20 @@
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
+multiple_segments_per_column : bool , default: True
+If True, will allow the same metadata column to be a segmenting column in multiple segments,
+otherwise each metadata column can appear in one segment at most.
"""

def __init__(self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
-n_top_columns: int = 10,
+n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
n_samples: int = 10_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
+multiple_segments_per_column: bool = True,
**kwargs):
super().__init__(segment_by='metadata',
columns=columns,
@@ -320,4 +331,5 @@ def __init__(self,
n_samples=n_samples,
n_to_show=n_to_show,
categorical_aggregation_threshold=categorical_aggregation_threshold,
+multiple_segments_per_feature=multiple_segments_per_column,
**kwargs)
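A similar sketch for the NLP under-annotation checks above; text_data is a placeholder for a deepchecks.nlp.TextData that already has properties and metadata attached:

from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments

# Defaults differ per the diff: properties report one segment each (False),
# while metadata columns may still anchor several segments (True).
prop_check = UnderAnnotatedPropertySegments(multiple_segments_per_property=True)
meta_check = UnderAnnotatedMetaDataSegments(multiple_segments_per_column=False)
result = prop_check.run(text_data)  # placeholder TextData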
@@ -37,7 +37,8 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int],
segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Union[str, Callable]],
score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int,
-categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
+categorical_aggregation_threshold: float, n_to_show: int,
+multiple_segments_per_feature: bool = False, **kwargs):
super().__init__(**kwargs)
self.segment_by = segment_by
self.columns = columns
@@ -49,6 +50,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
self.score_per_sample = score_per_sample
self.alternative_scorer = alternative_scorer if alternative_scorer else None
self.categorical_aggregation_threshold = categorical_aggregation_threshold
+self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -107,7 +109,8 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
weak_segments = self._weak_segments_search(data=encoded_dataset.data, score_per_sample=score_per_sample,
label_col=pd.Series(original_label, index=score_per_sample.index),
feature_rank_for_search=np.asarray(encoded_dataset.features),
-dummy_model=dummy_model, scorer=scorer)
+dummy_model=dummy_model, scorer=scorer,
+multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.'\
@@ -169,18 +172,22 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText):
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
+multiple_segments_per_property : bool , default: False
+If True, will allow the same property to be a segmenting feature in multiple segments,
+otherwise each property can appear in one segment at most.
"""

def __init__(self,
properties: Union[Hashable, List[Hashable], None] = None,
ignore_properties: Union[Hashable, List[Hashable], None] = None,
-n_top_properties: Optional[int] = 15,
+n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
score_per_sample: Union[np.ndarray, pd.Series, None] = None,
n_samples: int = 5_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
+multiple_segments_per_property: bool = False,
**kwargs):
super().__init__(segment_by='properties',
columns=properties,
@@ -192,6 +199,7 @@ def __init__(self,
score_per_sample=score_per_sample,
alternative_scorer=alternative_scorer,
categorical_aggregation_threshold=categorical_aggregation_threshold,
+multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)


@@ -235,18 +243,22 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText):
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
+multiple_segments_column : bool , default: True
+If True, will allow the same metadata column to be a segmenting column in multiple segments,
+otherwise each metadata column can appear in one segment at most.
"""

def __init__(self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
-n_top_columns: Optional[int] = 15,
+n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
score_per_sample: Union[np.ndarray, pd.Series, None] = None,
n_samples: int = 5_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
+multiple_segments_column: bool = True,
**kwargs):
super().__init__(segment_by='metadata',
columns=columns,
@@ -258,4 +270,5 @@ def __init__(self,
score_per_sample=score_per_sample,
alternative_scorer=alternative_scorer,
categorical_aggregation_threshold=categorical_aggregation_threshold,
+multiple_segments_per_feature=multiple_segments_column,
**kwargs)
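The same knob on the NLP performance checks; note the slightly different argument name (multiple_segments_column) that this diff gives MetadataSegmentsPerformance. text_data and preds are placeholders for a TextData and precomputed predictions:

from deepchecks.nlp.checks import MetadataSegmentsPerformance, PropertySegmentsPerformance

prop_check = PropertySegmentsPerformance(multiple_segments_per_property=True)
meta_check = MetadataSegmentsPerformance(multiple_segments_column=False)
result = meta_check.run(text_data, predictions=preds)  # placeholders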
@@ -10,7 +10,7 @@
#
"""Module of weak segments performance check."""
import warnings
-from typing import TYPE_CHECKING, Callable, Dict, List, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

import numpy as np
import pandas as pd
@@ -49,7 +49,7 @@ class WeakSegmentsPerformance(SingleDatasetCheck, WeakSegmentAbstract):
Columns to check, if none are given checks all columns except ignored ones.
ignore_columns : Union[Hashable, List[Hashable]] , default: None
Columns to ignore, if none given checks based on columns variable
-n_top_features : int , default: 5
+n_top_features : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -73,13 +73,16 @@ class WeakSegmentsPerformance(SingleDatasetCheck, WeakSegmentAbstract):
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
random_state : int, default: 42
random seed for all check internals.
+multiple_segments_per_feature : bool , default: True
+If True, will allow the same feature to be a segmenting feature in multiple segments,
+otherwise each feature can appear in one segment at most.
"""

def __init__(
self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
-n_top_features: int = 5,
+n_top_features: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
loss_per_sample: Union[np.ndarray, pd.Series, None] = None,
@@ -88,6 +91,7 @@ def __init__(
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
random_state: int = 42,
+multiple_segments_per_feature: bool = True,
**kwargs
):
super().__init__(**kwargs)
@@ -108,6 +112,7 @@ def __init__(
self.loss_per_sample = loss_per_sample
self.alternative_scorer = alternative_scorer
self.categorical_aggregation_threshold = categorical_aggregation_threshold
+self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -160,7 +165,8 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
weak_segments = self._weak_segments_search(data=encoded_dataset.data, score_per_sample=score_per_sample,
label_col=dataset_subset.label_col,
feature_rank_for_search=feature_rank,
-dummy_model=dummy_model, scorer=scorer)
+dummy_model=dummy_model, scorer=scorer,
+multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak '
25 changes: 19 additions & 6 deletions deepchecks/utils/abstracts/weak_segment_abstract.py
@@ -170,8 +170,8 @@ def _create_heatmap_display(self, data: pd.DataFrame,
def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,
label_col: Optional[pd.Series] = None,
feature_rank_for_search: Optional[np.ndarray] = None,
-dummy_model: Optional[_DummyModel] = None,
-scorer: Optional[DeepcheckScorer] = None, scorer_name: Optional[str] = None) \
+dummy_model: Optional[_DummyModel] = None, scorer: Optional[DeepcheckScorer] = None,
+scorer_name: Optional[str] = None, multiple_segments_per_feature: bool = False) \
-> pd.DataFrame:
"""Search for weak segments based on scorer."""
# Remove samples with NaN score per sample
@@ -213,11 +213,24 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,
tuple(filters[feature2]), data_size,
list(data_of_segment.index)]

-        # Drop duplicates without considering column 'Samples in Segment'
-        result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
-        result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment']
-
-        return result_no_duplicates.sort_values(score_title).reset_index(drop=True)
+        # Sort by score, then keep segments; unless multiple_segments_per_feature
+        # is set, each feature may appear in at most one of the returned segments
+        weak_segments = weak_segments.sort_values(score_title).reset_index(drop=True)
+        if multiple_segments_per_feature:
+            result = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
+            result['Samples in Segment'] = weak_segments.loc[result.index, 'Samples in Segment']
+        else:
+            used_features = set()
+            result = pd.DataFrame(columns=weak_segments.columns)
+            for _, row in weak_segments.iterrows():
+                if row['Feature1'] in used_features or row['Feature2'] in used_features:
+                    continue
+                result.loc[len(result)] = row
+                used_features.add(row['Feature1'])
+                if row['Feature2'] != '':
+                    used_features.add(row['Feature2'])
+
+        return result

def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series,
label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None,
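For reference, the greedy selection added above can be exercised in isolation. A self-contained toy run with invented values; the column names mirror the frame built by _weak_segments_search:

import pandas as pd

# Segments are sorted weakest-first; the first segment to use a feature
# consumes it, and a two-feature segment consumes both of its features.
weak_segments = pd.DataFrame({
    'Annotation Ratio': [0.41, 0.55, 0.58],
    'Feature1': ['toxicity', 'toxicity', 'text_length'],
    'Feature2': ['text_length', '', ''],
})

used_features, kept = set(), []
for _, row in weak_segments.iterrows():
    if row['Feature1'] in used_features or row['Feature2'] in used_features:
        continue
    kept.append(row)
    used_features.add(row['Feature1'])
    if row['Feature2'] != '':
        used_features.add(row['Feature2'])

print(pd.DataFrame(kept))  # only the weakest segment survives; later ones reuse its features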
