Weak segment - return single segment per feature argument (#2645)
Nadav-Barak committed Jul 26, 2023
1 parent deb16bc commit daad127
Showing 16 changed files with 115 additions and 49 deletions.
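In short, each weak-segment check gains a flag that controls whether a single feature may anchor several of the reported segments. A minimal usage sketch for the tabular check; train_ds and clf are placeholders for an existing deepchecks Dataset and fitted model, not part of this commit:

from deepchecks.tabular.checks import WeakSegmentsPerformance

# False enables the behavior added here: at most one reported segment
# per feature. The tabular default remains True.
check = WeakSegmentsPerformance(multiple_segments_per_feature=False)
result = check.run(train_ds, clf)  # placeholders: Dataset and fitted model
result.show()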
30 changes: 21 additions & 9 deletions deepchecks/nlp/checks/data_integrity/under_annotated_segments.py
@@ -9,7 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Module of the under annotated segments check."""
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
@@ -41,8 +41,8 @@ class UnderAnnotatedSegments(SingleDatasetCheck, WeakSegmentAbstract):

def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], None],
ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int,
-segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
-categorical_aggregation_threshold: float, **kwargs):
+segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
+categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs):
super().__init__(**kwargs)
self.segment_by = segment_by
self.columns = columns
@@ -53,6 +53,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
self.n_to_show = n_to_show
self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD
+self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -81,11 +82,12 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
avg_score = round(score_per_sample.mean(), 3)
weak_segments = self._weak_segments_search(data=encoded_dataset.features_columns,
score_per_sample=score_per_sample,
-scorer_name='Annotation Ratio')
+scorer_name='Annotation Ratio',
+multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
display_msg = 'Check was unable to find under annotated segments. Try ' \
-f'supplying more {self.segment_by}.'
+f'supplying more {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score)
@@ -239,7 +241,7 @@ class UnderAnnotatedPropertySegments(UnderAnnotatedSegments):
Properties to check, if none are given checks all properties except ignored ones.
ignore_properties : Union[Hashable, List[Hashable]] , default: None
Properties to ignore, if none given checks based on properties variable
-n_top_properties : int , default: 10
+n_top_properties : Optional[int] , default: 10
Number of properties to use for segment search. Top properties are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -250,16 +252,20 @@
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
+multiple_segments_per_property : bool , default: False
+If True, will allow the same property to be a segmenting feature in multiple segments,
+otherwise each property can appear in one segment at most.
"""

def __init__(self,
properties: Union[Hashable, List[Hashable], None] = None,
ignore_properties: Union[Hashable, List[Hashable], None] = None,
-n_top_properties: int = 15,
+n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
n_samples: int = 10_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
+multiple_segments_per_property: bool = False,
**kwargs):
super().__init__(segment_by='properties',
columns=properties,
@@ -269,6 +275,7 @@ def __init__(self,
n_samples=n_samples,
n_to_show=n_to_show,
categorical_aggregation_threshold=categorical_aggregation_threshold,
+multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)


@@ -290,7 +297,7 @@ class UnderAnnotatedMetaDataSegments(UnderAnnotatedSegments):
Columns to check, if none are given checks all columns except ignored ones.
ignore_columns : Union[Hashable, List[Hashable]] , default: None
Columns to ignore, if none given checks based on columns variable
-n_top_columns : int , default: 10
+n_top_columns : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -301,16 +308,20 @@
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
+multiple_segments_per_column : bool , default: True
+If True, will allow the same metadata column to be a segmenting column in multiple segments,
+otherwise each metadata column can appear in one segment at most.
"""

def __init__(self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
-n_top_columns: int = 10,
+n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
n_samples: int = 10_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
+multiple_segments_per_column: bool = True,
**kwargs):
super().__init__(segment_by='metadata',
columns=columns,
@@ -320,4 +331,5 @@ def __init__(self,
n_samples=n_samples,
n_to_show=n_to_show,
categorical_aggregation_threshold=categorical_aggregation_threshold,
+multiple_segments_per_feature=multiple_segments_per_column,
**kwargs)
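A similar sketch for the NLP under-annotation checks above; text_data is a placeholder for a deepchecks.nlp.TextData that already has properties and metadata attached:

from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments

# Defaults differ per the diff: properties report one segment each (False),
# while metadata columns may still anchor several segments (True).
prop_check = UnderAnnotatedPropertySegments(multiple_segments_per_property=True)
meta_check = UnderAnnotatedMetaDataSegments(multiple_segments_per_column=False)
result = prop_check.run(text_data)  # placeholder TextData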
@@ -37,7 +37,8 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int],
segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Union[str, Callable]],
score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int,
-categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
+categorical_aggregation_threshold: float, n_to_show: int,
+multiple_segments_per_feature: bool = False, **kwargs):
super().__init__(**kwargs)
self.segment_by = segment_by
self.columns = columns
@@ -49,6 +50,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
self.score_per_sample = score_per_sample
self.alternative_scorer = alternative_scorer if alternative_scorer else None
self.categorical_aggregation_threshold = categorical_aggregation_threshold
+self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -107,7 +109,8 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
weak_segments = self._weak_segments_search(data=encoded_dataset.data, score_per_sample=score_per_sample,
label_col=pd.Series(original_label, index=score_per_sample.index),
feature_rank_for_search=np.asarray(encoded_dataset.features),
-dummy_model=dummy_model, scorer=scorer)
+dummy_model=dummy_model, scorer=scorer,
+multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.'\
@@ -169,18 +172,22 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText):
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
+multiple_segments_per_property : bool , default: False
+If True, will allow the same property to be a segmenting feature in multiple segments,
+otherwise each property can appear in one segment at most.
"""

def __init__(self,
properties: Union[Hashable, List[Hashable], None] = None,
ignore_properties: Union[Hashable, List[Hashable], None] = None,
-n_top_properties: Optional[int] = 15,
+n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
score_per_sample: Union[np.ndarray, pd.Series, None] = None,
n_samples: int = 5_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
+multiple_segments_per_property: bool = False,
**kwargs):
super().__init__(segment_by='properties',
columns=properties,
@@ -192,6 +199,7 @@ def __init__(self,
score_per_sample=score_per_sample,
alternative_scorer=alternative_scorer,
categorical_aggregation_threshold=categorical_aggregation_threshold,
+multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)


@@ -235,18 +243,22 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText):
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
+multiple_segments_column : bool , default: True
+If True, will allow the same metadata column to be a segmenting column in multiple segments,
+otherwise each metadata column can appear in one segment at most.
"""

def __init__(self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
-n_top_columns: Optional[int] = 15,
+n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
score_per_sample: Union[np.ndarray, pd.Series, None] = None,
n_samples: int = 5_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
+multiple_segments_column: bool = True,
**kwargs):
super().__init__(segment_by='metadata',
columns=columns,
@@ -258,4 +270,5 @@ def __init__(self,
score_per_sample=score_per_sample,
alternative_scorer=alternative_scorer,
categorical_aggregation_threshold=categorical_aggregation_threshold,
+multiple_segments_per_feature=multiple_segments_column,
**kwargs)
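The same knob on the NLP performance checks; note the slightly different argument name (multiple_segments_column) that this diff gives MetadataSegmentsPerformance. text_data and preds are placeholders for a TextData and precomputed predictions:

from deepchecks.nlp.checks import MetadataSegmentsPerformance, PropertySegmentsPerformance

prop_check = PropertySegmentsPerformance(multiple_segments_per_property=True)
meta_check = MetadataSegmentsPerformance(multiple_segments_column=False)
result = meta_check.run(text_data, predictions=preds)  # placeholders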
@@ -10,7 +10,7 @@
#
"""Module of weak segments performance check."""
import warnings
-from typing import TYPE_CHECKING, Callable, Dict, List, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

import numpy as np
import pandas as pd
@@ -49,7 +49,7 @@ class WeakSegmentsPerformance(SingleDatasetCheck, WeakSegmentAbstract):
Columns to check, if none are given checks all columns except ignored ones.
ignore_columns : Union[Hashable, List[Hashable]] , default: None
Columns to ignore, if none given checks based on columns variable
-n_top_features : int , default: 5
+n_top_features : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -73,13 +73,16 @@ class WeakSegmentsPerformance(SingleDatasetCheck, WeakSegmentAbstract):
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
random_state : int, default: 42
random seed for all check internals.
+multiple_segments_per_feature : bool , default: True
+If True, will allow the same feature to be a segmenting feature in multiple segments,
+otherwise each feature can appear in one segment at most.
"""

def __init__(
self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
-n_top_features: int = 5,
+n_top_features: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
loss_per_sample: Union[np.ndarray, pd.Series, None] = None,
@@ -88,6 +91,7 @@ def __init__(
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
random_state: int = 42,
+multiple_segments_per_feature: bool = True,
**kwargs
):
super().__init__(**kwargs)
@@ -108,6 +112,7 @@ def __init__(
self.loss_per_sample = loss_per_sample
self.alternative_scorer = alternative_scorer
self.categorical_aggregation_threshold = categorical_aggregation_threshold
+self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -160,7 +165,8 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
weak_segments = self._weak_segments_search(data=encoded_dataset.data, score_per_sample=score_per_sample,
label_col=dataset_subset.label_col,
feature_rank_for_search=feature_rank,
-dummy_model=dummy_model, scorer=scorer)
+dummy_model=dummy_model, scorer=scorer,
+multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak '
25 changes: 19 additions & 6 deletions deepchecks/utils/abstracts/weak_segment_abstract.py
@@ -170,8 +170,8 @@ def _create_heatmap_display(self, data: pd.DataFrame,
def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,
label_col: Optional[pd.Series] = None,
feature_rank_for_search: Optional[np.ndarray] = None,
-dummy_model: Optional[_DummyModel] = None,
-scorer: Optional[DeepcheckScorer] = None, scorer_name: Optional[str] = None) \
+dummy_model: Optional[_DummyModel] = None, scorer: Optional[DeepcheckScorer] = None,
+scorer_name: Optional[str] = None, multiple_segments_per_feature: bool = False) \
-> pd.DataFrame:
"""Search for weak segments based on scorer."""
# Remove samples with NaN score per sample
@@ -213,11 +213,24 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,
tuple(filters[feature2]), data_size,
list(data_of_segment.index)]

-        # Drop duplicates without considering column 'Samples in Segment'
-        result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
-        result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment']
-
-        return result_no_duplicates.sort_values(score_title).reset_index(drop=True)
+        # Sort by score, then keep segments; unless multiple_segments_per_feature
+        # is set, each feature may appear in at most one of the returned segments
+        weak_segments = weak_segments.sort_values(score_title).reset_index(drop=True)
+        if multiple_segments_per_feature:
+            result = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
+            result['Samples in Segment'] = weak_segments.loc[result.index, 'Samples in Segment']
+        else:
+            used_features = set()
+            result = pd.DataFrame(columns=weak_segments.columns)
+            for _, row in weak_segments.iterrows():
+                if row['Feature1'] in used_features or row['Feature2'] in used_features:
+                    continue
+                result.loc[len(result)] = row
+                used_features.add(row['Feature1'])
+                if row['Feature2'] != '':
+                    used_features.add(row['Feature2'])
+
+        return result

def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series,
label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None,
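For reference, the greedy selection added above can be exercised in isolation. A self-contained toy run with invented values; the column names mirror the frame built by _weak_segments_search:

import pandas as pd

# Segments are sorted weakest-first; the first segment to use a feature
# consumes it, and a two-feature segment consumes both of its features.
weak_segments = pd.DataFrame({
    'Annotation Ratio': [0.41, 0.55, 0.58],
    'Feature1': ['toxicity', 'toxicity', 'text_length'],
    'Feature2': ['text_length', '', ''],
})

used_features, kept = set(), []
for _, row in weak_segments.iterrows():
    if row['Feature1'] in used_features or row['Feature2'] in used_features:
        continue
    kept.append(row)
    used_features.add(row['Feature1'])
    if row['Feature2'] != '':
        used_features.add(row['Feature2'])

print(pd.DataFrame(kept))  # only the weakest segment survives; later ones reuse its features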
