Merge branch 'main' into 0.17.x

noamzbr committed Jul 27, 2023
2 parents 3adfe0a + fcd084e commit 5b0f2e2
Showing 52 changed files with 1,672 additions and 466 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -128,7 +128,7 @@ jobs:
with:
requirements: 'requirements-all.txt'
fail: 'Copyleft,Other,Error'
exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.5\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.7\.22|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
# pyzmq is Revised BSD https://github.com/zeromq/pyzmq/blob/main/examples/LICENSE
# debugpy is MIT https://github.com/microsoft/debugpy/blob/main/LICENSE
# certifi is MPL-2.0 https://github.com/certifi/python-certifi/blob/master/LICENSE
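The only functional change in this hunk is bumping the pinned certifi version in the license-check exclude list from 2023.5.7 to 2023.7.22. A minimal sketch of how such a regex exclude list behaves, assuming the action matches each pattern against a '<package> <version>'-style string (the exact string format the action uses is not shown here):

import re

# Abbreviated form of the exclude pattern from the workflow above.
EXCLUDE = re.compile(r'(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.7\.22|tqdm.*4\.65\.0)')

for candidate in ['certifi 2023.7.22', 'certifi 2023.5.7', 'tqdm 4.65.0']:
    print(candidate, '->', bool(EXCLUDE.search(candidate)))
# certifi 2023.7.22 -> True   (the new pin is excluded from license failures)
# certifi 2023.5.7 -> False   (the old pin no longer matches)
# tqdm 4.65.0 -> True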
1 change: 1 addition & 0 deletions deepchecks/analytics/anonymous_telemetry.py
@@ -47,5 +47,6 @@ def validate_latest_version():
' Deepchecks is frequently updated with major improvements. You should consider '
'upgrading via the "python -m pip install --upgrade deepchecks" command.',
deepchecks.__version__)
os.environ['DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY'] = 'True' # to ignore joblib
except Exception: # pylint: disable=broad-except
pass
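The new line above sets the opt-out environment variable once the upgrade notice has been logged, so child processes (such as joblib workers, per the inline comment) inherit it and skip repeat version lookups. A minimal sketch of the kind of guard such a flag feeds; the helper below is hypothetical, not the library's actual gating code:

import os

def _telemetry_enabled() -> bool:
    # Hypothetical guard; deepchecks' real gating logic lives elsewhere.
    return os.environ.get('DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY', '').lower() != 'true'

os.environ['DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY'] = 'True'
assert not _telemetry_enabled()  # later telemetry calls are now skipped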
2 changes: 2 additions & 0 deletions deepchecks/nlp/checks/data_integrity/conflicting_labels.py
@@ -88,6 +88,8 @@ def _get_labels(self, dataset):
labels = [tuple(np.where(row == 1)[0]) for row in dataset.label]
elif dataset.task_type is TaskType.TEXT_CLASSIFICATION:
labels = dataset.label
elif dataset.task_type is TaskType.OTHER:
raise DeepchecksValueError('Check is irrelevant when task type is not specified')
else:
raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')
return labels
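For context, the multilabel branch above converts each one-hot label row into a hashable tuple of active class indices, which is what lets the check compare labels across duplicate samples; the new TaskType.OTHER branch fails fast with a clear message instead of falling through to the generic 'unknown task type' error. A toy illustration of the tuple extraction:

import numpy as np

# Toy multilabel matrix: each row is one sample's one-hot label vector.
label_matrix = np.array([[1, 0, 1],
                         [0, 1, 0]])

# Same idiom as _get_labels above.
labels = [tuple(np.where(row == 1)[0]) for row in label_matrix]
print(labels)  # e.g. [(0, 2), (1,)]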
3 changes: 3 additions & 0 deletions deepchecks/nlp/checks/data_integrity/special_characters.py
@@ -102,6 +102,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
continue
if len(sample) > self.max_chars_to_review_per_sample:
sample = random.sample(sample, self.max_chars_to_review_per_sample)
if len(sample) == 0:
percent_special_chars_in_sample[idx] = 0
continue
special_chars_in_sample = [char for char in sample if char in self.special_characters_deny_list]
percent_special_chars_in_sample[idx] = len(special_chars_in_sample) / len(sample)
for char in frozenset(special_chars_in_sample):
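The new guard returns a 0% ratio for empty samples instead of dividing by zero two lines later. A self-contained sketch of the guarded computation, assuming an empty text sample should simply count as containing no special characters:

import random

def percent_special(sample: str, deny_list: set, max_chars: int = 10_000) -> float:
    # Mirrors the guarded logic above (standalone sketch, not the check itself).
    if len(sample) > max_chars:
        sample = random.sample(sample, max_chars)  # random.sample accepts any sequence
    if len(sample) == 0:  # the new guard: avoids ZeroDivisionError below
        return 0.0
    special = [char for char in sample if char in deny_list]
    return len(special) / len(sample)

print(percent_special('', {'$', '#'}))      # 0.0 (previously a ZeroDivisionError)
print(percent_special('a$b#', {'$', '#'}))  # 0.5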
@@ -216,6 +216,8 @@ def condition(result: t.Dict[str, t.Any]):
for property_name, info in result.items():
if properties_to_ignore is not None and property_name in properties_to_ignore:
continue
if isinstance(info, str):
continue
if info['outlier_ratio'] > threshold:
failed_properties.append(property_name)
if info['outlier_ratio'] > worst_ratio:
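This condition now tolerates result values that are plain strings (for example, a message for a property whose outliers could not be computed) alongside the usual per-property stats dicts. A toy sketch of the skip; the message text is an assumed placeholder:

# Toy result dict as the condition sees it.
result = {
    'Text Length': {'outlier_ratio': 0.12},
    'Language': 'Outliers could not be calculated',  # assumed message shape
}

threshold = 0.05
failed = [name for name, info in result.items()
          if not isinstance(info, str) and info['outlier_ratio'] > threshold]
print(failed)  # ['Text Length']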
52 changes: 40 additions & 12 deletions deepchecks/nlp/checks/data_integrity/under_annotated_segments.py
@@ -9,7 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Module of the under annotated segments check."""
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
@@ -18,7 +18,7 @@
from deepchecks import ConditionCategory, ConditionResult
from deepchecks.core import CheckResult
from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp.utils.text import break_to_lines_and_trim
from deepchecks.nlp.utils.weak_segments import get_relevant_data_table
@@ -30,15 +30,19 @@
__all__ = ['UnderAnnotatedMetaDataSegments', 'UnderAnnotatedPropertySegments']

MAX_SAMPLES_IN_FIGURE = 1000
# The annotation ratio above which UnderAnnotatedSegments considers the data
# to be well annotated and skips the check
ANNOTATION_RATIO_THRESHOLD = 95.0
MIN_TEXT_SAMPLES = 10 # Min samples to calculate under annotated segments


class UnderAnnotatedSegments(SingleDatasetCheck, WeakSegmentAbstract):
"""Check for under annotated data segments."""

def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], None],
ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int,
segment_minimum_size_ratio: float, n_samples: int,
categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs):
super().__init__(**kwargs)
self.segment_by = segment_by
self.columns = columns
@@ -48,6 +52,8 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
self.n_samples = n_samples
self.n_to_show = n_to_show
self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD
self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -59,18 +65,30 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
n_top_features=self.n_top_features)

score_per_sample = pd.Series([1 - is_label_none(x) for x in text_data.label], index=features.index)
annotation_ratio = round(score_per_sample.sum() * 100 / text_data.n_samples, 2)
if annotation_ratio > self.annotation_ratio_threshold:
display_msg = f'Under annotated {self.segment_by} segments check is skipped since your data ' \
f'annotation ratio is > {self.annotation_ratio_threshold}%. Try increasing the ' \
'annotation_ratio_threshold parameter.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

if text_data.n_samples < MIN_TEXT_SAMPLES:
raise NotEnoughSamplesError(f'Not enough samples to calculate under annotated {self.segment_by} '
f'segments. Minimum {MIN_TEXT_SAMPLES} samples required.')

encoded_dataset = self._target_encode_categorical_features_fill_na(features, score_per_sample,
cat_features)

avg_score = round(score_per_sample.mean(), 3)
weak_segments = self._weak_segments_search(data=encoded_dataset.features_columns,
score_per_sample=score_per_sample,
scorer_name='Annotation Ratio')
scorer_name='Annotation Ratio',
multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
raise DeepchecksProcessError('Check was unable to find under annotated segments. This is expected if '
'your data is well annotated. If this is not the case, try increasing '
f'n_samples or supply more {self.segment_by}.')
display_msg = 'Check was unable to find under annotated segments. Try ' \
f'supplying more {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score)
display_msg = f'Showcasing intersections of {self.segment_by} that result in the most ' \
@@ -223,7 +241,7 @@ class UnderAnnotatedPropertySegments(UnderAnnotatedSegments):
Properties to check, if none are given checks all properties except ignored ones.
ignore_properties : Union[Hashable, List[Hashable]] , default: None
Properties to ignore, if none given checks based on properties variable
n_top_properties : int , default: 10
n_top_properties : Optional[int] , default: 10
Number of properties to use for segment search. Top properties are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -234,16 +252,20 @@
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: False
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
"""

def __init__(self,
properties: Union[Hashable, List[Hashable], None] = None,
ignore_properties: Union[Hashable, List[Hashable], None] = None,
n_top_properties: int = 15,
n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
n_samples: int = 10_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
multiple_segments_per_property: bool = False,
**kwargs):
super().__init__(segment_by='properties',
columns=properties,
@@ -253,6 +275,7 @@ def __init__(self,
n_samples=n_samples,
n_to_show=n_to_show,
categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)


@@ -274,7 +297,7 @@ class UnderAnnotatedMetaDataSegments(UnderAnnotatedSegments):
Columns to check, if none are given checks all columns except ignored ones.
ignore_columns : Union[Hashable, List[Hashable]] , default: None
Columns to ignore, if none given checks based on columns variable
n_top_columns : int , default: 10
n_top_columns : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -285,16 +308,20 @@
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_column : bool , default: True
If True, will allow the same metadata column to be a segmenting column in multiple segments,
otherwise each metadata column can appear in one segment at most.
"""

def __init__(self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: int = 10,
n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
n_samples: int = 10_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
multiple_segments_per_column: bool = True,
**kwargs):
super().__init__(segment_by='metadata',
columns=columns,
@@ -304,4 +331,5 @@ def __init__(self,
n_samples=n_samples,
n_to_show=n_to_show,
categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_column,
**kwargs)
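Taken together, the changes to this file make the check degrade gracefully: data annotated above the 95% threshold returns a message result instead of running, fewer than 10 samples raises NotEnoughSamplesError, and finding no segments returns a message result instead of raising DeepchecksProcessError. A hedged usage sketch of the new multiple_segments_per_property flag; the calculate_builtin_properties call reflects the 0.17 TextData API and is an assumption here:

from deepchecks.nlp import TextData
from deepchecks.nlp.checks import UnderAnnotatedPropertySegments

texts = [f'sample text number {i}' for i in range(20)]
labels = ['pos' if i % 2 else None for i in range(20)]  # half the samples unannotated
data = TextData(texts, label=labels, task_type='text_classification')
data.calculate_builtin_properties()  # assumed API name for property calculation

# A 50% annotation ratio is below the 95% skip threshold, and 20 samples
# clears the new 10-sample minimum, so the segment search actually runs.
check = UnderAnnotatedPropertySegments(multiple_segments_per_property=True)
result = check.run(data)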
@@ -17,7 +17,7 @@

from deepchecks.core import CheckResult
from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError
from deepchecks.core.errors import DeepchecksNotSupportedError, NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp.utils.weak_segments import get_relevant_data_table
from deepchecks.tabular.context import _DummyModel
@@ -27,6 +27,8 @@

__all__ = ['MetadataSegmentsPerformance', 'PropertySegmentsPerformance']

MIN_TEXT_SAMPLES = 10 # Min samples to calculate weak segments performance


class WeakSegmentsAbstractText(SingleDatasetCheck, WeakSegmentAbstract):
"""Check the performance of the model on different segments of the data."""
@@ -35,7 +37,8 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int],
segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Union[str, Callable]],
score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int,
categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
categorical_aggregation_threshold: float, n_to_show: int,
multiple_segments_per_feature: bool = False, **kwargs):
super().__init__(**kwargs)
self.segment_by = segment_by
self.columns = columns
@@ -47,6 +50,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
self.score_per_sample = score_per_sample
self.alternative_scorer = alternative_scorer if alternative_scorer else None
self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -55,6 +59,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
text_data = context.get_data_by_kind(dataset_kind)
text_data = text_data.sample(self.n_samples, random_state=context.random_state, drop_na_label=True)

if text_data.n_samples < MIN_TEXT_SAMPLES:
raise NotEnoughSamplesError(f'Not enough samples to find weak {self.segment_by} segments.'
f' Minimum {MIN_TEXT_SAMPLES} samples required.')
features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by,
columns=self.columns, ignore_columns=self.ignore_columns,
n_top_features=self.n_top_features)
@@ -102,11 +109,13 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
weak_segments = self._weak_segments_search(data=encoded_dataset.data, score_per_sample=score_per_sample,
label_col=pd.Series(original_label, index=score_per_sample.index),
feature_rank_for_search=np.asarray(encoded_dataset.features),
dummy_model=dummy_model, scorer=scorer)
dummy_model=dummy_model, scorer=scorer,
multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak '
f'segments. Try increasing n_samples or supply more {self.segment_by}.')
display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments. ' \
f'Try supplying additional {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

if context.with_display:
display = self._create_heatmap_display(data=encoded_dataset.data, weak_segments=weak_segments,
@@ -163,18 +172,22 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText):
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: False
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
"""

def __init__(self,
properties: Union[Hashable, List[Hashable], None] = None,
ignore_properties: Union[Hashable, List[Hashable], None] = None,
n_top_properties: Optional[int] = 15,
n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
score_per_sample: Union[np.ndarray, pd.Series, None] = None,
n_samples: int = 5_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
multiple_segments_per_property: bool = False,
**kwargs):
super().__init__(segment_by='properties',
columns=properties,
@@ -186,6 +199,7 @@ def __init__(self,
score_per_sample=score_per_sample,
alternative_scorer=alternative_scorer,
categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)


@@ -229,18 +243,22 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText):
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_column : bool , default: True
If True, will allow the same metadata column to be a segmenting column in multiple segments,
otherwise each metadata column can appear in one segment at most.
"""

def __init__(self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: Optional[int] = 15,
n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
score_per_sample: Union[np.ndarray, pd.Series, None] = None,
n_samples: int = 5_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
multiple_segments_column: bool = True,
**kwargs):
super().__init__(segment_by='metadata',
columns=columns,
@@ -252,4 +270,5 @@ def __init__(self,
score_per_sample=score_per_sample,
alternative_scorer=alternative_scorer,
categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_column,
**kwargs)
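As in the under-annotated checks, the failure modes here were softened: fewer than MIN_TEXT_SAMPLES labeled samples now raises NotEnoughSamplesError up front, and an untrainable error model returns a message result rather than raising DeepchecksProcessError. A hedged sketch of catching the new error; the TextData construction and the predictions keyword follow the 0.17 NLP API and are assumptions here:

from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import TextData
from deepchecks.nlp.checks import PropertySegmentsPerformance

tiny = TextData([f'text {i}' for i in range(5)],
                label=['a', 'b', 'a', 'b', 'a'],
                task_type='text_classification')

check = PropertySegmentsPerformance(multiple_segments_per_property=True)
try:
    result = check.run(tiny, predictions=['a', 'b', 'a', 'b', 'a'])
except NotEnoughSamplesError:
    # 5 labeled samples is below the new 10-sample minimum.
    print('Need at least 10 samples for weak segment search.')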
