Merge branch 'main' into 0.17.x

noamzbr committed Jul 27, 2023
2 parents 3adfe0a + fcd084e commit 5b0f2e2
Showing 52 changed files with 1,672 additions and 466 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -128,7 +128,7 @@ jobs:
with:
requirements: 'requirements-all.txt'
fail: 'Copyleft,Other,Error'
exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.5\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.7\.22|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
# pyzmq is Revised BSD https://github.com/zeromq/pyzmq/blob/main/examples/LICENSE
# debugpy is MIT https://github.com/microsoft/debugpy/blob/main/LICENSE
# certifi is MPL-2.0 https://github.com/certifi/python-certifi/blob/master/LICENSE
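The only functional change in this hunk is bumping the pinned certifi version in the license-check exclude list from 2023.5.7 to 2023.7.22. A minimal sketch of how such a regex exclude list behaves, assuming the action matches each pattern against a '<package> <version>'-style string (the exact string format the action uses is not shown here):

import re

# Abbreviated form of the exclude pattern from the workflow above.
EXCLUDE = re.compile(r'(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.7\.22|tqdm.*4\.65\.0)')

for candidate in ['certifi 2023.7.22', 'certifi 2023.5.7', 'tqdm 4.65.0']:
    print(candidate, '->', bool(EXCLUDE.search(candidate)))
# certifi 2023.7.22 -> True   (the new pin is excluded from license failures)
# certifi 2023.5.7 -> False   (the old pin no longer matches)
# tqdm 4.65.0 -> True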
1 change: 1 addition & 0 deletions deepchecks/analytics/anonymous_telemetry.py
@@ -47,5 +47,6 @@ def validate_latest_version():
' Deepchecks is frequently updated with major improvements. You should consider '
'upgrading via the "python -m pip install --upgrade deepchecks" command.',
deepchecks.__version__)
os.environ['DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY'] = 'True' # to ignore joblib
except Exception: # pylint: disable=broad-except
pass
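The new line above sets the opt-out environment variable once the upgrade notice has been logged, so child processes (such as joblib workers, per the inline comment) inherit it and skip repeat version lookups. A minimal sketch of the kind of guard such a flag feeds; the helper below is hypothetical, not the library's actual gating code:

import os

def _telemetry_enabled() -> bool:
    # Hypothetical guard; deepchecks' real gating logic lives elsewhere.
    return os.environ.get('DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY', '').lower() != 'true'

os.environ['DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY'] = 'True'
assert not _telemetry_enabled()  # later telemetry calls are now skipped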
2 changes: 2 additions & 0 deletions deepchecks/nlp/checks/data_integrity/conflicting_labels.py
@@ -88,6 +88,8 @@ def _get_labels(self, dataset):
labels = [tuple(np.where(row == 1)[0]) for row in dataset.label]
elif dataset.task_type is TaskType.TEXT_CLASSIFICATION:
labels = dataset.label
elif dataset.task_type is TaskType.OTHER:
raise DeepchecksValueError('Check is irrelevant when task type is not specified')
else:
raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')
return labels
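For context, the multilabel branch above converts each one-hot label row into a hashable tuple of active class indices, which is what lets the check compare labels across duplicate samples; the new TaskType.OTHER branch fails fast with a clear message instead of falling through to the generic 'unknown task type' error. A toy illustration of the tuple extraction:

import numpy as np

# Toy multilabel matrix: each row is one sample's one-hot label vector.
label_matrix = np.array([[1, 0, 1],
                         [0, 1, 0]])

# Same idiom as _get_labels above.
labels = [tuple(np.where(row == 1)[0]) for row in label_matrix]
print(labels)  # e.g. [(0, 2), (1,)]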
3 changes: 3 additions & 0 deletions deepchecks/nlp/checks/data_integrity/special_characters.py
@@ -102,6 +102,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
continue
if len(sample) > self.max_chars_to_review_per_sample:
sample = random.sample(sample, self.max_chars_to_review_per_sample)
if len(sample) == 0:
percent_special_chars_in_sample[idx] = 0
continue
special_chars_in_sample = [char for char in sample if char in self.special_characters_deny_list]
percent_special_chars_in_sample[idx] = len(special_chars_in_sample) / len(sample)
for char in frozenset(special_chars_in_sample):
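The new guard returns a 0% ratio for empty samples instead of dividing by zero two lines later. A self-contained sketch of the guarded computation, assuming an empty text sample should simply count as containing no special characters:

import random

def percent_special(sample: str, deny_list: set, max_chars: int = 10_000) -> float:
    # Mirrors the guarded logic above (standalone sketch, not the check itself).
    if len(sample) > max_chars:
        sample = random.sample(sample, max_chars)  # random.sample accepts any sequence
    if len(sample) == 0:  # the new guard: avoids ZeroDivisionError below
        return 0.0
    special = [char for char in sample if char in deny_list]
    return len(special) / len(sample)

print(percent_special('', {'$', '#'}))      # 0.0 (previously a ZeroDivisionError)
print(percent_special('a$b#', {'$', '#'}))  # 0.5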
@@ -216,6 +216,8 @@ def condition(result: t.Dict[str, t.Any]):
for property_name, info in result.items():
if properties_to_ignore is not None and property_name in properties_to_ignore:
continue
if isinstance(info, str):
continue
if info['outlier_ratio'] > threshold:
failed_properties.append(property_name)
if info['outlier_ratio'] > worst_ratio:
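This condition now tolerates result values that are plain strings (for example, a message for a property whose outliers could not be computed) alongside the usual per-property stats dicts. A toy sketch of the skip; the message text is an assumed placeholder:

# Toy result dict as the condition sees it.
result = {
    'Text Length': {'outlier_ratio': 0.12},
    'Language': 'Outliers could not be calculated',  # assumed message shape
}

threshold = 0.05
failed = [name for name, info in result.items()
          if not isinstance(info, str) and info['outlier_ratio'] > threshold]
print(failed)  # ['Text Length']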
52 changes: 40 additions & 12 deletions deepchecks/nlp/checks/data_integrity/under_annotated_segments.py
@@ -9,7 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Module of the under annotated segments check."""
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
@@ -18,7 +18,7 @@
from deepchecks import ConditionCategory, ConditionResult
from deepchecks.core import CheckResult
from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp.utils.text import break_to_lines_and_trim
from deepchecks.nlp.utils.weak_segments import get_relevant_data_table
@@ -30,15 +30,19 @@
__all__ = ['UnderAnnotatedMetaDataSegments', 'UnderAnnotatedPropertySegments']

MAX_SAMPLES_IN_FIGURE = 1000
# The annotation ratio above which UnderAnnotatedSegments considers the data
# to be well annotated and skips the check
ANNOTATION_RATIO_THRESHOLD = 95.0
MIN_TEXT_SAMPLES = 10 # Min samples to calculate under annotated segments


class UnderAnnotatedSegments(SingleDatasetCheck, WeakSegmentAbstract):
"""Check for under annotated data segments."""

def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], None],
ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int,
segment_minimum_size_ratio: float, n_samples: int,
categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs):
super().__init__(**kwargs)
self.segment_by = segment_by
self.columns = columns
@@ -48,6 +52,8 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
self.n_samples = n_samples
self.n_to_show = n_to_show
self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD
self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -59,18 +65,30 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
n_top_features=self.n_top_features)

score_per_sample = pd.Series([1 - is_label_none(x) for x in text_data.label], index=features.index)
annotation_ratio = round(score_per_sample.sum() * 100 / text_data.n_samples, 2)
if annotation_ratio > self.annotation_ratio_threshold:
display_msg = f'Under annotated {self.segment_by} segments check is skipped since your data ' \
f'annotation ratio is > {self.annotation_ratio_threshold}%. Try increasing the ' \
'annotation_ratio_threshold parameter.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

if text_data.n_samples < MIN_TEXT_SAMPLES:
raise NotEnoughSamplesError(f'Not enough samples to calculate under annotated {self.segment_by} '
f'segments. Minimum {MIN_TEXT_SAMPLES} samples required.')

encoded_dataset = self._target_encode_categorical_features_fill_na(features, score_per_sample,
cat_features)

avg_score = round(score_per_sample.mean(), 3)
weak_segments = self._weak_segments_search(data=encoded_dataset.features_columns,
score_per_sample=score_per_sample,
scorer_name='Annotation Ratio')
scorer_name='Annotation Ratio',
multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
raise DeepchecksProcessError('Check was unable to find under annotated segments. This is expected if '
'your data is well annotated. If this is not the case, try increasing '
f'n_samples or supply more {self.segment_by}.')
display_msg = 'Check was unable to find under annotated segments. Try ' \
f'supplying more {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score)
display_msg = f'Showcasing intersections of {self.segment_by} that result in the most ' \
@@ -223,7 +241,7 @@ class UnderAnnotatedPropertySegments(UnderAnnotatedSegments):
Properties to check, if none are given checks all properties except ignored ones.
ignore_properties : Union[Hashable, List[Hashable]] , default: None
Properties to ignore, if none given checks based on properties variable
n_top_properties : int , default: 10
n_top_properties : Optional[int] , default: 10
Number of properties to use for segment search. Top properties are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -234,16 +252,20 @@
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: False
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
"""

def __init__(self,
properties: Union[Hashable, List[Hashable], None] = None,
ignore_properties: Union[Hashable, List[Hashable], None] = None,
n_top_properties: int = 15,
n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
n_samples: int = 10_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
multiple_segments_per_property: bool = False,
**kwargs):
super().__init__(segment_by='properties',
columns=properties,
@@ -253,6 +275,7 @@ def __init__(self,
n_samples=n_samples,
n_to_show=n_to_show,
categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)


@@ -274,7 +297,7 @@ class UnderAnnotatedMetaDataSegments(UnderAnnotatedSegments):
Columns to check, if none are given checks all columns except ignored ones.
ignore_columns : Union[Hashable, List[Hashable]] , default: None
Columns to ignore, if none given checks based on columns variable
n_top_columns : int , default: 10
n_top_columns : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.
segment_minimum_size_ratio: float , default: 0.05
Minimum size ratio for segments. Will only search for segments of
@@ -285,16 +308,20 @@
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_column : bool , default: True
If True, will allow the same metadata column to be a segmenting column in multiple segments,
otherwise each metadata column can appear in one segment at most.
"""

def __init__(self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: int = 10,
n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
n_samples: int = 10_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
multiple_segments_per_column: bool = True,
**kwargs):
super().__init__(segment_by='metadata',
columns=columns,
@@ -304,4 +331,5 @@ def __init__(self,
n_samples=n_samples,
n_to_show=n_to_show,
categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_column,
**kwargs)
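Taken together, the changes to this file make the check degrade gracefully: data annotated above the 95% threshold returns a message result instead of running, fewer than 10 samples raises NotEnoughSamplesError, and finding no segments returns a message result instead of raising DeepchecksProcessError. A hedged usage sketch of the new multiple_segments_per_property flag; the calculate_builtin_properties call reflects the 0.17 TextData API and is an assumption here:

from deepchecks.nlp import TextData
from deepchecks.nlp.checks import UnderAnnotatedPropertySegments

texts = [f'sample text number {i}' for i in range(20)]
labels = ['pos' if i % 2 else None for i in range(20)]  # half the samples unannotated
data = TextData(texts, label=labels, task_type='text_classification')
data.calculate_builtin_properties()  # assumed API name for property calculation

# A 50% annotation ratio is below the 95% skip threshold, and 20 samples
# clears the new 10-sample minimum, so the segment search actually runs.
check = UnderAnnotatedPropertySegments(multiple_segments_per_property=True)
result = check.run(data)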
@@ -17,7 +17,7 @@

from deepchecks.core import CheckResult
from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError
from deepchecks.core.errors import DeepchecksNotSupportedError, NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp.utils.weak_segments import get_relevant_data_table
from deepchecks.tabular.context import _DummyModel
@@ -27,6 +27,8 @@

__all__ = ['MetadataSegmentsPerformance', 'PropertySegmentsPerformance']

MIN_TEXT_SAMPLES = 10 # Min samples to calculate weak segments performance


class WeakSegmentsAbstractText(SingleDatasetCheck, WeakSegmentAbstract):
"""Check the performance of the model on different segments of the data."""
@@ -35,7 +37,8 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int],
segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Union[str, Callable]],
score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int,
categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
categorical_aggregation_threshold: float, n_to_show: int,
multiple_segments_per_feature: bool = False, **kwargs):
super().__init__(**kwargs)
self.segment_by = segment_by
self.columns = columns
@@ -47,6 +50,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
self.score_per_sample = score_per_sample
self.alternative_scorer = alternative_scorer if alternative_scorer else None
self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.multiple_segments_per_feature = multiple_segments_per_feature

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
@@ -55,6 +59,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
text_data = context.get_data_by_kind(dataset_kind)
text_data = text_data.sample(self.n_samples, random_state=context.random_state, drop_na_label=True)

if text_data.n_samples < MIN_TEXT_SAMPLES:
raise NotEnoughSamplesError(f'Not enough samples to find weak {self.segment_by} segments.'
f' Minimum {MIN_TEXT_SAMPLES} samples required.')
features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by,
columns=self.columns, ignore_columns=self.ignore_columns,
n_top_features=self.n_top_features)
@@ -102,11 +109,13 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
weak_segments = self._weak_segments_search(data=encoded_dataset.data, score_per_sample=score_per_sample,
label_col=pd.Series(original_label, index=score_per_sample.index),
feature_rank_for_search=np.asarray(encoded_dataset.features),
dummy_model=dummy_model, scorer=scorer)
dummy_model=dummy_model, scorer=scorer,
multiple_segments_per_feature=self.multiple_segments_per_feature)

if len(weak_segments) == 0:
raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak '
f'segments. Try increasing n_samples or supply more {self.segment_by}.')
display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments. ' \
f'Try supplying additional {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

if context.with_display:
display = self._create_heatmap_display(data=encoded_dataset.data, weak_segments=weak_segments,
@@ -163,18 +172,22 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText):
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: False
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
"""

def __init__(self,
properties: Union[Hashable, List[Hashable], None] = None,
ignore_properties: Union[Hashable, List[Hashable], None] = None,
n_top_properties: Optional[int] = 15,
n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
score_per_sample: Union[np.ndarray, pd.Series, None] = None,
n_samples: int = 5_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
multiple_segments_per_property: bool = False,
**kwargs):
super().__init__(segment_by='properties',
columns=properties,
@@ -186,6 +199,7 @@ def __init__(self,
score_per_sample=score_per_sample,
alternative_scorer=alternative_scorer,
categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)


@@ -229,18 +243,22 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText):
number of segments with the weakest performance to show.
categorical_aggregation_threshold : float , default: 0.05
In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_column : bool , default: True
If True, will allow the same metadata column to be a segmenting column in multiple segments,
otherwise each metadata column can appear in one segment at most.
"""

def __init__(self,
columns: Union[Hashable, List[Hashable], None] = None,
ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: Optional[int] = 15,
n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,
alternative_scorer: Dict[str, Union[str, Callable]] = None,
score_per_sample: Union[np.ndarray, pd.Series, None] = None,
n_samples: int = 5_000,
categorical_aggregation_threshold: float = 0.05,
n_to_show: int = 3,
multiple_segments_column: bool = True,
**kwargs):
super().__init__(segment_by='metadata',
columns=columns,
@@ -252,4 +270,5 @@ def __init__(self,
score_per_sample=score_per_sample,
alternative_scorer=alternative_scorer,
categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_column,
**kwargs)
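As in the under-annotated checks, the failure modes here were softened: fewer than MIN_TEXT_SAMPLES labeled samples now raises NotEnoughSamplesError up front, and an untrainable error model returns a message result rather than raising DeepchecksProcessError. A hedged sketch of catching the new error; the TextData construction and the predictions keyword follow the 0.17 NLP API and are assumptions here:

from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import TextData
from deepchecks.nlp.checks import PropertySegmentsPerformance

tiny = TextData([f'text {i}' for i in range(5)],
                label=['a', 'b', 'a', 'b', 'a'],
                task_type='text_classification')

check = PropertySegmentsPerformance(multiple_segments_per_property=True)
try:
    result = check.run(tiny, predictions=['a', 'b', 'a', 'b', 'a'])
except NotEnoughSamplesError:
    # 5 labeled samples is below the new 10-sample minimum.
    print('Need at least 10 samples for weak segment search.')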
