Nb/feat/under annotated checks (#2505)

Nadav-Barak committed May 10, 2023
1 parent 8e73fe2 commit 45bc883
Showing 11 changed files with 489 additions and 20 deletions.
5 changes: 4 additions & 1 deletion deepchecks/nlp/checks/__init__.py
@@ -11,7 +11,8 @@
"""Module importing all nlp checks."""

from deepchecks.nlp.checks.data_integrity import (ConflictingLabels, PropertyLabelCorrelation, SpecialCharacters,
- TextDuplicates, TextPropertyOutliers, UnknownTokens)
+ TextDuplicates, TextPropertyOutliers, UnderAnnotatedMetaDataSegments,
+ UnderAnnotatedPropertySegments, UnknownTokens)
from deepchecks.nlp.checks.model_evaluation import (ConfusionMatrixReport, MetadataSegmentsPerformance, PredictionDrift,
PropertySegmentsPerformance, SingleDatasetPerformance,
TrainTestPerformance)
@@ -25,6 +26,8 @@
'ConflictingLabels',
'SpecialCharacters',
'UnknownTokens',
+ 'UnderAnnotatedMetaDataSegments',
+ 'UnderAnnotatedPropertySegments',

# Model Evaluation
'SingleDatasetPerformance',
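The two new checks are exported at the package root, so they can be run directly. A minimal usage sketch — the `TextData` construction here is illustrative, and `calculate_default_properties`/`set_metadata` are assumed from this release's API:

```python
import pandas as pd

from deepchecks.nlp import TextData
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments

# Illustrative dataset: None labels mark unannotated samples.
data = TextData(raw_text=['great product', 'awful', 'meh', 'would buy again'],
                label=['pos', 'neg', None, None],
                task_type='text_classification')

data.calculate_default_properties()          # property-based segments need text properties
UnderAnnotatedPropertySegments().run(data)

data.set_metadata(pd.DataFrame({'user_age': [22, 41, 35, 57]}))
UnderAnnotatedMetaDataSegments().run(data)   # metadata-based segments need metadata
```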
3 changes: 3 additions & 0 deletions deepchecks/nlp/checks/data_integrity/__init__.py
@@ -15,6 +15,7 @@
from .special_characters import SpecialCharacters
from .text_duplicates import TextDuplicates
from .text_property_outliers import TextPropertyOutliers
+ from .under_annotated_segments import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
from .unknown_tokens import UnknownTokens

__all__ = [
@@ -24,4 +25,6 @@
'ConflictingLabels',
'SpecialCharacters',
'UnknownTokens',
+ 'UnderAnnotatedMetaDataSegments',
+ 'UnderAnnotatedPropertySegments',
]
308 changes: 308 additions & 0 deletions deepchecks/nlp/checks/data_integrity/under_annotated_segments.py

Large diffs are not rendered by default.
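The new module's body is collapsed here. In spirit, the check assigns each sample a binary is-annotated score and then searches the property/metadata space for segments whose mean score (annotation ratio) falls far below the dataset average. A toy illustration of that scoring idea, not the actual implementation:

```python
import pandas as pd

def annotation_ratio_per_segment(feature: pd.Series, labels, bins: int = 4) -> pd.Series:
    """Toy version: per-sample score is 1 if annotated else 0; a segment's
    score is its mean, i.e. the segment's annotation ratio."""
    is_annotated = pd.Series([lbl is not None for lbl in labels], dtype=float)
    segments = pd.cut(feature, bins=bins)
    return is_annotated.groupby(segments).mean()  # low ratio = under annotated

# Short texts here are rarely annotated, so their bin scores poorly.
text_length = pd.Series([5, 7, 6, 120, 140, 150], name='Text Length')
labels = [None, None, 'pos', 'neg', 'pos', 'neg']
print(annotation_ratio_per_segment(text_length, labels))
```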

@@ -54,7 +54,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
context.raise_if_token_classification_task(self)

text_data = context.get_data_by_kind(dataset_kind)
- text_data = text_data.sample(self.n_samples, random_state=context.random_state)
+ text_data = text_data.sample(self.n_samples, random_state=context.random_state, drop_na_label=True)

features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by,
columns=self.columns, ignore_columns=self.ignore_columns,
@@ -132,7 +132,8 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText):
weakest segments in the data distribution for further improvement and visibility purposes.
The segments are based on the text properties - which are features extracted from the text, such as "language" and
"number of words".
"number of words". For more on properties, see the `NLP Properties Guide
<https://docs.deepchecks.com/stable/nlp/usage_guides/nlp_properties.html>`_.
In order to achieve this, the check trains several simple tree based models which try to predict the error of the
user provided model on the dataset. The relevant segments are detected by analyzing the different
@@ -197,7 +198,8 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText):
weakest segments in the data distribution for further improvement and visibility purposes.
The segments are based on the metadata - which is data that is not part of the text, but is related to it,
such as "user_id" and "user_age".
such as "user_id" and "user_age". For more on metadata, see the `NLP Metadata Guide
<https://docs.deepchecks.com/stable/nlp/usage_guides/nlp_metadata.html>`_.
In order to achieve this, the check trains several simple tree based models which try to predict the error of the
user provided model on the dataset. The relevant segments are detected by analyzing the different
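The docstrings above summarize the mechanism shared by all the weak-segment checks: fit small tree models on pairs of features to predict a per-sample score, then surface the worst-scoring regions. A simplified, self-contained illustration of that idea — not the deepchecks implementation; the score could be a per-sample loss (model evaluation) or an is-annotated indicator (the new checks):

```python
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

def weakest_leaf(features: pd.DataFrame, score_per_sample: pd.Series):
    """Fit a shallow tree to the per-sample score and return the mean score
    and row mask of the worst leaf, i.e. the weakest segment found."""
    tree = DecisionTreeRegressor(max_depth=2, min_samples_leaf=5, random_state=0)
    tree.fit(features, score_per_sample)
    leaf_ids = tree.apply(features)                       # leaf id per sample
    per_leaf = score_per_sample.groupby(leaf_ids).mean()  # score per segment
    worst = per_leaf.idxmin()
    return per_leaf[worst], leaf_ids == worst
```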
7 changes: 5 additions & 2 deletions deepchecks/nlp/suites/default_suites.py
@@ -19,7 +19,8 @@
from deepchecks.nlp.checks import (ConflictingLabels, LabelDrift, MetadataSegmentsPerformance, PredictionDrift,
PropertyDrift, PropertyLabelCorrelation, PropertySegmentsPerformance,
SpecialCharacters, TextDuplicates, TextPropertyOutliers, TrainTestPerformance,
- TrainTestSamplesMix, UnknownTokens)
+ TrainTestSamplesMix, UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments,
+ UnknownTokens)

__all__ = ['data_integrity', 'train_test_validation',
'model_evaluation', 'full_suite']
@@ -63,7 +64,9 @@ def data_integrity(n_samples: int = None,
TextDuplicates(**kwargs).add_condition_ratio_less_or_equal(),
ConflictingLabels(**kwargs).add_condition_ratio_of_conflicting_labels_less_or_equal(),
SpecialCharacters(**kwargs).add_condition_ratio_of_samples_with_special_characters_less_or_equal(),
- UnknownTokens(**kwargs).add_condition_ratio_of_unknown_words_less_or_equal()
+ UnknownTokens(**kwargs).add_condition_ratio_of_unknown_words_less_or_equal(),
+ UnderAnnotatedPropertySegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
+ UnderAnnotatedMetaDataSegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
)


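With this change the default data-integrity suite picks up the new checks automatically, each with its condition attached. A minimal invocation sketch, where `text_data` stands for any `deepchecks.nlp.TextData`:

```python
from deepchecks.nlp.suites import data_integrity

suite = data_integrity(n_samples=10_000)
result = suite.run(text_data)
result.show()
```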
3 changes: 2 additions & 1 deletion deepchecks/nlp/text_data.py
@@ -23,6 +23,7 @@
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.nlp.utils.text_properties import calculate_default_properties
from deepchecks.utils.logger import get_logger
+ from deepchecks.utils.metrics import is_label_none
from deepchecks.utils.validation import is_sequence_not_str

__all__ = ['TextData']
@@ -244,7 +245,7 @@ def sample(self: TDataset, n_samples: int, replace: bool = False, random_state:
"""
samples = np.arange(len(self))
if drop_na_label and self.has_label():
- samples = samples[pd.notnull(self._label)]
+ samples = samples[[not is_label_none(x) for x in self._label]]
n_samples = min(n_samples, len(samples))

np.random.seed(random_state)
33 changes: 23 additions & 10 deletions deepchecks/utils/abstracts/weak_segment_abstract.py
@@ -64,8 +64,12 @@ def _target_encode_categorical_features_fill_na(self, data: pd.DataFrame, label_
else:
label_as_int = pd.cut(label_col.astype('float64').fillna(label_col.mean()), bins=10, labels=False)
df_encoded = t_encoder.fit_transform(df_aggregated, pd.Series(label_as_int, index=df_aggregated.index))
+ # Convert categorical features to ordinal based on their encoded values and store the mapping
for col in cat_features:
- values_mapping[col] = pd.concat([df_encoded[col], df_aggregated[col]], axis=1).drop_duplicates()
+ df_encoded[col] = df_encoded[col].apply(sorted(df_encoded[col].unique()).index)
+ mapping = pd.concat([df_encoded[col], df_aggregated[col]], axis=1).drop_duplicates()
+ mapping.columns = ['encoded_value', 'original_category']
+ values_mapping[col] = mapping.sort_values(by='encoded_value')
else:
df_encoded = df_aggregated
self.encoder_mapping = values_mapping
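The added loop body is worth unpacking: `sorted(df_encoded[col].unique()).index` is the bound `list.index` method, so the `apply` call replaces every target-encoded value with its rank among the sorted unique encodings; the stored mapping then pairs each rank with its original category. The rank conversion in isolation:

```python
import pandas as pd

encoded = pd.Series([0.7, 0.2, 0.7, 0.4])              # target-encoded categories
ranks = encoded.apply(sorted(encoded.unique()).index)  # sorted uniques: [0.2, 0.4, 0.7]
print(list(ranks))                                     # [2, 0, 2, 1] - each value's rank
```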
@@ -145,11 +149,11 @@ def _create_heatmap_display(self, data: pd.DataFrame,
fig = px.imshow(scores, x=f1_labels, y=f2_labels, labels=labels, color_continuous_scale='rdylgn')
fig.update_traces(text=scores_text, texttemplate='%{text}')
if segment['Feature2']:
title = f'{score_title} (percent of data) {segment["Feature1"]} vs {segment["Feature2"]}'
title = f'{score_title} (percent of data)'
tab_name = f'{segment["Feature1"]} vs {segment["Feature2"]}'
else:
title = f'{score_title} (percent of data) {segment["Feature1"]}'
tab_name = f'{segment["Feature1"]}'
title = f'{score_title} (percent of data)'
tab_name = segment['Feature1']
fig.update_layout(
title=title,
height=600,
@@ -178,8 +182,9 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,
feature_rank_for_search = np.asarray(data.columns)

weak_segments = pd.DataFrame(
- columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range', '% of Data'])
- n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None\
+ columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range',
+ '% of Data', 'Samples in Segment'])
+ n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None \
else len(feature_rank_for_search)
for i in range(n_features):
for j in range(i + 1, n_features):
@@ -189,18 +194,24 @@
dummy_model, scorer)
if weak_segment_score is None or len(weak_segment_filter.filters) == 0:
continue
- data_size = 100 * weak_segment_filter.filter(data).shape[0] / data.shape[0]
+ data_of_segment = weak_segment_filter.filter(data)
+ data_size = round(100 * data_of_segment.shape[0] / data.shape[0], 2)
filters = weak_segment_filter.filters
if len(filters.keys()) == 1:
weak_segments.loc[len(weak_segments)] = [weak_segment_score, list(filters.keys())[0],
tuple(list(filters.values())[0]), '',
- None, data_size]
+ None, data_size, list(data_of_segment.index)]
else:
weak_segments.loc[len(weak_segments)] = [weak_segment_score, feature1,
tuple(filters[feature1]), feature2,
- tuple(filters[feature2]), data_size]
+ tuple(filters[feature2]), data_size,
+ list(data_of_segment.index)]

- return weak_segments.drop_duplicates().sort_values(score_title).reset_index(drop=True)
+ # Drop duplicates without considering column 'Samples in Segment'
+ result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
+ result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment']
+
+ return result_no_duplicates.sort_values(score_title).reset_index(drop=True)

def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series,
label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None,
@@ -267,6 +278,8 @@ def _format_partition_vec_for_display(self, partition_vec: np.array, feature_nam
"""Format partition vector for display. If seperator is None returns a list instead of a string."""
if feature_name == '':
return ['']
+ if not isinstance(partition_vec, np.ndarray):
+ partition_vec = np.asarray(partition_vec)

result = []
if feature_name in self.encoder_mapping.keys():
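A note on the rewritten dedup in `_weak_segments_search`: the new 'Samples in Segment' column holds Python lists, which are unhashable, so `drop_duplicates` on the full frame would raise `TypeError`; dropping the column first and re-attaching it by the surviving index sidesteps that. The pattern in isolation:

```python
import pandas as pd

df = pd.DataFrame({'score': [0.3, 0.3], 'feature': ['user_age', 'user_age'],
                   'samples': [[1, 2, 3], [1, 2, 3]]})

deduped = df.drop(columns='samples').drop_duplicates()  # lists would make this raise
deduped['samples'] = df.loc[deduped.index, 'samples']   # re-attach by surviving index
print(deduped)                                          # one row remains
```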
11 changes: 10 additions & 1 deletion deepchecks/utils/metrics.py
@@ -12,9 +12,10 @@
from typing import Union

import numpy as np
+ import pandas as pd
from sklearn.metrics._scorer import _BaseScorer

- __all__ = ['get_gain', 'get_scorer_name', 'averaging_mechanism']
+ __all__ = ['get_gain', 'get_scorer_name', 'averaging_mechanism', 'is_label_none']

from deepchecks.core.errors import DeepchecksValueError

@@ -85,3 +86,11 @@ def averaging_mechanism(averaging_method: str, scores_per_class, weights=None) -
return np.multiply(scores_per_class, weights).sum() / sum(weights)
else:
raise DeepchecksValueError(f'Unknown averaging {averaging_method}')


+ def is_label_none(label):
+ """Check if label (single label of a sample) is None."""
+ result = pd.isnull(label)
+ if isinstance(result, bool):
+ return result
+ return any(result)
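Unlike a bare `pd.isnull`, this helper collapses sequence labels (multilabel vectors, token annotations) to a single boolean per sample, which is what lets `TextData.sample` above build a flat mask. Expected behavior, for illustration:

```python
import numpy as np

from deepchecks.utils.metrics import is_label_none

is_label_none(None)                 # True  - unannotated classification sample
is_label_none('pos')                # False - scalar label
is_label_none(np.array([0, 1, 0]))  # False - multilabel sample, fully present
is_label_none([None, None])         # True  - any missing element marks it None
```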
2 changes: 1 addition & 1 deletion deepchecks/utils/single_sample_metrics.py
@@ -36,7 +36,7 @@ def calculate_neg_cross_entropy_per_sample(labels, probas: np.ndarray,
if model_classes is not None:
if any(x not in model_classes for x in labels):
raise DeepchecksValueError(
- f'Label observed values {sorted(labels.unique())} contain values '
+ f'Label observed values {sorted(np.unique(labels))} contain values '
f'that are not found in the model classes: {model_classes}.')
if probas.shape[1] != len(model_classes):
raise DeepchecksValueError(
97 changes: 97 additions & 0 deletions tests/nlp/checks/data_integrity/under_annotated_segments_test.py
@@ -0,0 +1,97 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Test for the NLP UnderAnnotatedSegments check"""
import numpy as np
from hamcrest import assert_that, close_to, equal_to, has_items, calling, raises

from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
from tests.base.utils import equal_condition_result


def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
test._label = np.asarray(list(test._label[:round(len(test._label) / 2)]) + [None] * round(len(test._label) / 2),
dtype=object)

check = UnderAnnotatedPropertySegments().add_condition_segments_annotation_ratio_greater_than(0.5)
# Act
result = check.run(test)
condition_result = check.conditions_decision(result)

# Assert
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details=r'Most under annotated segment has annotation ratio of 31.43%.',
name=r'In all segments annotation ratio should be greater than 50%.')
))

assert_that(result.value['avg_score'], close_to(0.5, 0.001))
assert_that(len(result.value['weak_segments_list']), close_to(33, 1))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.314, 0.01))


def test_tweet_emotion_metadata(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata

test._label = np.asarray(list(test._label[:round(len(test._label) / 2)]) + [None] * round(len(test._label) / 2),
dtype=object)
check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than()
# Act
result = check.run(test)
condition_result = check.conditions_decision(result)

# Assert
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details='Found a segment with annotation ratio of 0.366 in comparison to an average score of 0.5 in sampled data.',
name='The relative performance of weakest segment is greater than 80% of average model performance.')
))

assert_that(result.value['avg_score'], close_to(0.5, 0.001))
assert_that(len(result.value['weak_segments_list']), equal_to(5))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.366, 0.01))
assert_that(result.value['weak_segments_list'].iloc[0, 1], equal_to('user_age'))


def test_tweet_emotion_metadata_interesting_segment(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata

idx_to_change = test.metadata[(test.metadata['user_age'] > 30) & (test.metadata['user_region'] == 'Europe')].index
label = test._label.copy().astype(object)
label[idx_to_change] = None
test._label = label

# Act
result = UnderAnnotatedMetaDataSegments().run(test)

# Assert
assert_that(result.value['avg_score'], close_to(0.844, 0.001))
assert_that(len(result.value['weak_segments_list']), equal_to(6))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0, 0.01))
assert_that(result.value['weak_segments_list'].iloc[0, 1], equal_to('user_age'))


def test_tweet_emotion_metadata_fully_annotated(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than()

# Act & Assert
assert_that(
calling(check.run).with_args(test),
raises(DeepchecksProcessError, 'Check was unable to find under annotated segments. This is expected if '
'your data is well annotated. If this is not the case, try increasing '
'n_samples or supply more metadata.')
)
32 changes: 31 additions & 1 deletion tests/utils/metrics_test.py
@@ -11,7 +11,7 @@
"""Test metrics utils"""
import pandas as pd
from hamcrest import assert_that, calling, close_to, has_entries, is_, raises
- from sklearn.metrics import make_scorer
+ from sklearn.metrics import make_scorer, log_loss, mean_squared_error

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.tabular import Dataset
@@ -20,6 +20,7 @@
false_positive_rate_metric,
true_negative_rate_metric)
from deepchecks.tabular.utils.task_inference import get_all_labels, infer_classes_from_model
+ from deepchecks.utils.single_sample_metrics import calculate_neg_cross_entropy_per_sample, calculate_neg_mse_per_sample
from tests.common import is_nan


@@ -115,6 +116,21 @@ def test_lending_club_true_negative_rate_scorer_binary(lending_club_split_datase
assert_that(score, close_to(0.767, 0.01))


+ def test_cross_entropy_lending_club(lending_club_split_dataset_and_model):
+ # Arrange
+ _, test_ds, clf = lending_club_split_dataset_and_model
+ probas = clf.predict_proba(test_ds.features_columns)
+ eps = 1e-15
+
+ # Act
+ score = calculate_neg_cross_entropy_per_sample(test_ds.label_col, probas, eps=eps)
+ score_sklearn = log_loss(test_ds.label_col, probas, eps=eps)
+
+ # Assert
+ assert_that(score.mean(), close_to(-1 * 0.524, 0.01))
+ assert_that(score.mean(), close_to(-1 * score_sklearn, 0.01))


def test_iris_true_negative_rate_scorer_multiclass(iris_split_dataset_and_model):
# Arrange
_, test_ds, clf = iris_split_dataset_and_model
@@ -200,3 +216,17 @@ def test_scorer_with_only_new_labels_in_data(iris: pd.DataFrame, iris_adaboost):
assert_that(score, has_entries({
0: is_(0), 1: is_(0), 2: is_(0), 19: is_nan(), 20: is_nan()
}))


+ def test_mse_diabetes(diabetes_split_dataset_and_model):
+ # Arrange
+ _, test_ds, clf = diabetes_split_dataset_and_model
+ preds = clf.predict(test_ds.features_columns)
+
+ # Act
+ score = calculate_neg_mse_per_sample(test_ds.label_col, preds)
+ score_sklearn = mean_squared_error(test_ds.label_col, preds)
+
+ # Assert
+ assert_that(score.mean(), close_to(-1 * 3296, 1))
+ assert_that(score.mean(), close_to(-1 * score_sklearn, 0.01))
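Both new tests pin the per-sample scores to their sklearn aggregates: averaging the per-sample values should equal the negated `log_loss` / `mean_squared_error`. For the MSE case the per-sample form is presumably just the negated squared residual; a sketch of the invariant being tested:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([3.0, 1.5, 2.0])
y_pred = np.array([2.5, 1.0, 2.5])

per_sample = -np.square(y_true - y_pred)  # assumed per-sample neg-MSE form
assert np.isclose(per_sample.mean(), -mean_squared_error(y_true, y_pred))
```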
