Nb/feat/under annotated checks #2505

Merged · 26 commits · May 10, 2023
Changes from 17 commits

Commits (26)
2bc175d  simplify weak segments (Nadav-Barak, May 1, 2023)
61034a3  added comments (Nadav-Barak, May 2, 2023)
d86ff29  set seed (Nadav-Barak, May 2, 2023)
f2061b7  set seed v2 (Nadav-Barak, May 2, 2023)
6034470  pylint (Nadav-Barak, May 2, 2023)
5e19e6e  under annotated segments (Nadav-Barak, May 2, 2023)
4685036  Merge branch 'main' into NB/Feat/under_annotated_segments (Nadav-Barak, May 3, 2023)
e64b556  merge with main + error message on unknown_tokens.py (Nadav-Barak, May 3, 2023)
183ac41  refactor in weak segment + bug fix in vision (Nadav-Barak, May 3, 2023)
48b4223  bin for regression + CR (Nadav-Barak, May 4, 2023)
6d62c62  Merge branch 'main' into NB/Bug/loss_per_sample (Nadav-Barak, May 4, 2023)
c43d239  bug fix + test (Nadav-Barak, May 4, 2023)
ff17ebc  check + tests (Nadav-Barak, May 8, 2023)
f11d0d2  Merge branch 'main' into NB/Feat/under_annotated_checks (Nadav-Barak, May 8, 2023)
0d8c2ec  added plot (Nadav-Barak, May 8, 2023)
5cdfe24  Merge branch 'main' into NB/Feat/under_annotated_checks (Nadav-Barak, May 8, 2023)
09234c2  merge with main (Nadav-Barak, May 8, 2023)
3505eed  display for categorical (Nadav-Barak, May 9, 2023)
6eba061  additional test (Nadav-Barak, May 9, 2023)
4dfa633  Merge branch 'main' into NB/Feat/under_annotated_checks (Nadav-Barak, May 9, 2023)
cb6f653  test fix (Nadav-Barak, May 9, 2023)
5b323b0  Merge branch 'main' into NB/Feat/under_annotated_checks (Nadav-Barak, May 9, 2023)
2cafb05  test fix (Nadav-Barak, May 9, 2023)
de7da1c  Update metrics.py (noamzbr, May 10, 2023)
1202cde  Merge branch 'main' into NB/Feat/under_annotated_checks (noamzbr, May 10, 2023)
2e4521b  fix isort (noamzbr, May 10, 2023)
5 changes: 4 additions & 1 deletion deepchecks/nlp/checks/__init__.py
@@ -11,7 +11,8 @@
"""Module importing all nlp checks."""

from deepchecks.nlp.checks.data_integrity import (ConflictingLabels, PropertyLabelCorrelation, SpecialCharacters,
TextDuplicates, TextPropertyOutliers, UnknownTokens)
TextDuplicates, TextPropertyOutliers, UnderAnnotatedMetaDataSegments,
UnderAnnotatedPropertySegments, UnknownTokens)
from deepchecks.nlp.checks.model_evaluation import (ConfusionMatrixReport, MetadataSegmentsPerformance, PredictionDrift,
PropertySegmentsPerformance, SingleDatasetPerformance,
TrainTestPerformance)
@@ -25,6 +26,8 @@
'ConflictingLabels',
'SpecialCharacters',
'UnknownTokens',
'UnderAnnotatedMetaDataSegments',
'UnderAnnotatedPropertySegments',

# Model Evaluation
'SingleDatasetPerformance',
3 changes: 3 additions & 0 deletions deepchecks/nlp/checks/data_integrity/__init__.py
@@ -15,6 +15,7 @@
from .special_characters import SpecialCharacters
from .text_duplicates import TextDuplicates
from .text_property_outliers import TextPropertyOutliers
from .under_annotated_segments import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
from .unknown_tokens import UnknownTokens

__all__ = [
@@ -24,4 +25,6 @@
'ConflictingLabels',
'SpecialCharacters',
'UnknownTokens',
'UnderAnnotatedMetaDataSegments',
'UnderAnnotatedPropertySegments',
]
290 changes: 290 additions & 0 deletions deepchecks/nlp/checks/data_integrity/under_annotated_segments.py

Large diffs are not rendered by default.
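
Since the full diff of the new under_annotated_segments.py module is collapsed, here is a minimal usage sketch of the two checks it adds, pieced together from the tests included in this PR. The `text_data` variable is an assumption: any existing deepchecks.nlp.TextData object with partially annotated labels (None for un-annotated samples) and text properties or metadata attached.

# Sketch only: `text_data` is assumed to be a pre-built deepchecks.nlp.TextData
# object in which un-annotated samples carry a None label.
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments

# Segments defined by text properties with an unusually low annotation ratio.
property_check = UnderAnnotatedPropertySegments().add_condition_segments_annotation_ratio_greater_than(0.5)
property_result = property_check.run(text_data)

# Same idea, but segments are defined by user-supplied metadata columns.
metadata_check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than()
metadata_result = metadata_check.run(text_data)

# Each result carries the overall annotation ratio and the detected segments.
print(property_result.value['avg_score'])
print(property_result.value['weak_segments_list'].head())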

@@ -54,7 +54,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
context.raise_if_multi_label_task(self)

text_data = context.get_data_by_kind(dataset_kind)
text_data = text_data.sample(self.n_samples, random_state=context.random_state)
text_data = text_data.sample(self.n_samples, random_state=context.random_state, drop_na_label=True)

features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by,
columns=self.columns, ignore_columns=self.ignore_columns,
26 changes: 16 additions & 10 deletions deepchecks/utils/abstracts/weak_segment_abstract.py
@@ -145,11 +145,11 @@ def _create_heatmap_display(self, data: pd.DataFrame,
fig = px.imshow(scores, x=f1_labels, y=f2_labels, labels=labels, color_continuous_scale='rdylgn')
fig.update_traces(text=scores_text, texttemplate='%{text}')
if segment['Feature2']:
title = f'{score_title} (percent of data) {segment["Feature1"]} vs {segment["Feature2"]}'
title = f'{score_title} (percent of data)'
tab_name = f'{segment["Feature1"]} vs {segment["Feature2"]}'
else:
title = f'{score_title} (percent of data) {segment["Feature1"]}'
tab_name = f'{segment["Feature1"]}'
title = f'{score_title} (percent of data)'
tab_name = segment['Feature1']
fig.update_layout(
title=title,
height=600,
@@ -178,8 +178,9 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,
feature_rank_for_search = np.asarray(data.columns)

weak_segments = pd.DataFrame(
columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range', '% of Data'])
n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None\
columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range',
'% of Data', 'Samples in Segment'])
n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None \
else len(feature_rank_for_search)
for i in range(n_features):
for j in range(i + 1, n_features):
@@ -189,18 +190,24 @@
dummy_model, scorer)
if weak_segment_score is None or len(weak_segment_filter.filters) == 0:
continue
data_size = 100 * weak_segment_filter.filter(data).shape[0] / data.shape[0]
data_of_segment = weak_segment_filter.filter(data)
data_size = round(100 * data_of_segment.shape[0] / data.shape[0], 2)
filters = weak_segment_filter.filters
if len(filters.keys()) == 1:
weak_segments.loc[len(weak_segments)] = [weak_segment_score, list(filters.keys())[0],
tuple(list(filters.values())[0]), '',
None, data_size]
None, data_size, list(data_of_segment.index)]
else:
weak_segments.loc[len(weak_segments)] = [weak_segment_score, feature1,
tuple(filters[feature1]), feature2,
tuple(filters[feature2]), data_size]
tuple(filters[feature2]), data_size,
list(data_of_segment.index)]

return weak_segments.drop_duplicates().sort_values(score_title).reset_index(drop=True)
# Drop duplicates without considering column 'Samples in Segment'
result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment']

return result_no_duplicates.sort_values(score_title).reset_index(drop=True)

def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series,
label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None,
@@ -291,7 +298,6 @@ def _format_partition_vec_for_display(self, partition_vec: np.array, feature_nam

def _generate_check_result_value(self, weak_segments_df, cat_features: List[str], avg_score: float):
"""Generate a uniform format check result value for the different WeakSegmentsPerformance checks."""
pd.set_option('mode.chained_assignment', None)
weak_segments_output = weak_segments_df.copy()
for idx, segment in weak_segments_df.iterrows():
for feature in ['Feature1', 'Feature2']:
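
One detail in the _weak_segments_search change above: duplicate segments are now dropped while ignoring the new 'Samples in Segment' column, and the column is re-attached afterwards. A small standalone pandas sketch of that pattern, on toy data that is not part of the PR:

import pandas as pd

# Toy frame: the first two rows describe the same segment but list their samples in a different order.
segments = pd.DataFrame({
    'Score': [0.3, 0.3, 0.7],
    'Feature1': ['age', 'age', 'length'],
    'Samples in Segment': [[1, 2], [2, 1], [5]],
})

# drop_duplicates cannot hash list-valued cells, so that column is excluded first
# and re-attached for the surviving rows via their original index.
deduped = segments.drop(columns='Samples in Segment').drop_duplicates()
deduped['Samples in Segment'] = segments.loc[deduped.index, 'Samples in Segment']

print(deduped)  # two rows remain: one per unique (Score, Feature1) pair
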
2 changes: 1 addition & 1 deletion deepchecks/utils/single_sample_metrics.py
@@ -34,7 +34,7 @@ def calculate_neg_cross_entropy_per_sample(labels, probas: np.ndarray, model_cla
if model_classes is not None:
if any(x not in model_classes for x in labels):
raise DeepchecksValueError(
f'Label observed values {sorted(labels.unique())} contain values '
f'Label observed values {sorted(np.unique(labels))} contain values '
f'that are not found in the model classes: {model_classes}.')
if probas.shape[1] != len(model_classes):
raise DeepchecksValueError(
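
The single_sample_metrics.py change swaps labels.unique() for np.unique(labels), presumably because the labels passed to this helper are not always a pandas Series. A tiny illustrative sketch (not from the PR) of the difference:

import numpy as np
import pandas as pd

labels = np.asarray(['cat', 'dog', 'cat'])

# np.unique accepts plain arrays, lists and Series alike.
print(np.unique(labels))             # ['cat' 'dog']
print(np.unique(pd.Series(labels)))  # ['cat' 'dog']

# .unique() is a pandas method, so it fails on a bare ndarray.
try:
    labels.unique()
except AttributeError as err:
    print(err)  # 'numpy.ndarray' object has no attribute 'unique'
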
@@ -0,0 +1,78 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Test for the NLP UnderAnnotatedSegments check"""
import numpy as np
from hamcrest import assert_that, close_to, equal_to, has_items, calling, raises

from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
from tests.base.utils import equal_condition_result


def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
test._label = np.asarray(list(test._label[:round(len(test._label) / 2)]) + [None] * round(len(test._label) / 2),
dtype=object)

check = UnderAnnotatedPropertySegments().add_condition_segments_annotation_ratio_greater_than(0.5)
# Act
result = check.run(test)
condition_result = check.conditions_decision(result)

# Assert
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details=r'Most under annotated segment has annotation ratio of 31.4%.',
name=r'In all segments annotation ratio should be greater than 50%.')
))

assert_that(result.value['avg_score'], close_to(0.5, 0.001))
assert_that(len(result.value['weak_segments_list']), close_to(33, 1))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.314, 0.01))


def test_tweet_emotion_metadata(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata

test._label = np.asarray(list(test._label[:round(len(test._label) / 2)]) + [None] * round(len(test._label) / 2),
dtype=object)
check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than()
# Act
result = check.run(test)
condition_result = check.conditions_decision(result)

# Assert
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details='Found a segment with annotation ratio of 0.366 in comparison to an average score of 0.5 in sampled data.',
name='The relative performance of weakest segment is greater than 80% of average model performance.')
))

assert_that(result.value['avg_score'], close_to(0.5, 0.001))
assert_that(len(result.value['weak_segments_list']), equal_to(5))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.366, 0.01))
assert_that(result.value['weak_segments_list'].iloc[0, 1], equal_to('user_age'))


def test_tweet_emotion_metadata_fully_annotated(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than()

# Act & Assert
assert_that(
calling(check.run).with_args(test),
raises(DeepchecksProcessError, 'Check was unable to find under annotated segments. This is expected if '
'your data is well annotated. If this is not the case, try increasing '
'n_samples or supply more metadata.')
)
32 changes: 31 additions & 1 deletion tests/utils/metrics_test.py
@@ -11,7 +11,7 @@
"""Test metrics utils"""
import pandas as pd
from hamcrest import assert_that, calling, close_to, has_entries, is_, raises
from sklearn.metrics import make_scorer
from sklearn.metrics import make_scorer, log_loss, mean_squared_error

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.tabular import Dataset
@@ -20,6 +20,7 @@
false_positive_rate_metric,
true_negative_rate_metric)
from deepchecks.tabular.utils.task_inference import get_all_labels, infer_classes_from_model
from deepchecks.utils.single_sample_metrics import calculate_neg_cross_entropy_per_sample, calculate_neg_mse_per_sample
from tests.common import is_nan


@@ -115,6 +116,21 @@ def test_lending_club_true_negative_rate_scorer_binary(lending_club_split_datase
assert_that(score, close_to(0.767, 0.01))


def test_cross_entropy_lending_club(lending_club_split_dataset_and_model):
# Arrange
_, test_ds, clf = lending_club_split_dataset_and_model
probas = clf.predict_proba(test_ds.features_columns)
eps = 1e-15

# Act
score = calculate_neg_cross_entropy_per_sample(test_ds.label_col, probas, eps=eps)
score_sklearn = log_loss(test_ds.label_col, probas, eps=eps)

# Assert
assert_that(score.mean(), close_to(-0.524, 0.01))
assert_that(score.mean(), close_to(-score_sklearn, 0.01))


def test_iris_true_negative_rate_scorer_multiclass(iris_split_dataset_and_model):
# Arrange
_, test_ds, clf = iris_split_dataset_and_model
@@ -200,3 +216,17 @@ def test_scorer_with_only_new_labels_in_data(iris: pd.DataFrame, iris_adaboost):
assert_that(score, has_entries({
0: is_(0), 1: is_(0), 2: is_(0), 19: is_nan(), 20: is_nan()
}))


def test_mse_diabetes(diabetes_split_dataset_and_model):
# Arrange
_, test_ds, clf = diabetes_split_dataset_and_model
preds = clf.predict(test_ds.features_columns)

# Act
score = calculate_neg_mse_per_sample(test_ds.label_col, preds)
score_sklearn = mean_squared_error(test_ds.label_col, preds)

# Assert
assert_that(score.mean(), close_to(-3296, 1))
assert_that(score.mean(), close_to(-score_sklearn, 0.01))