Nb/feat/under annotated checks (#2505)

Nadav-Barak committed May 10, 2023
1 parent 8e73fe2 commit 45bc883
Showing 11 changed files with 489 additions and 20 deletions.
5 changes: 4 additions & 1 deletion deepchecks/nlp/checks/__init__.py
@@ -11,7 +11,8 @@
"""Module importing all nlp checks."""

from deepchecks.nlp.checks.data_integrity import (ConflictingLabels, PropertyLabelCorrelation, SpecialCharacters,
- TextDuplicates, TextPropertyOutliers, UnknownTokens)
+ TextDuplicates, TextPropertyOutliers, UnderAnnotatedMetaDataSegments,
+ UnderAnnotatedPropertySegments, UnknownTokens)
from deepchecks.nlp.checks.model_evaluation import (ConfusionMatrixReport, MetadataSegmentsPerformance, PredictionDrift,
PropertySegmentsPerformance, SingleDatasetPerformance,
TrainTestPerformance)
@@ -25,6 +26,8 @@
'ConflictingLabels',
'SpecialCharacters',
'UnknownTokens',
+ 'UnderAnnotatedMetaDataSegments',
+ 'UnderAnnotatedPropertySegments',

# Model Evaluation
'SingleDatasetPerformance',
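The two new checks are exported at the package root, so they can be run directly. A minimal usage sketch — the `TextData` construction here is illustrative, and `calculate_default_properties`/`set_metadata` are assumed from this release's API:

```python
import pandas as pd

from deepchecks.nlp import TextData
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments

# Illustrative dataset: None labels mark unannotated samples.
data = TextData(raw_text=['great product', 'awful', 'meh', 'would buy again'],
                label=['pos', 'neg', None, None],
                task_type='text_classification')

data.calculate_default_properties()          # property-based segments need text properties
UnderAnnotatedPropertySegments().run(data)

data.set_metadata(pd.DataFrame({'user_age': [22, 41, 35, 57]}))
UnderAnnotatedMetaDataSegments().run(data)   # metadata-based segments need metadata
```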
3 changes: 3 additions & 0 deletions deepchecks/nlp/checks/data_integrity/__init__.py
@@ -15,6 +15,7 @@
from .special_characters import SpecialCharacters
from .text_duplicates import TextDuplicates
from .text_property_outliers import TextPropertyOutliers
+ from .under_annotated_segments import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
from .unknown_tokens import UnknownTokens

__all__ = [
@@ -24,4 +25,6 @@
'ConflictingLabels',
'SpecialCharacters',
'UnknownTokens',
+ 'UnderAnnotatedMetaDataSegments',
+ 'UnderAnnotatedPropertySegments',
]
308 changes: 308 additions & 0 deletions deepchecks/nlp/checks/data_integrity/under_annotated_segments.py

Large diffs are not rendered by default.
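The new module's body is collapsed here. In spirit, the check assigns each sample a binary is-annotated score and then searches the property/metadata space for segments whose mean score (annotation ratio) falls far below the dataset average. A toy illustration of that scoring idea, not the actual implementation:

```python
import pandas as pd

def annotation_ratio_per_segment(feature: pd.Series, labels, bins: int = 4) -> pd.Series:
    """Toy version: per-sample score is 1 if annotated else 0; a segment's
    score is its mean, i.e. the segment's annotation ratio."""
    is_annotated = pd.Series([lbl is not None for lbl in labels], dtype=float)
    segments = pd.cut(feature, bins=bins)
    return is_annotated.groupby(segments).mean()  # low ratio = under annotated

# Short texts here are rarely annotated, so their bin scores poorly.
text_length = pd.Series([5, 7, 6, 120, 140, 150], name='Text Length')
labels = [None, None, 'pos', 'neg', 'pos', 'neg']
print(annotation_ratio_per_segment(text_length, labels))
```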

@@ -54,7 +54,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
context.raise_if_token_classification_task(self)

text_data = context.get_data_by_kind(dataset_kind)
- text_data = text_data.sample(self.n_samples, random_state=context.random_state)
+ text_data = text_data.sample(self.n_samples, random_state=context.random_state, drop_na_label=True)

features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by,
columns=self.columns, ignore_columns=self.ignore_columns,
@@ -132,7 +132,8 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText):
weakest segments in the data distribution for further improvement and visibility purposes.
The segments are based on the text properties - which are features extracted from the text, such as "language" and
"number of words".
"number of words". For more on properties, see the `NLP Properties Guide
<https://docs.deepchecks.com/stable/nlp/usage_guides/nlp_properties.html>`_.
In order to achieve this, the check trains several simple tree based models which try to predict the error of the
user provided model on the dataset. The relevant segments are detected by analyzing the different
@@ -197,7 +198,8 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText):
weakest segments in the data distribution for further improvement and visibility purposes.
The segments are based on the metadata - which is data that is not part of the text, but is related to it,
such as "user_id" and "user_age".
such as "user_id" and "user_age". For more on metadata, see the `NLP Metadata Guide
<https://docs.deepchecks.com/stable/nlp/usage_guides/nlp_metadata.html>`_.
In order to achieve this, the check trains several simple tree based models which try to predict the error of the
user provided model on the dataset. The relevant segments are detected by analyzing the different
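The docstrings above summarize the mechanism shared by all the weak-segment checks: fit small tree models on pairs of features to predict a per-sample score, then surface the worst-scoring regions. A simplified, self-contained illustration of that idea — not the deepchecks implementation; the score could be a per-sample loss (model evaluation) or an is-annotated indicator (the new checks):

```python
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

def weakest_leaf(features: pd.DataFrame, score_per_sample: pd.Series):
    """Fit a shallow tree to the per-sample score and return the mean score
    and row mask of the worst leaf, i.e. the weakest segment found."""
    tree = DecisionTreeRegressor(max_depth=2, min_samples_leaf=5, random_state=0)
    tree.fit(features, score_per_sample)
    leaf_ids = tree.apply(features)                       # leaf id per sample
    per_leaf = score_per_sample.groupby(leaf_ids).mean()  # score per segment
    worst = per_leaf.idxmin()
    return per_leaf[worst], leaf_ids == worst
```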
7 changes: 5 additions & 2 deletions deepchecks/nlp/suites/default_suites.py
@@ -19,7 +19,8 @@
from deepchecks.nlp.checks import (ConflictingLabels, LabelDrift, MetadataSegmentsPerformance, PredictionDrift,
PropertyDrift, PropertyLabelCorrelation, PropertySegmentsPerformance,
SpecialCharacters, TextDuplicates, TextPropertyOutliers, TrainTestPerformance,
- TrainTestSamplesMix, UnknownTokens)
+ TrainTestSamplesMix, UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments,
+ UnknownTokens)

__all__ = ['data_integrity', 'train_test_validation',
'model_evaluation', 'full_suite']
@@ -63,7 +64,9 @@ def data_integrity(n_samples: int = None,
TextDuplicates(**kwargs).add_condition_ratio_less_or_equal(),
ConflictingLabels(**kwargs).add_condition_ratio_of_conflicting_labels_less_or_equal(),
SpecialCharacters(**kwargs).add_condition_ratio_of_samples_with_special_characters_less_or_equal(),
- UnknownTokens(**kwargs).add_condition_ratio_of_unknown_words_less_or_equal()
+ UnknownTokens(**kwargs).add_condition_ratio_of_unknown_words_less_or_equal(),
+ UnderAnnotatedPropertySegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
+ UnderAnnotatedMetaDataSegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
)


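With this change the default data-integrity suite picks up the new checks automatically, each with its condition attached. A minimal invocation sketch, where `text_data` stands for any `deepchecks.nlp.TextData`:

```python
from deepchecks.nlp.suites import data_integrity

suite = data_integrity(n_samples=10_000)
result = suite.run(text_data)
result.show()
```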
3 changes: 2 additions & 1 deletion deepchecks/nlp/text_data.py
@@ -23,6 +23,7 @@
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.nlp.utils.text_properties import calculate_default_properties
from deepchecks.utils.logger import get_logger
+ from deepchecks.utils.metrics import is_label_none
from deepchecks.utils.validation import is_sequence_not_str

__all__ = ['TextData']
@@ -244,7 +245,7 @@ def sample(self: TDataset, n_samples: int, replace: bool = False, random_state:
"""
samples = np.arange(len(self))
if drop_na_label and self.has_label():
- samples = samples[pd.notnull(self._label)]
+ samples = samples[[not is_label_none(x) for x in self._label]]
n_samples = min(n_samples, len(samples))

np.random.seed(random_state)
33 changes: 23 additions & 10 deletions deepchecks/utils/abstracts/weak_segment_abstract.py
@@ -64,8 +64,12 @@ def _target_encode_categorical_features_fill_na(self, data: pd.DataFrame, label_
else:
label_as_int = pd.cut(label_col.astype('float64').fillna(label_col.mean()), bins=10, labels=False)
df_encoded = t_encoder.fit_transform(df_aggregated, pd.Series(label_as_int, index=df_aggregated.index))
+ # Convert categorical features to ordinal based on their encoded values and store the mapping
for col in cat_features:
- values_mapping[col] = pd.concat([df_encoded[col], df_aggregated[col]], axis=1).drop_duplicates()
+ df_encoded[col] = df_encoded[col].apply(sorted(df_encoded[col].unique()).index)
+ mapping = pd.concat([df_encoded[col], df_aggregated[col]], axis=1).drop_duplicates()
+ mapping.columns = ['encoded_value', 'original_category']
+ values_mapping[col] = mapping.sort_values(by='encoded_value')
else:
df_encoded = df_aggregated
self.encoder_mapping = values_mapping
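The added loop body is worth unpacking: `sorted(df_encoded[col].unique()).index` is the bound `list.index` method, so the `apply` call replaces every target-encoded value with its rank among the sorted unique encodings; the stored mapping then pairs each rank with its original category. The rank conversion in isolation:

```python
import pandas as pd

encoded = pd.Series([0.7, 0.2, 0.7, 0.4])              # target-encoded categories
ranks = encoded.apply(sorted(encoded.unique()).index)  # sorted uniques: [0.2, 0.4, 0.7]
print(list(ranks))                                     # [2, 0, 2, 1] - each value's rank
```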
@@ -145,11 +149,11 @@ def _create_heatmap_display(self, data: pd.DataFrame,
fig = px.imshow(scores, x=f1_labels, y=f2_labels, labels=labels, color_continuous_scale='rdylgn')
fig.update_traces(text=scores_text, texttemplate='%{text}')
if segment['Feature2']:
title = f'{score_title} (percent of data) {segment["Feature1"]} vs {segment["Feature2"]}'
title = f'{score_title} (percent of data)'
tab_name = f'{segment["Feature1"]} vs {segment["Feature2"]}'
else:
title = f'{score_title} (percent of data) {segment["Feature1"]}'
tab_name = f'{segment["Feature1"]}'
title = f'{score_title} (percent of data)'
tab_name = segment['Feature1']
fig.update_layout(
title=title,
height=600,
@@ -178,8 +182,9 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,
feature_rank_for_search = np.asarray(data.columns)

weak_segments = pd.DataFrame(
- columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range', '% of Data'])
- n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None\
+ columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range',
+ '% of Data', 'Samples in Segment'])
+ n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None \
else len(feature_rank_for_search)
for i in range(n_features):
for j in range(i + 1, n_features):
@@ -189,18 +194,24 @@
dummy_model, scorer)
if weak_segment_score is None or len(weak_segment_filter.filters) == 0:
continue
- data_size = 100 * weak_segment_filter.filter(data).shape[0] / data.shape[0]
+ data_of_segment = weak_segment_filter.filter(data)
+ data_size = round(100 * data_of_segment.shape[0] / data.shape[0], 2)
filters = weak_segment_filter.filters
if len(filters.keys()) == 1:
weak_segments.loc[len(weak_segments)] = [weak_segment_score, list(filters.keys())[0],
tuple(list(filters.values())[0]), '',
- None, data_size]
+ None, data_size, list(data_of_segment.index)]
else:
weak_segments.loc[len(weak_segments)] = [weak_segment_score, feature1,
tuple(filters[feature1]), feature2,
- tuple(filters[feature2]), data_size]
+ tuple(filters[feature2]), data_size,
+ list(data_of_segment.index)]

- return weak_segments.drop_duplicates().sort_values(score_title).reset_index(drop=True)
+ # Drop duplicates without considering column 'Samples in Segment'
+ result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
+ result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment']
+
+ return result_no_duplicates.sort_values(score_title).reset_index(drop=True)

def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series,
label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None,
@@ -267,6 +278,8 @@ def _format_partition_vec_for_display(self, partition_vec: np.array, feature_nam
"""Format partition vector for display. If seperator is None returns a list instead of a string."""
if feature_name == '':
return ['']
+ if not isinstance(partition_vec, np.ndarray):
+ partition_vec = np.asarray(partition_vec)

result = []
if feature_name in self.encoder_mapping.keys():
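A note on the rewritten dedup in `_weak_segments_search`: the new 'Samples in Segment' column holds Python lists, which are unhashable, so `drop_duplicates` on the full frame would raise `TypeError`; dropping the column first and re-attaching it by the surviving index sidesteps that. The pattern in isolation:

```python
import pandas as pd

df = pd.DataFrame({'score': [0.3, 0.3], 'feature': ['user_age', 'user_age'],
                   'samples': [[1, 2, 3], [1, 2, 3]]})

deduped = df.drop(columns='samples').drop_duplicates()  # lists would make this raise
deduped['samples'] = df.loc[deduped.index, 'samples']   # re-attach by surviving index
print(deduped)                                          # one row remains
```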
11 changes: 10 additions & 1 deletion deepchecks/utils/metrics.py
@@ -12,9 +12,10 @@
from typing import Union

import numpy as np
+ import pandas as pd
from sklearn.metrics._scorer import _BaseScorer

- __all__ = ['get_gain', 'get_scorer_name', 'averaging_mechanism']
+ __all__ = ['get_gain', 'get_scorer_name', 'averaging_mechanism', 'is_label_none']

from deepchecks.core.errors import DeepchecksValueError

@@ -85,3 +86,11 @@ def averaging_mechanism(averaging_method: str, scores_per_class, weights=None) -
return np.multiply(scores_per_class, weights).sum() / sum(weights)
else:
raise DeepchecksValueError(f'Unknown averaging {averaging_method}')


+ def is_label_none(label):
+ """Check if label (single label of a sample) is None."""
+ result = pd.isnull(label)
+ if isinstance(result, bool):
+ return result
+ return any(result)
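Unlike a bare `pd.isnull`, this helper collapses sequence labels (multilabel vectors, token annotations) to a single boolean per sample, which is what lets `TextData.sample` above build a flat mask. Expected behavior, for illustration:

```python
import numpy as np

from deepchecks.utils.metrics import is_label_none

is_label_none(None)                 # True  - unannotated classification sample
is_label_none('pos')                # False - scalar label
is_label_none(np.array([0, 1, 0]))  # False - multilabel sample, fully present
is_label_none([None, None])         # True  - any missing element marks it None
```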
2 changes: 1 addition & 1 deletion deepchecks/utils/single_sample_metrics.py
@@ -36,7 +36,7 @@ def calculate_neg_cross_entropy_per_sample(labels, probas: np.ndarray,
if model_classes is not None:
if any(x not in model_classes for x in labels):
raise DeepchecksValueError(
- f'Label observed values {sorted(labels.unique())} contain values '
+ f'Label observed values {sorted(np.unique(labels))} contain values '
f'that are not found in the model classes: {model_classes}.')
if probas.shape[1] != len(model_classes):
raise DeepchecksValueError(
97 changes: 97 additions & 0 deletions tests/nlp/checks/data_integrity/under_annotated_segments_test.py
@@ -0,0 +1,97 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Test for the NLP UnderAnnotatedSegments check"""
import numpy as np
from hamcrest import assert_that, close_to, equal_to, has_items, calling, raises

from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
from tests.base.utils import equal_condition_result


def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
test._label = np.asarray(list(test._label[:round(len(test._label) / 2)]) + [None] * round(len(test._label) / 2),
dtype=object)

check = UnderAnnotatedPropertySegments().add_condition_segments_annotation_ratio_greater_than(0.5)
# Act
result = check.run(test)
condition_result = check.conditions_decision(result)

# Assert
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details=r'Most under annotated segment has annotation ratio of 31.43%.',
name=r'In all segments annotation ratio should be greater than 50%.')
))

assert_that(result.value['avg_score'], close_to(0.5, 0.001))
assert_that(len(result.value['weak_segments_list']), close_to(33, 1))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.314, 0.01))


def test_tweet_emotion_metadata(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata

test._label = np.asarray(list(test._label[:round(len(test._label) / 2)]) + [None] * round(len(test._label) / 2),
dtype=object)
check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than()
# Act
result = check.run(test)
condition_result = check.conditions_decision(result)

# Assert
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details='Found a segment with annotation ratio of 0.366 in comparison to an average score of 0.5 in sampled data.',
name='The relative performance of weakest segment is greater than 80% of average model performance.')
))

assert_that(result.value['avg_score'], close_to(0.5, 0.001))
assert_that(len(result.value['weak_segments_list']), equal_to(5))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.366, 0.01))
assert_that(result.value['weak_segments_list'].iloc[0, 1], equal_to('user_age'))


def test_tweet_emotion_metadata_interesting_segment(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata

idx_to_change = test.metadata[(test.metadata['user_age'] > 30) & (test.metadata['user_region'] == 'Europe')].index
label = test._label.copy().astype(object)
label[idx_to_change] = None
test._label = label

# Act
result = UnderAnnotatedMetaDataSegments().run(test)

# Assert
assert_that(result.value['avg_score'], close_to(0.844, 0.001))
assert_that(len(result.value['weak_segments_list']), equal_to(6))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0, 0.01))
assert_that(result.value['weak_segments_list'].iloc[0, 1], equal_to('user_age'))


def test_tweet_emotion_metadata_fully_annotated(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than()

# Act & Assert
assert_that(
calling(check.run).with_args(test),
raises(DeepchecksProcessError, 'Check was unable to find under annotated segments. This is expected if '
'your data is well annotated. If this is not the case, try increasing '
'n_samples or supply more metadata.')
)
32 changes: 31 additions & 1 deletion tests/utils/metrics_test.py
@@ -11,7 +11,7 @@
"""Test metrics utils"""
import pandas as pd
from hamcrest import assert_that, calling, close_to, has_entries, is_, raises
- from sklearn.metrics import make_scorer
+ from sklearn.metrics import make_scorer, log_loss, mean_squared_error

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.tabular import Dataset
@@ -20,6 +20,7 @@
false_positive_rate_metric,
true_negative_rate_metric)
from deepchecks.tabular.utils.task_inference import get_all_labels, infer_classes_from_model
+ from deepchecks.utils.single_sample_metrics import calculate_neg_cross_entropy_per_sample, calculate_neg_mse_per_sample
from tests.common import is_nan


@@ -115,6 +116,21 @@ def test_lending_club_true_negative_rate_scorer_binary(lending_club_split_datase
assert_that(score, close_to(0.767, 0.01))


+ def test_cross_entropy_lending_club(lending_club_split_dataset_and_model):
+ # Arrange
+ _, test_ds, clf = lending_club_split_dataset_and_model
+ probas = clf.predict_proba(test_ds.features_columns)
+ eps = 1e-15
+
+ # Act
+ score = calculate_neg_cross_entropy_per_sample(test_ds.label_col, probas, eps=eps)
+ score_sklearn = log_loss(test_ds.label_col, probas, eps=eps)
+
+ # Assert
+ assert_that(score.mean(), close_to(-1 * 0.524, 0.01))
+ assert_that(score.mean(), close_to(-1 * score_sklearn, 0.01))


def test_iris_true_negative_rate_scorer_multiclass(iris_split_dataset_and_model):
# Arrange
_, test_ds, clf = iris_split_dataset_and_model
@@ -200,3 +216,17 @@ def test_scorer_with_only_new_labels_in_data(iris: pd.DataFrame, iris_adaboost):
assert_that(score, has_entries({
0: is_(0), 1: is_(0), 2: is_(0), 19: is_nan(), 20: is_nan()
}))


+ def test_mse_diabetes(diabetes_split_dataset_and_model):
+ # Arrange
+ _, test_ds, clf = diabetes_split_dataset_and_model
+ preds = clf.predict(test_ds.features_columns)
+
+ # Act
+ score = calculate_neg_mse_per_sample(test_ds.label_col, preds)
+ score_sklearn = mean_squared_error(test_ds.label_col, preds)
+
+ # Assert
+ assert_that(score.mean(), close_to(-1 * 3296, 1))
+ assert_that(score.mean(), close_to(-1 * score_sklearn, 0.01))
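Both new tests pin the per-sample scores to their sklearn aggregates: averaging the per-sample values should equal the negated `log_loss` / `mean_squared_error`. For the MSE case the per-sample form is presumably just the negated squared residual; a sketch of the invariant being tested:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([3.0, 1.5, 2.0])
y_pred = np.array([2.5, 1.0, 2.5])

per_sample = -np.square(y_true - y_pred)  # assumed per-sample neg-MSE form
assert np.isclose(per_sample.mean(), -mean_squared_error(y_true, y_pred))
```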
