Noam/dee 480 support weak segments in multilabel #2507

Merged · 13 commits · May 9, 2023
@@ -13,6 +13,7 @@

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

from deepchecks.core import CheckResult
from deepchecks.core.check_result import DisplayMap
@@ -51,7 +52,6 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
context.raise_if_token_classification_task(self)
context.raise_if_multi_label_task(self)

text_data = context.get_data_by_kind(dataset_kind)
text_data = text_data.sample(self.n_samples, random_state=context.random_state)
@@ -61,8 +61,19 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
n_top_features=self.n_top_features)

# Decide which scorer and score_per_sample to use in the algorithm run
encoded_dataset = self._target_encode_categorical_features_fill_na(features, text_data.label,
cat_features, is_cat_label=True)
is_multilabel = text_data.is_multi_label_classification()
if is_multilabel:
label = TruncatedSVD(1).fit_transform(text_data.label).squeeze()
is_cat_label = False
else:
label = text_data.label
is_cat_label = True
encoded_dataset = self._target_encode_categorical_features_fill_na(features, label,
cat_features, is_cat_label=is_cat_label)
# Replacing the label with the original label for multilabel
if is_multilabel:
listed_label = [list(x) for x in text_data.label]
encoded_dataset._data[encoded_dataset.label_name] = listed_label # pylint: disable=protected-access
if self.score_per_sample is not None:
score_per_sample = self.score_per_sample[list(features.index)]
scorer, dummy_model = None, None
@@ -77,7 +88,8 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
'rather than only predicted classes.')
y_proba = context.model.predict_proba(text_data)
score_per_sample = calculate_neg_cross_entropy_per_sample(text_data.label, np.asarray(y_proba),
context.model_classes)
is_multilabel=is_multilabel,
model_classes=context.model_classes)
else:
raise DeepchecksNotSupportedError('Weak segments performance check is not supported for '
f'{context.task_type}.')
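For orientation, a minimal standalone sketch (not part of the diff; data and shapes are assumed) of the label-reduction step used above: in the multilabel case the (n_samples, n_classes) label matrix is collapsed by TruncatedSVD into a single continuous value per sample, which the segment search then treats as a non-categorical label.

import numpy as np
from sklearn.decomposition import TruncatedSVD

# Hypothetical multi-hot label matrix: 6 samples, 3 classes.
y_multilabel = np.array([[1, 0, 1],
                         [0, 1, 1],
                         [1, 1, 0],
                         [0, 0, 1],
                         [1, 0, 0],
                         [0, 1, 0]])

# Project each row onto the first singular component; squeeze() turns the
# (n_samples, 1) result into a 1-D array usable as a numeric pseudo-label.
reduced_label = TruncatedSVD(n_components=1).fit_transform(y_multilabel).squeeze()
print(reduced_label.shape)  # (6,)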
1 change: 0 additions & 1 deletion deepchecks/nlp/context.py
@@ -113,7 +113,6 @@ def __init__(self,
if (y_pred is None) and (y_proba is not None):
if dataset.is_multi_label_classification():
y_pred = (np.array(y_proba) > 0.5) # TODO: Replace with user-configurable threshold
y_pred = [np.array(model_classes)[pred] for pred in y_pred]
else:
y_pred = np.argmax(np.array(y_proba), axis=-1)
y_pred = np.array(model_classes, dtype='str')[y_pred]
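A hedged illustration (values and class names are assumed) of the prediction derivation this hunk adjusts: multilabel probabilities are binarized per class at the fixed 0.5 threshold and kept as an indicator matrix, while single-label probabilities are arg-maxed and mapped to class names.

import numpy as np

model_classes = ['anger', 'joy', 'sadness']          # assumed class names
y_proba = np.array([[0.9, 0.2, 0.6],
                    [0.1, 0.7, 0.4]])

# Multilabel: one independent 0/1 decision per class column.
y_pred_multilabel = np.array(y_proba) > 0.5          # [[ True, False,  True], [False,  True, False]]

# Single-label: pick the highest-probability class and map it to its name.
y_pred_single = np.array(model_classes, dtype='str')[np.argmax(y_proba, axis=-1)]  # ['anger', 'joy']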
49 changes: 29 additions & 20 deletions deepchecks/tabular/context.py
@@ -63,32 +63,41 @@ class _DummyModel:
predictions: pd.DataFrame
proba: pd.DataFrame

def __init__(self,
test: Dataset,
y_proba_test: t.Optional[np.ndarray] = None,
y_pred_test: t.Optional[np.ndarray] = None,
train: t.Union[Dataset, None] = None,
y_pred_train: t.Optional[np.ndarray] = None,
y_proba_train: t.Optional[np.ndarray] = None,
validate_data_on_predict: bool = True,
model_classes: t.Optional[t.List] = None):

def __init__(
self,
test: Dataset,
y_proba_test: t.Optional[np.ndarray] = None,
y_pred_test: t.Optional[np.ndarray] = None,
train: t.Optional[Dataset] = None,
y_pred_train: t.Optional[np.ndarray] = None,
y_proba_train: t.Optional[np.ndarray] = None,
validate_data_on_predict: bool = True,
model_classes: t.Optional[t.List[t.Any]] = None
):
# TODO:
# constructor signature says that test cannot be `None`
# why do we check it then?
if train is not None and test is not None:
# check if datasets have same indexes
if set(train.data.index) & set(test.data.index):
train.data.index = map(lambda x: f'train-{x}', list(train.data.index))
test.data.index = map(lambda x: f'test-{x}', list(test.data.index))
get_logger().warning('train and test datasets have common index - adding "train"/"test"'
' prefixes. To avoid that provide datasets with no common indexes '
'or pass the model object instead of the predictions.')
train_index = train.data.index
test_index = test.data.index
if set(train_index) & set(test_index):
train.data.index = [f'train-{it}' for it in train_index]
test.data.index = [f'test-{it}' for it in test_index]
get_logger().warning(
'train and test datasets have common index - adding "train"/"test" '
'prefixes. To avoid that provide datasets with no common indexes '
'or pass the model object instead of the predictions.'
)

feature_df_list = []
predictions = []
probas = []

for dataset, y_pred, y_proba in zip([train, test],
[y_pred_train, y_pred_test],
[y_proba_train, y_proba_test]):
for dataset, y_pred, y_proba in (
(train, y_pred_train, y_proba_train),
(test, y_pred_test, y_proba_test),
):
if y_pred is not None and not isinstance(y_pred, np.ndarray):
y_pred = np.array(y_pred)
if y_proba is not None and not isinstance(y_proba, np.ndarray):
@@ -103,7 +112,7 @@ def __init__(self,
if len(y_pred.shape) > 1 and y_pred.shape[1] == 1:
y_pred = y_pred[:, 0]
ensure_predictions_shape(y_pred, dataset.data)
y_pred_ser = pd.Series(y_pred, index=dataset.data.index)
y_pred_ser = pd.Series(list(y_pred), index=dataset.data.index)
predictions.append(y_pred_ser)
if y_proba is not None:
ensure_predictions_proba(y_proba, y_pred)
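A small standalone illustration (data is assumed) of why the last change wraps y_pred in list(...): a 2-D multilabel prediction array cannot be passed to pd.Series directly, but a list of its rows can, so each sample keeps its full prediction vector.

import numpy as np
import pandas as pd

y_pred = np.array([[1, 0, 1],
                   [0, 1, 0]])                     # multilabel predictions, one row per sample
idx = ['test-0', 'test-1']

# pd.Series(y_pred, index=idx) raises "Data must be 1-dimensional" for a 2-D array.
y_pred_ser = pd.Series(list(y_pred), index=idx)    # each cell holds a length-3 array
print(y_pred_ser.loc['test-0'])                    # [1 0 1]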
9 changes: 9 additions & 0 deletions deepchecks/tabular/metric_utils/scorers.py
@@ -41,6 +41,7 @@
from deepchecks.utils.metrics import get_scorer_name
from deepchecks.utils.simple_models import PerfectModel
from deepchecks.utils.typing import BasicModel
from deepchecks.utils.validation import is_sequence_not_str

if TYPE_CHECKING:
from deepchecks import tabular # pylint: disable=unused-import; it is used for type annotations
@@ -267,6 +268,10 @@ def predict(self, data: pd.DataFrame) -> np.ndarray:
predictions = transfer_func(predictions)
# In case of multiclass with single label, convert into multi-label
elif self.model_classes:
# if multilabel convert from numpy array of lists to 2d numpy array
if len(predictions) != 0:
if is_sequence_not_str(next(iter(predictions))):
predictions = np.array([np.array(x) for x in predictions])
predictions = _transform_to_multi_label_format(predictions, self.model_classes)
return predictions

@@ -309,6 +314,10 @@ def _run_score(self, model, data: pd.DataFrame, label_col: pd.Series):
f'{label_col.unique()}')
label_col = label_col.map({self.model_classes[0]: 0, self.model_classes[1]: 1})
else:
# if multilabel convert from series of lists to 2d numpy array
if len(label_col) != 0:
if is_sequence_not_str(next(iter(label_col))):
label_col = np.array([np.array(x) for x in label_col])
label_col = _transform_to_multi_label_format(np.array(label_col), self.model_classes)

try:
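Both hunks above perform the same conversion; here is a hedged sketch with an assumed stand-in for the is_sequence_not_str utility and illustrative data: a column whose cells are per-sample label lists is stacked into an (n_samples, n_classes) matrix before the multilabel transform.

import numpy as np
import pandas as pd

def _is_sequence_not_str(value) -> bool:
    """Rough stand-in for deepchecks' is_sequence_not_str (assumption)."""
    return hasattr(value, '__len__') and not isinstance(value, (str, bytes))

label_col = pd.Series([[1, 0, 1], [0, 1, 0], [1, 1, 0]])       # per-sample label lists

if len(label_col) != 0 and _is_sequence_not_str(next(iter(label_col))):
    label_col = np.array([np.array(x) for x in label_col])

print(label_col.shape)  # (3, 3)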
44 changes: 24 additions & 20 deletions deepchecks/utils/single_sample_metrics.py
@@ -24,26 +24,30 @@ def calculate_neg_mse_per_sample(labels, predictions, index=None) -> pd.Series:
return pd.Series([-(y - y_pred) ** 2 for y, y_pred in zip(labels, predictions)], index=index)


def calculate_neg_cross_entropy_per_sample(labels, probas: np.ndarray, model_classes: Optional[List] = None,
index=None, eps=1e-15) -> pd.Series:
def calculate_neg_cross_entropy_per_sample(labels, probas: np.ndarray,
model_classes: Optional[List] = None,
index=None, is_multilabel: bool = False, eps=1e-15) -> pd.Series:
"""Calculate negative cross entropy per sample."""
if index is None and isinstance(labels, pd.Series):
index = labels.index

# transform categorical labels into integers
if model_classes is not None:
if any(x not in model_classes for x in labels):
raise DeepchecksValueError(
f'Label observed values {sorted(labels.unique())} contain values '
f'that are not found in the model classes: {model_classes}.')
if probas.shape[1] != len(model_classes):
raise DeepchecksValueError(
f'Predicted probabilities shape {probas.shape} does not match the number of classes found in'
f' the labels: {model_classes}.')
labels = pd.Series(labels).apply(list(model_classes).index)

num_samples, num_classes = probas.shape
one_hot_labels = np.zeros((num_samples, num_classes))
one_hot_labels[list(np.arange(num_samples)), list(labels)] = 1
if not is_multilabel:
if index is None and isinstance(labels, pd.Series):
index = labels.index

# transform categorical labels into integers
if model_classes is not None:
if any(x not in model_classes for x in labels):
raise DeepchecksValueError(
f'Label observed values {sorted(labels.unique())} contain values '
f'that are not found in the model classes: {model_classes}.')
if probas.shape[1] != len(model_classes):
raise DeepchecksValueError(
f'Predicted probabilities shape {probas.shape} does not match the number of classes found in'
f' the labels: {model_classes}.')
labels = pd.Series(labels).apply(list(model_classes).index)

num_samples, num_classes = probas.shape
one_hot_labels = np.zeros((num_samples, num_classes))
one_hot_labels[list(np.arange(num_samples)), list(labels)] = 1
else:
one_hot_labels = labels

return pd.Series(np.sum(one_hot_labels * np.log(probas + eps), axis=1), index=index)
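A worked example (numbers are assumed) of the multilabel branch above: the labels already form a multi-hot matrix, so they are used directly as one_hot_labels and the per-sample score is the row-wise sum of label * log(proba + eps).

import numpy as np
import pandas as pd

eps = 1e-15
labels = np.array([[1, 0, 1],
                   [0, 1, 0]])               # multi-hot labels
probas = np.array([[0.8, 0.3, 0.6],
                   [0.2, 0.9, 0.1]])         # per-class probabilities

neg_cross_entropy = pd.Series(np.sum(labels * np.log(probas + eps), axis=1))
# sample 0: log(0.8) + log(0.6) ≈ -0.73;  sample 1: log(0.9) ≈ -0.11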
@@ -9,13 +9,33 @@
# ----------------------------------------------------------------------------
#
"""Test for the NLP WeakSegmentsPerformance check"""
import numpy as np
import pytest
from hamcrest import assert_that, close_to, equal_to, has_items

from deepchecks.nlp.checks import MetadataSegmentsPerformance, PropertySegmentsPerformance
from tests.base.utils import equal_condition_result


@pytest.fixture
def multilabel_mock_dataset_and_probabilities(tweet_emotion_train_test_textdata):
"""Mock dataset and probabilities for multilabel classification"""
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = make_multilabel_classification(n_samples=3_000, n_features=10, n_classes=3, n_labels=2,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
probabilities = np.zeros(y_test.shape)
for label_dim in range(y.shape[1]):
clf = LogisticRegression(random_state=42).fit(X_train, y_train[:, label_dim])
probabilities[:, label_dim] = clf.predict_proba(X_test)[:, 1]
data = tweet_emotion_train_test_textdata[1].sample(len(probabilities), random_state=42)
data._label = y_test
return data, probabilities


def test_tweet_emotion(tweet_emotion_train_test_textdata, tweet_emotion_train_test_probabilities):
# Arrange
_, test = tweet_emotion_train_test_textdata
@@ -79,3 +99,26 @@ def test_warning_of_n_top_columns(tweet_emotion_train_test_textdata, tweet_emoti
_ = property_check.run(test, probabilities=test_probas)
with pytest.warns(UserWarning, match=metadata_warning):
_ = metadata_check.run(test, probabilities=test_probas)


def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities):
# Arrange
data, probabilities = multilabel_mock_dataset_and_probabilities
assert_that(data.is_multi_label_classification(), equal_to(True))
check = MetadataSegmentsPerformance().add_condition_segments_relative_performance_greater_than()
# Act
result = check.run(data, probabilities=probabilities)
condition_result = check.conditions_decision(result)

# Assert
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details='Found a segment with accuracy score of 0.395 in comparison to an average '
'score of 0.624 in sampled data.',
name='The relative performance of weakest segment is greater than 80% of average model '
'performance.')
))

assert_that(result.value['avg_score'], close_to(0.624, 0.001))
assert_that(len(result.value['weak_segments_list']), equal_to(5))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.395, 0.01))