Noam/dee 505 multilabel with many classes usecase #2494

Merged: 8 commits, May 7, 2023
Changes from all commits
6 changes: 5 additions & 1 deletion deepchecks/nlp/base_checks.py
@@ -54,7 +54,7 @@ def run(
predictions on dataset
probabilities: Union[TTextProba, None] , default: None
probabilities on dataset
model_classes: Optional[List, List[List], default: None
model_classes: Optional[List], default: None
For classification: list of classes known to the model
random_state : int, default 42
A seed to set for pseudo-random functions, primarily sampling.
@@ -99,6 +99,7 @@ def run(
test_predictions: Optional[TTextPred] = None,
train_probabilities: Optional[TTextProba] = None,
test_probabilities: Optional[TTextProba] = None,
model_classes: Optional[List] = None,
random_state: int = 42,
) -> CheckResult:
"""Run check.
@@ -121,6 +122,8 @@ def run(
probabilities on train dataset
test_probabilities: Union[TTextProba, None] , default: None
probabilities on test_dataset dataset
model_classes: Optional[List], default: None
For classification: list of classes known to the model
random_state : int, default 42
A seed to set for pseudo-random functions, primarily sampling.

@@ -134,6 +137,7 @@ def run(
test_pred=test_predictions,
train_proba=train_probabilities,
test_proba=test_probabilities,
model_classes=model_classes,
random_state=random_state,
with_display=with_display,
)
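As a usage illustration (not part of the diff): a minimal sketch of passing the new model_classes argument through a train-test check run, following the TextData construction and placeholder class names used in the test added further below.

from deepchecks.nlp.checks import TrainTestPerformance
from deepchecks.nlp.text_data import TextData

# Illustrative multilabel data: each label is a binary vector over three classes.
train = TextData(
    raw_text=['I think therefore I am' for _ in range(20)],
    label=[[0, 0, 1]] * 10 + [[0, 1, 1]] * 10,
    task_type='text_classification',
)
test = train.copy()

# model_classes names the label columns, in the order the model knows them.
result = TrainTestPerformance().run(
    train_dataset=train,
    test_dataset=test,
    train_predictions=list(train.label),
    test_predictions=list(test.label),
    model_classes=['a', 'b', 'c'],
)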
@@ -131,7 +131,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
'conflicting_samples': result_df,
}

if context.with_display is False:
if context.with_display is False or num_of_ambiguous_samples == 0:
return CheckResult(value=result_value)

ambiguous_samples['Text'] = ambiguous_samples['Text'].apply(self._truncate_text)
6 changes: 5 additions & 1 deletion deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -155,6 +155,8 @@ def create_pie_chart(self, all_unknown_words_counter, total_words):

# Truncate labels for display
labels = [truncate_string(label, self.max_text_length_for_display) for label in labels]
# round percentages to 2 decimal places after the percent
percentages = [round(percent, 2) for percent in percentages]

# Create pie chart with hover text and custom hover template
fig = go.Figure(data=[go.Pie(
@@ -170,7 +172,9 @@
)])

# Customize chart appearance
fig.update_layout(title=f'Words containing Unknown Tokens - {self.tokenizer.name_or_path} Tokenizer',
fig.update_layout(title=f'Words containing Unknown Tokens - {self.tokenizer.name_or_path} Tokenizer<br>'
f'({format_percent(sum(percentages) / 100.)} of all words)',
title_x=0.5,
legend_title='Words with Unknown Tokens')

return fig
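For clarity (illustrative, not from the diff): the rounding and the new title suffix combine roughly as below. This assumes format_percent is imported from deepchecks.utils.strings, as used elsewhere in the package, and the percentage values are made up.

from deepchecks.utils.strings import format_percent

percentages = [3.14159, 1.98765]                               # hypothetical per-word percentages
percentages = [round(percent, 2) for percent in percentages]   # -> [3.14, 1.99]
title_suffix = f'({format_percent(sum(percentages) / 100.)} of all words)'  # e.g. '(5.13% of all words)'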
@@ -31,16 +31,20 @@ class SingleDatasetPerformance(SingleDatasetCheck, BaseSingleDatasetPerformance)
scorers : Union[List[str], Dict[str, Union[str, Callable]]], default: None
List of scorers to use. If None, use default scorers.
Scorers can be supplied as a list of scorer names or as a dictionary of names and functions.
max_rows_to_display : int, default: 15
Maximum number of rows to display in the check result.
n_samples : int , default: 10_000
Maximum number of samples to use for this check.
"""

def __init__(self,
scorers: Union[List[str], Dict[str, Union[str, Callable]]] = None,
max_rows_to_display: int = 15,
n_samples: int = 10_000,
**kwargs):
super().__init__(**kwargs)
self.scorers = scorers
self.max_rows_to_display = max_rows_to_display
self.n_samples = n_samples

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
@@ -62,7 +66,12 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
results_df = pd.DataFrame(results, columns=['Class', 'Metric', 'Value'])

if context.with_display:
display = [results_df]
if len(results_df) > self.max_rows_to_display:
display = [results_df.iloc[:self.max_rows_to_display, :],
'<p style="font-size:0.9em;line-height:1;"><i>'
f'* Only showing first {self.max_rows_to_display} rows.']
else:
display = [results_df]
else:
display = []
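A hedged usage sketch of the new max_rows_to_display option; the dataset and predictions are made up, and the keyword names follow the single-dataset run docstring above. Only the displayed table is truncated; the full frame stays in the check's return value.

from deepchecks.nlp.checks import SingleDatasetPerformance
from deepchecks.nlp.text_data import TextData

dataset = TextData(
    raw_text=['great product', 'terrible product'] * 10,
    label=[1, 0] * 10,
    task_type='text_classification',
)

# Cap the displayed per-class/metric table at 10 rows.
check = SingleDatasetPerformance(max_rows_to_display=10)
result = check.run(dataset, predictions=[1, 0] * 10)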

@@ -102,7 +102,7 @@ def run_logic(self, context: Context) -> CheckResult:
for dataset_name, dataset in datasets.items():

if context.task_type is TaskType.TEXT_CLASSIFICATION and dataset.is_multi_label_classification():
n_samples_per_class = dict(enumerate(np.array(dataset.label).sum(axis=0)))
n_samples_per_class = dict(zip(context.model_classes, np.array(dataset.label).sum(axis=0)))
n_of_labels = sum(n_samples_per_class.values())

elif context.task_type is TaskType.TEXT_CLASSIFICATION:
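A small worked example of the changed line (class names assumed): with model_classes available, the per-class counts of a multilabel label matrix are keyed by class name instead of by column index.

import numpy as np

model_classes = ['anger', 'happiness', 'optimism']       # assumed order known to the model
labels = [[0, 0, 1],
          [0, 1, 1],
          [1, 1, 1]]                                      # one binary vector per sample

counts = np.array(labels).sum(axis=0)                     # samples per class column: [1 2 3]
n_samples_per_class = dict(zip(model_classes, counts))    # {'anger': 1, 'happiness': 2, 'optimism': 3}
n_of_labels = sum(n_samples_per_class.values())           # 6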
2 changes: 1 addition & 1 deletion deepchecks/nlp/context.py
@@ -191,7 +191,7 @@ def _validate_prediction(dataset: TextData, prediction: TTextPred, n_classes: in
@staticmethod
def _validate_classification_prediction(dataset: TextData, prediction: TTextPred, n_classes: int):
"""Validate prediction for given text classification dataset."""
classification_format_error = f'Check requires classification for {dataset.name} to be ' \
classification_format_error = f'Check requires classification predictions for {dataset.name} to be ' \
f'either a sequence that can be cast to a 1D numpy array of shape' \
f' (n_samples,), or a sequence of sequences that can be cast to a 2D ' \
f'numpy array of shape (n_samples, n_classes) for the multilabel case.'
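As an illustration of the two prediction formats this message refers to (values made up):

# Single-label classification: one prediction per sample -> castable to shape (n_samples,)
single_label_predictions = ['anger', 'optimism', 'anger']

# Multilabel classification: one binary vector per sample -> castable to shape (n_samples, n_classes)
multilabel_predictions = [[0, 0, 1],
                          [0, 1, 1],
                          [1, 0, 0]]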
8 changes: 6 additions & 2 deletions deepchecks/nlp/suite.py
@@ -8,9 +8,9 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module for base tabular abstractions."""
"""Module for base nlp suite."""
# pylint: disable=broad-except
from typing import Optional, Tuple, Union
from typing import List, Optional, Tuple, Union

from deepchecks.core import DatasetKind
from deepchecks.core.check_result import CheckFailure
@@ -42,6 +42,7 @@ def run(
test_predictions: Optional[TTextPred] = None,
train_probabilities: Optional[TTextProba] = None,
test_probabilities: Optional[TTextProba] = None,
model_classes: Optional[List] = None,
random_state: int = 42,
) -> SuiteResult:
"""Run all checks.
@@ -62,6 +63,8 @@
probabilities on train dataset
test_probabilities: Union[TTextProba, None] , default: None
probabilities on test_dataset dataset
model_classes: Optional[List], default: None
For classification: list of classes known to the model
random_state : int, default 42
A seed to set for pseudo-random functions, primarily sampling.

@@ -79,6 +82,7 @@
test_pred=test_predictions,
train_proba=train_probabilities,
test_proba=test_probabilities,
model_classes=model_classes,
with_display=with_display,
random_state=random_state
)
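A hedged sketch of passing model_classes through a suite run; the data and class list are placeholders, and the model_evaluation import mirrors the test added below.

from deepchecks.nlp.suites import model_evaluation
from deepchecks.nlp.text_data import TextData

train = TextData(raw_text=['so happy today', 'really angry now'] * 5,
                 label=['happiness', 'anger'] * 5,
                 task_type='text_classification')
test = train.copy()

suite = model_evaluation()
result = suite.run(train_dataset=train, test_dataset=test,
                   train_predictions=['happiness', 'anger'] * 5,
                   test_predictions=['happiness', 'anger'] * 5,
                   model_classes=['anger', 'happiness'])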
@@ -14,6 +14,7 @@
import pandas as pd
from hamcrest import *

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.nlp.checks import TrainTestPerformance
from deepchecks.nlp.text_data import TextData

@@ -93,6 +94,65 @@ def test_check_execution(self):

assert n_of_samples_per_class == expected_n_of_samples_per_class

def test_check_execution_with_model_classes(self):
train = TextData(
raw_text=['I think therefore I am' for _ in range(20)],
label=[
*([0, 0, 1] for _ in range(10)),
*([0, 1, 1] for _ in range(10))
],
task_type='text_classification'
)
test = train.copy()
check = TrainTestPerformance()
# Act
result = check.run(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
model_classes=['a', 'b', 'c']
)
# Assert
assert isinstance(result.value, pd.DataFrame), type(result.value)
assert set(result.value["Metric"]) == {"F1", "Precision", "Recall"}
assert set(result.value["Dataset"]) == {"Train", "Test"}

n_of_samples_per_class = (
result.value[(result.value["Metric"] == "F1") & (result.value["Dataset"] == "Train")]
.loc[:, ['Class', 'Number of samples']]
.groupby('Class')
.sum()
.to_dict()
)
expected_n_of_samples_per_class = {
'Number of samples': {'a': 0, 'b': 10, 'c': 20}
}

assert n_of_samples_per_class == expected_n_of_samples_per_class

def test_check_execution_with_wrong_model_classes(self):
train = TextData(
raw_text=['I think therefore I am' for _ in range(20)],
label=[
*([0, 0, 1] for _ in range(10)),
*([0, 1, 1] for _ in range(10))
],
task_type='text_classification'
)
test = train.copy()
check = TrainTestPerformance()

# Act & Assert
assert_that(calling(check.run).with_args(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
model_classes=['a', 'b', 'c', 'd']),
raises(DeepchecksValueError, 'Received model_classes of length 4, but data indicates labels of length 3')
)


class TestTokenClassification:

2 changes: 1 addition & 1 deletion tests/nlp/test_context.py
@@ -15,7 +15,7 @@
from deepchecks.nlp import Suite
from deepchecks.nlp.checks import LabelDrift, SingleDatasetPerformance

CLASSIFICATION_ERROR_FORMAT = r'Check requires classification for Train to be ' \
CLASSIFICATION_ERROR_FORMAT = r'Check requires classification predictions for Train to be ' \
r'either a sequence that can be cast to a 1D numpy array of shape' \
r' \(n_samples,\), or a sequence of sequences that can be cast to a 2D ' \
r'numpy array of shape \(n_samples, n_classes\) for the multilabel case.'
23 changes: 22 additions & 1 deletion tests/nlp/test_suites.py
@@ -9,7 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Test for the default suites"""
from deepchecks.nlp.suites import full_suite
from deepchecks.nlp.suites import model_evaluation, full_suite
from tests.common import get_expected_results_length, validate_suite_result


@@ -30,3 +30,24 @@ def test_full_suite(tweet_emotion_train_test_textdata, tweet_emotion_train_test_
# Assert
length = get_expected_results_length(suite, kwargs)
validate_suite_result(result, length)


def test_model_eval_suite_with_model_classes_argument(tweet_emotion_train_test_textdata,
tweet_emotion_train_test_predictions,
tweet_emotion_train_test_probabilities):
# Arrange
train_data, test_data = tweet_emotion_train_test_textdata
train_preds, test_preds = tweet_emotion_train_test_predictions
train_probas, test_probas = tweet_emotion_train_test_probabilities

kwargs = dict(train_dataset=train_data, test_dataset=test_data, train_predictions=train_preds,
test_predictions=test_preds, train_probabilities=train_probas, test_probabilities=test_probas,
model_classes=['anger', 'happiness', 'optimism', 'sadness'])

# Act
suite = model_evaluation()
result = suite.run(**kwargs)

# Assert
length = get_expected_results_length(suite, kwargs)
validate_suite_result(result, length)