Add min_samples and n_top_classes to TrainTestPerformance in NLP (#2558)
* Fixed the bug with 110% degradation 

* Filtered classes without enough samples

* Show top classes
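
A minimal usage sketch of the new parameters (the train/test datasets and prediction variables below are placeholders, not part of this commit):

from deepchecks.nlp.checks.model_evaluation.train_test_performance import TrainTestPerformance

check = TrainTestPerformance(
    min_samples=30,                   # per-class scores are nullified below this sample count
    n_top_classes=10,                 # show only the 10 largest classes in the label graphs
    show_classes_by='test_largest',   # rank classes by their size in the test set
)
# result = check.run(train_dataset=train, test_dataset=test,
#                    train_predictions=train_preds, test_predictions=test_preds)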
nirhutnik committed May 27, 2023
1 parent ce15432 commit 7c031aa
Showing 3 changed files with 157 additions and 24 deletions.
74 changes: 59 additions & 15 deletions deepchecks/nlp/checks/model_evaluation/train_test_performance.py
@@ -10,16 +10,19 @@
#
"""Module containing the Train-Test Performance check."""
import typing as t
from collections import Counter
from numbers import Number

import numpy as np
import pandas as pd

from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.nlp.metric_utils.scorers import infer_on_text_data
from deepchecks.nlp.task_type import TaskType
from deepchecks.nlp.text_data import TextData
from deepchecks.nlp.utils.token_classification_utils import clean_iob_prefixes
from deepchecks.utils.abstracts.train_test_performace import TrainTestPerformanceAbstract

__all__ = ['TrainTestPerformance']
@@ -33,6 +36,17 @@ class TrainTestPerformance(TrainTestPerformanceAbstract, TrainTestCheck):
scorers: Union[Mapping[str, Union[str, Callable]], List[str]], default: None
Scorers to override the default scorers, find more about the supported formats at
https://docs.deepchecks.com/stable/user-guide/general/metrics_guide.html
min_samples: int, default: 30
Minimum number of samples required to calculate model performance. If the scorer is per-class,
this is the minimum number of samples per class.
n_top_classes: int, default: 10
Number of top classes to show in the label graphs. The top classes are determined by the show_classes_by
parameter. If None, all classes are shown.
show_classes_by: str, default: 'test_largest'
Specify which classes to show for label graphs, as the number of shown classes is limited
by n_top_classes. Possible values:
- 'train_largest': Show the n_top_classes largest classes in the train dataset.
- 'test_largest': Show the n_top_classes largest classes in the test dataset.
n_samples : int , default: 1_000_000
number of samples to use for this check.
random_state : int, default: 42
@@ -74,18 +88,27 @@ def my_mse(y_true, y_pred):
"""

def __init__(
self,
scorers: t.Union[
t.Mapping[str, t.Union[str, t.Callable]],
t.List[str],
None
] = None,
n_samples: int = 1_000_000,
random_state: int = 42,
**kwargs
self,
scorers: t.Union[
t.Mapping[str, t.Union[str, t.Callable]],
t.List[str],
None
] = None,
min_samples: int = 30,
n_top_classes: int = 10,
show_classes_by: str = 'test_largest',
n_samples: int = 1_000_000,
random_state: int = 42,
**kwargs
):
super().__init__(**kwargs)
self.scorers = scorers
self.min_samples = min_samples
self.n_top_classes = n_top_classes
if n_top_classes and show_classes_by not in ['train_largest', 'test_largest']:
raise DeepchecksValueError(f'Invalid value for show_classes_by: {show_classes_by}. Allowed values are '
'"train_largest" and "test_largest".')
self.show_classes_by = show_classes_by
self.n_samples = n_samples
self.random_state = random_state

@@ -111,9 +134,8 @@ def run_logic(self, context: Context) -> CheckResult:
n_of_labels = len(label)

elif context.task_type is TaskType.TOKEN_CLASSIFICATION:
# TODO:
n_samples_per_class = {}
n_of_labels = 0
n_samples_per_class = Counter(clean_iob_prefixes(np.concatenate(dataset.label)))
n_of_labels = sum(n_samples_per_class.values())

else:
raise NotImplementedError()
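
A rough, self-contained illustration of the per-class counting added above for token classification; the tag-splitting below is a stand-in for the clean_iob_prefixes utility imported at the top of the file:

from collections import Counter

import numpy as np

# Toy token-classification labels in IOB format (one list of tags per sample).
labels = [['B-PER', 'I-PER', 'O'], ['B-ORG', 'O', 'O']]
flat = np.concatenate(labels)                    # flatten all token tags into one array
cleaned = [tag.split('-')[-1] for tag in flat]   # stand-in for clean_iob_prefixes
print(Counter(cleaned))                          # Counter({'O': 3, 'PER': 2, 'ORG': 1})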
@@ -155,13 +177,35 @@ def run_logic(self, context: Context) -> CheckResult:
]
)

# Nullify rows with less than min_samples:
results_df.loc[results_df['Number of samples'] < self.min_samples, 'Value'] = None
classes_without_enough_samples = results_df[results_df['Class'].notna() & results_df['Value'].isna()]['Class'] \
.unique().tolist()

# Show only top n classes:
if self.n_top_classes:
samples_per_class = results_df[results_df['Class'].notna()][['Class', 'Dataset', 'Number of samples']] \
.drop_duplicates()
samples_per_class = samples_per_class[~samples_per_class['Class'].isin(classes_without_enough_samples)]

if self.show_classes_by == 'train_largest':
top_classes_to_show = samples_per_class[samples_per_class['Dataset'] == 'Train'] \
.sort_values('Number of samples', ascending=False).head(self.n_top_classes)['Class'].tolist()
else: # self.show_classes_by == 'test_largest':
top_classes_to_show = samples_per_class[samples_per_class['Dataset'] == 'Test'] \
.sort_values('Number of samples', ascending=False).head(self.n_top_classes)['Class'].tolist()
else:
top_classes_to_show = None

if context.with_display is False:
figures = None
else:
figures = self._prepare_display(
results_df,
train_dataset.name or 'Train',
test_dataset.name or 'Test'
results=results_df,
train_dataset_name=train_dataset.name or 'Train',
test_dataset_name=test_dataset.name or 'Test',
classes_without_enough_samples=classes_without_enough_samples,
top_classes_to_show=top_classes_to_show
)

return CheckResult(
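A self-contained sketch of the filtering run_logic now applies to the results table, with made-up numbers: per-class scores below min_samples are nullified, and only the largest remaining classes of the chosen dataset are kept for display.

import pandas as pd

min_samples, n_top_classes, show_classes_by = 30, 2, 'test_largest'

# Made-up results table mirroring the columns the check produces.
results_df = pd.DataFrame({
    'Dataset': ['Train', 'Test'] * 3,
    'Class': ['a', 'a', 'b', 'b', 'c', 'c'],
    'Number of samples': [100, 90, 40, 35, 10, 8],
    'Metric': ['F1'] * 6,
    'Value': [0.9, 0.8, 0.7, 0.6, 0.5, 0.4],
})

# Nullify scores for classes with too few samples.
results_df.loc[results_df['Number of samples'] < min_samples, 'Value'] = None
dropped = results_df[results_df['Class'].notna() & results_df['Value'].isna()]['Class'].unique().tolist()

# Rank the remaining classes by their size in the requested dataset and keep the top n.
per_class = results_df[['Class', 'Dataset', 'Number of samples']].drop_duplicates()
per_class = per_class[~per_class['Class'].isin(dropped)]
rank_by = 'Train' if show_classes_by == 'train_largest' else 'Test'
top = (per_class[per_class['Dataset'] == rank_by]
       .sort_values('Number of samples', ascending=False)
       .head(n_top_classes)['Class']
       .tolist())
print(dropped, top)  # ['c'] ['a', 'b']
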
40 changes: 33 additions & 7 deletions deepchecks/utils/abstracts/train_test_performace.py
@@ -31,10 +31,12 @@ class TrainTestPerformanceAbstract(abc.ABC):
add_condition: t.Callable[..., t.Any]

def _prepare_display(
self,
results: pd.DataFrame,
train_dataset_name: str,
test_dataset_name: str,
self,
results: pd.DataFrame,
train_dataset_name: str,
test_dataset_name: str,
classes_without_enough_samples: t.Optional[t.List[str]] = None,
top_classes_to_show: t.Optional[t.List[str]] = None
):
display_df = results.replace({
'Dataset': {
@@ -47,6 +49,19 @@ def _prepare_display(
data_scorers_per_class = display_df[results['Class'].notna()]
data_scorers_per_dataset = display_df[results['Class'].isna()].drop(columns=['Class'])

# Filter classes without enough samples and get display comment for them:
if classes_without_enough_samples:
data_scorers_per_class = \
data_scorers_per_class.loc[~data_scorers_per_class['Class'].isin(classes_without_enough_samples)]

# Filter top classes to show:
if top_classes_to_show:
not_shown_classes = list(set(data_scorers_per_class['Class'].unique()) - set(top_classes_to_show))
data_scorers_per_class = \
data_scorers_per_class.loc[data_scorers_per_class['Class'].isin(top_classes_to_show)]
else:
not_shown_classes = None

for data in (data_scorers_per_dataset, data_scorers_per_class):
if data.shape[0] == 0:
continue
@@ -85,6 +100,17 @@ def _prepare_display(
)
)

# Add comments about not shown classes:
df = pd.DataFrame({}, columns=['Reason', 'Classes']).set_index('Reason')
if not_shown_classes:
df.loc[f'Not shown classes (showing only top {len(top_classes_to_show)})'] = str(not_shown_classes)
if classes_without_enough_samples:
df.loc[f'Classes without enough samples in either {train_dataset_name} or {test_dataset_name}'] = \
str(classes_without_enough_samples)

if not df.empty:
figures.append(df)

return figures

def add_condition_test_performance_greater_than(self: Self, min_score: float) -> Self:
@@ -111,9 +137,9 @@ def add_condition_train_test_relative_degradation_less_than(self: Self, threshol
return self.add_condition(name, condition)

def add_condition_class_performance_imbalance_ratio_less_than(
self: Self,
score: str,
threshold: float = 0.3,
self: Self,
score: str,
threshold: float = 0.3,
) -> Self:
"""Add condition - relative ratio difference between highest-class and lowest-class is less than threshold.
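_prepare_display also appends a small summary table listing which classes were hidden and why; a rough sketch of its content, with made-up class names:

import pandas as pd

# Sketch of the summary table appended to the display (class names are made up).
df = pd.DataFrame({}, columns=['Reason', 'Classes']).set_index('Reason')
df.loc['Not shown classes (showing only top 2)'] = str(['c'])
df.loc['Classes without enough samples in either Train or Test'] = str(['d'])
print(df['Classes'].to_dict())
# {'Not shown classes (showing only top 2)': "['c']",
#  'Classes without enough samples in either Train or Test': "['d']"}
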
67 changes: 65 additions & 2 deletions tests/nlp/checks/model_evaluation/train_test_performance_test.py
@@ -68,7 +68,7 @@ def test_check_execution(self):
task_type='text_classification'
)
test = train.copy()
check = TrainTestPerformance()
check = TrainTestPerformance(min_samples=0)
# Act
result = check.run(
train_dataset=train,
@@ -153,10 +153,55 @@ def test_check_execution_with_wrong_model_classes(self):
raises(DeepchecksValueError, 'Received model_classes of length 4, but data indicates labels of length 3')
)

def test_display_params(self):
# Arrange
train = TextData(
raw_text=['I think therefore I am' for _ in range(100)],
label=[
*([0, 0, 1] for _ in range(50)),
*([0, 1, 1] for _ in range(50))
],
task_type='text_classification'
)
test = train.copy()
check = TrainTestPerformance(min_samples=101)
# Act
result = check.run(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
)
# Assert
assert isinstance(result.value, pd.DataFrame), type(result.value)
assert set(result.value["Metric"]) == {"F1", "Precision", "Recall"}
assert set(result.value["Dataset"]) == {"Train", "Test"}
assert result.value["Value"].notna().sum() == 0 # all values are NaNs

check = TrainTestPerformance(n_top_classes=1, show_classes_by='test_largest')
# Act
result = check.run(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
)
# Assert
assert isinstance(result.value, pd.DataFrame), type(result.value)
assert set(result.value["Metric"]) == {"F1", "Precision", "Recall"}
assert set(result.value["Dataset"]) == {"Train", "Test"}
assert isinstance(result.display[1], pd.DataFrame)
assert result.display[1]['Classes'].loc['Not shown classes (showing only top 1)'] == '[1]'
assert result.display[1]['Classes'].loc['Classes without enough samples in either Train or Test'] == '[0]'
assert result.display[0].data[0]['x'].shape == (1,) # Make sure x-axis has only 1 class

assert_that(calling(TrainTestPerformance).with_args(show_classes_by='blabla'),
raises(DeepchecksValueError))


class TestTokenClassification:

def test_check_execution(self, small_wikiann_train_test_text_data):
def test_check_execution_macro(self, small_wikiann_train_test_text_data):
# Arrange
train, test = small_wikiann_train_test_text_data
scorers = ["recall_macro", "f1_macro"]
@@ -173,3 +218,21 @@ def test_check_execution(self, small_wikiann_train_test_text_data):
assert set(result.value["Metric"]) == set(scorers)
assert set(result.value["Dataset"]) == {"Train", "Test"}
assert set(result.value["Value"]) == {1.0}

def test_check_execution_micro(self, small_wikiann_train_test_text_data):
# Arrange
train, test = small_wikiann_train_test_text_data
check = TrainTestPerformance(min_samples=50)
# Act
result = check.run(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
)
# Assert
assert isinstance(result.value, pd.DataFrame), type(result.value)
assert set(result.value["Dataset"]) == {"Train", "Test"}
assert set(result.value[result.value["Value"].notna()]["Value"]) == {1.0}
assert set(result.value[(result.value["Value"].notna()) & (result.value["Dataset"] == 'Train')]["Class"])\
== {'PER', 'ORG'} # LOC has only 49 samples
