
Commit

Change condition of simple model check to use gain (#488)
* Change condition of simple model check to use gain

* Fix tests

* Fix notebooks

* Update condition

* Add tests

* Add tests to models

* Add header license

* Add changes to the gain

* Update NBs for bressler
matanper committed Jan 5, 2022
1 parent 421d332 commit a54f6fe
Showing 8 changed files with 1,012 additions and 676 deletions.
233 changes: 143 additions & 90 deletions deepchecks/checks/performance/simple_model_comparison.py
@@ -9,6 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Module containing simple comparison check."""
from collections import defaultdict
from typing import Callable, Dict, Hashable, List, cast

import numpy as np
@@ -22,17 +23,18 @@
from deepchecks import CheckResult, Dataset
from deepchecks.base.check import ConditionResult, TrainTestBaseCheck
from deepchecks.checks.distribution.preprocessing import ScaledNumerics
from deepchecks.utils.strings import format_number
from deepchecks.utils.strings import format_percent
from deepchecks.utils.validation import validate_model
from deepchecks.errors import DeepchecksValueError
from deepchecks.utils.metrics import (
task_type_check,
ModelType,
initialize_multi_scorers,
get_scorers_list,
get_scores_ratio,
get_scorer_single
get_scorer_single,
get_gain
)
from deepchecks.utils.models import RandomModel


__all__ = ['SimpleModelComparison']
@@ -51,8 +53,8 @@ class SimpleModelComparison(TrainTestBaseCheck):
alternative_scorers : Dict[str, Callable], default None
An optional dictionary of scorer title to scorer functions/names. If none given, using default scorers.
For description about scorers see Notes below.
maximum_ratio : int
the ratio can be up to infinity so choose maximum value to limit to.
max_gain : float
the maximum allowed absolute value of the gain; the gain is clipped to the range [-max_gain, max_gain]
max_depth : int
the max depth of the tree (used only if simple model type is tree).
random_state : int
@@ -89,11 +91,11 @@ def my_mse(y_true, y_pred):
"""

def __init__(self, simple_model_type: str = 'constant', alternative_scorers: Dict[str, Callable] = None,
maximum_ratio: int = 50, max_depth: int = 3, random_state: int = 42):
max_gain: float = 50, max_depth: int = 3, random_state: int = 42):
super().__init__()
self.simple_model_type = simple_model_type
self.alternative_scorers = initialize_multi_scorers(alternative_scorers)
self.maximum_ratio = maximum_ratio
self.max_gain = max_gain
self.max_depth = max_depth
self.random_state = random_state

@@ -142,13 +144,27 @@ def run(
# Multiclass have different return type from the scorer, list of score per class instead of single score
if task_type in [ModelType.MULTICLASS, ModelType.BINARY]:
n_samples = label.groupby(label).count()
classes = test_dataset.classes

results_array = []
# Dict in format { Scorer : Dict { Class : Dict { Origin/Simple : score } } }
results_dict = {}
for scorer in scorers:
model_dict = defaultdict(dict)
for model_name, model_type, model_instance in models:
for class_score, class_value in zip(scorer(model_instance, test_dataset), classes):
model_dict[class_value][model_type] = class_score
results_array.append([model_name,
model_type,
class_score,
scorer.name,
class_value,
n_samples[class_value]
])
results_dict[scorer.name] = model_dict

results_df = pd.DataFrame(
[
[model_name, model_type, class_score, scorer.name, class_value, n_samples[class_value]]
for model_name, model_type, model_instance in models
for scorer in scorers # scorer returns numpy array with item per class
for class_score, class_value in zip(scorer(model_instance, test_dataset), test_dataset.classes)
],
results_array,
columns=['Model', 'Type', 'Value', 'Metric', 'Class', 'Number of samples']
)

@@ -170,16 +186,26 @@
)

else:
classes = None

results_array = []
# Dict in format { Scorer : Dict { Origin/Simple : score } }
results_dict = {}
for scorer in scorers:
model_dict = defaultdict(dict)
for model_name, model_type, model_instance in models:
score = scorer(model_instance, test_dataset)
model_dict[model_type] = score
results_array.append([model_name,
model_type,
score,
scorer.name,
label.count()
])
results_dict[scorer.name] = model_dict

results_df = pd.DataFrame(
[
[model_name,
model_type,
scorer(model_instance, test_dataset),
scorer.name,
label.count()]
for model_name, model_type, model_instance in models
for scorer in scorers
],
results_array,
columns=['Model', 'Type', 'Value', 'Metric', 'Number of samples']
)

@@ -200,7 +226,14 @@ def run(
.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
)

return CheckResult({'scores': results_df, 'type': task_type}, display=fig)
# For each scorer, calculate the perfect score so that the conditions can later compute the gain
scorers_perfect = {scorer.name: scorer.score_perfect(test_dataset) for scorer in scorers}

return CheckResult({'scores': results_dict,
'type': task_type,
'scorers_perfect': scorers_perfect,
'classes': classes
}, display=fig)

def _create_simple_model(self, train_ds: Dataset, task_type: ModelType):
"""Create a simple model of given type (random/constant/tree) to the given dataset.
@@ -253,73 +286,93 @@ def _create_simple_model(self, train_ds: Dataset, task_type: ModelType):
simple_model.fit(train_ds.features_columns, train_ds.label_col)
return simple_model

def add_condition_ratio_not_less_than(self, min_allowed_ratio: float = 1.1, classes: List[Hashable] = None):
"""Add condition - require min allowed ratio between the given and the simple model.
def add_condition_gain_not_less_than(self,
min_allowed_gain: float = 0.1,
classes: List[Hashable] = None,
average: bool = False):
"""Add condition - require minimum allowed gain between the model and the simple model.
Args:
min_allowed_ratio (float): Min allowed ratio between the given and the simple model -
ratio is given model / simple model (if the scorer returns negative values we divide 1 by it)
classes (List[Hashable]): Used in multiclass models to limit condition only to given classes.
min_allowed_gain (float): Minimum allowed gain of the model over the simple model -
gain is defined as (model score - simple score) / (perfect score - simple score)
classes (List[Hashable]): Used in classification models to limit the condition to the given classes only.
average (bool): Used in classification models to indicate whether to run the condition on the average of
the classes or on each class individually
"""
def condition(result: Dict, max_ratio=self.maximum_ratio, class_list=classes) -> ConditionResult:
scores_df = result['scores']
task_type = result['type']
metrics = scores_df['Metric'].unique()

def get_ratio(df):
simple_score = df[df['Type'] == 'Simple']['Value'].iloc[0]
origin_score = df[df['Type'] == 'Origin']['Value'].iloc[0]
return get_scores_ratio(simple_score, origin_score, max_ratio)

fails = []
if task_type == ModelType.MULTICLASS:
if class_list is None:
class_list = scores_df['Class'].unique()
for metric in metrics:
failed_classes = []
for clas in class_list:
score_rows = scores_df[(scores_df['Metric'] == metric) & (scores_df['Class'] == clas)]
ratio = get_ratio(score_rows)
if ratio < min_allowed_ratio:
failed_classes.append(str(clas))
if failed_classes:
fails.append(f'"{metric}" - Classes: {", ".join(failed_classes)}')
else:
for metric in metrics:
score_rows = scores_df[(scores_df['Metric'] == metric)]
ratio = get_ratio(score_rows)
if ratio < min_allowed_ratio:
fails.append(f'"{metric}"')

if fails:
msg = f'Metrics failed: {", ".join(sorted(fails))}'
return ConditionResult(False, msg)
else:
return ConditionResult(True)

return self.add_condition('$$\\frac{\\text{model score}}{\\text{simple model score}} >= '
f'{format_number(min_allowed_ratio)}$$', condition)


class RandomModel:
"""Model used to randomly predict from given series of labels."""

def __init__(self):
self.labels = None

def fit(self, X, y): # pylint: disable=unused-argument,invalid-name
# The X is not used, but it is needed to be matching to sklearn `fit` signature
self.labels = y

def predict(self, X): # pylint: disable=invalid-name
return np.random.choice(self.labels, X.shape[0])

def predict_proba(self, X): # pylint: disable=invalid-name
classes = sorted(self.labels.unique().tolist())
predictions = self.predict(X)

def prediction_to_proba(y_pred):
proba = np.zeros(len(classes))
proba[classes.index(y_pred)] = 1
return proba
return np.apply_along_axis(prediction_to_proba, axis=1, arr=predictions)
name = f'Model performance gain over simple model must be at least {format_percent(min_allowed_gain)}'
if classes:
name = name + f' for classes {str(classes)}'
return self.add_condition(name,
condition,
include_classes=classes,
min_allowed_gain=min_allowed_gain,
max_gain=self.max_gain,
average=average)


def condition(result: Dict, include_classes=None, average=False, max_gain=None, min_allowed_gain=0) -> ConditionResult:
scores = result['scores']
task_type = result['type']
scorers_perfect = result['scorers_perfect']

fails = []
if task_type in [ModelType.MULTICLASS, ModelType.BINARY] and not average:
for metric, classes_scores in scores.items():
failed_classes = []
for clas, models_scores in classes_scores.items():
# Skip if class is not in class list
if include_classes is not None and clas not in include_classes:
continue

# If origin model is perfect, skip the gain calculation
if models_scores['Origin'] == scorers_perfect[metric]:
continue

gain = get_gain(models_scores['Simple'],
models_scores['Origin'],
scorers_perfect[metric],
max_gain)
if gain < min_allowed_gain:
failed_classes.append(str(clas))
if failed_classes:
fails.append(f'"{metric}" - Classes: {", ".join(failed_classes)}')
else:
if task_type in [ModelType.MULTICLASS, ModelType.BINARY]:
scores = average_scores(scores, include_classes)
for metric, models_scores in scores.items():
# If origin model is perfect, skip the gain calculation
if models_scores['Origin'] == scorers_perfect[metric]:
continue
gain = get_gain(models_scores['Simple'],
models_scores['Origin'],
scorers_perfect[metric],
max_gain)
if gain < min_allowed_gain:
fails.append(f'"{metric}"')

if fails:
msg = f'Metrics failed: {", ".join(sorted(fails))}'
return ConditionResult(False, msg)
else:
return ConditionResult(True)


def average_scores(scores, include_classes):
result = {}
for metric, classes_scores in scores.items():
origin_score = 0
simple_score = 0
total = 0
for clas, models_scores in classes_scores.items():
# Skip if class is not in class list
if include_classes is not None and clas not in include_classes:
continue
origin_score += models_scores['Origin']
simple_score += models_scores['Simple']
total += 1

result[metric] = {
'Origin': origin_score / total,
'Simple': simple_score / total
}
return result
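
For orientation, a minimal usage sketch of the reworked check and its gain-based condition (the dataset and model variables below, and the run call signature, are illustrative assumptions rather than part of this commit):

from deepchecks.checks.performance.simple_model_comparison import SimpleModelComparison

# train_ds, test_ds and model are assumed to be pre-built deepchecks Datasets and a fitted model.
# Build the check with the new gain cap and attach the gain-based condition: it passes only when
# (model score - simple score) / (perfect score - simple score) reaches min_allowed_gain
# for each metric (per class in classification, unless average=True).
check = SimpleModelComparison(simple_model_type='constant', max_gain=50)
check.add_condition_gain_not_less_than(min_allowed_gain=0.1, average=True)
# result = check.run(train_ds, test_ds, model)  # assumed TrainTestBaseCheck.run signature
# result.value['scores'], result.value['scorers_perfect']  # the data the condition operates on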
2 changes: 1 addition & 1 deletion deepchecks/suites/default_suites.py
@@ -90,7 +90,7 @@ def model_evaluation() -> Suite:
ConfusionMatrixReport(),
PerformanceReport().add_condition_train_test_relative_degradation_not_greater_than(),
RocReport().add_condition_auc_not_less_than(),
SimpleModelComparison().add_condition_ratio_not_less_than(),
SimpleModelComparison().add_condition_gain_not_less_than(),
ModelErrorAnalysis().add_condition_segments_performance_relative_difference_not_greater_than(),
CalibrationScore(),
TrustScoreComparison().add_condition_mean_score_percent_decline_not_greater_than(),
50 changes: 36 additions & 14 deletions deepchecks/utils/metrics.py
@@ -11,6 +11,7 @@
"""Utils module containing utilities for checks working with metrics."""
import typing as t
import enum
import warnings
from numbers import Number

import numpy as np
@@ -23,6 +24,8 @@
from deepchecks import base # pylint: disable=unused-import; it is used for type annotations
from deepchecks import errors
from deepchecks.utils import validation
from deepchecks.utils.strings import is_string_column
from deepchecks.utils.models import PerfectModel


__all__ = [
@@ -36,14 +39,13 @@
'DEFAULT_BINARY_SCORERS',
'DEFAULT_MULTICLASS_SCORERS',
'MULTICLASS_SCORERS_NON_AVERAGE',
'get_scores_ratio',
'DEFAULT_SINGLE_SCORER_MULTICLASS_NON_AVG',
'initialize_multi_scorers',
'get_scorer_single',
'task_type_validation'
'task_type_validation',
'get_gain'
]

from deepchecks.utils.strings import is_string_column


class ModelType(enum.Enum):
"""Enum containing supported task types."""
@@ -136,6 +138,20 @@ def __call__(self, model, dataset: 'base.Dataset'):
df = self.filter_nulls(dataset)
return self._run_score(model, df, dataset)

def score_perfect(self, dataset: 'base.Dataset'):
"""Calculate the perfect score of the current scorer for given dataset."""
df = self.filter_nulls(dataset)
perfect_model = PerfectModel()
perfect_model.fit(None, dataset.label_col)
score = self._run_score(perfect_model, df, dataset)
if isinstance(score, np.ndarray):
# We expect the perfect score to be equal for all the classes, so take the first one
first_score = score[0]
if any(score != first_score):
warnings.warn(f'Scorer {self.name} returns a different perfect score for different classes')
return first_score
return score

def validate_fitting(self, model, dataset: 'base.Dataset', should_return_array: bool):
"""Validate given scorer for the model and dataset."""
df = self.filter_nulls(dataset)
@@ -313,13 +329,19 @@ def initialize_multi_scorers(alternative_scorers: t.Optional[t.Mapping[str, t.Ca
return [DeepcheckScorer(scorer, name) for name, scorer in alternative_scorers.items()]


def get_scores_ratio(train_score: float, test_score: float, max_ratio=np.Inf) -> float:
"""Return the ratio of test metric compared to train metric."""
if train_score == 0:
return max_ratio
else:
ratio = test_score / train_score
if train_score < 0 and test_score < 0:
ratio = 1 / ratio
ratio = min(max_ratio, ratio)
return ratio
def get_gain(base_score, score, perfect_score, max_gain):
"""Get gain between base score and score compared to the distance from the perfect score."""
distance_from_perfect = perfect_score - base_score
scores_diff = score - base_score
if distance_from_perfect == 0:
# If both base score and score are perfect, return 0 gain
if scores_diff == 0:
return 0
# otherwise the base (simple) score is already perfect and the given score is worse, so return -max_gain
return -max_gain
ratio = scores_diff / distance_from_perfect
if ratio < -max_gain:
return -max_gain
if ratio > max_gain:
return max_gain
return ratio
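
As a quick sanity check of the new gain formula, an illustrative snippet (the numbers are made up, not taken from this diff):

from deepchecks.utils.metrics import get_gain

# Simple model scores 0.5, the real model 0.75, perfect score is 1.0:
# gain = (0.75 - 0.5) / (1.0 - 0.5) = 0.5
assert get_gain(0.5, 0.75, 1.0, max_gain=50) == 0.5
# The simple model is already perfect, so any worse model is assigned -max_gain.
assert get_gain(1.0, 0.8, 1.0, max_gain=50) == -50
# A ratio beyond the cap (here 25.5 / 0.5 = 51) is clipped to max_gain.
assert get_gain(0.5, 26.0, 1.0, max_gain=50) == 50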
