custom metrics reuse bug (#1180)
* Fix metric.update

* Add copy and reset of alternative metrics

* Remove copy from metrics utils

* Fix lint

* Add copy to metric utils

* Undo copy metrics in image segment performance

* Fix lint

* Fix class performance test

* Fix robustness report
matanper committed Apr 5, 2022
1 parent c5e78f3 commit a93c326
Showing 4 changed files with 26 additions and 11 deletions.
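
Background for the diffs below: the bug being fixed is that passing the same ignite Metric instance as an alternative scorer to several checks (which happens when a suite runs them together) lets state accumulated by one check leak into the next. A minimal sketch of that failure mode, assuming torch and pytorch-ignite are installed; the tensors and the printed values are illustrative only, not taken from the repository:

import torch
from ignite.metrics import Accuracy

metric = Accuracy()

# "Check 1" sees a perfectly classified batch of two samples.
preds = torch.tensor([[0.9, 0.1], [0.2, 0.8]])   # predicted classes: 0, 1
labels = torch.tensor([0, 1])
metric.update((preds, labels))
print(metric.compute())  # 1.0

# "Check 2" reuses the same instance without reset(): the previous batch is
# still counted, so its result is contaminated by check 1.
preds2 = torch.tensor([[0.9, 0.1], [0.2, 0.8]])  # predicted classes: 0, 1
labels2 = torch.tensor([1, 0])                   # both wrong this time
metric.update((preds2, labels2))
print(metric.compute())  # 0.5, where an isolated check would report 0.0

The changes below fix this by handing each check its own reset copy of the metric instead of the shared instance.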
17 changes: 13 additions & 4 deletions deepchecks/vision/checks/performance/image_segment_performance.py
@@ -69,6 +69,7 @@ def __init__(
         self.number_of_bins = number_of_bins
         self.number_of_samples_to_infer_bins = number_of_samples_to_infer_bins
         self._state = None
+        self._metrics = None
 
     def initialize_run(self, context: Context, dataset_kind: DatasetKind):
         """Initialize run before starting updating on batches."""
@@ -78,7 +79,6 @@ def initialize_run(self, context: Context, dataset_kind: DatasetKind):
 
     def update(self, context: Context, batch: Batch, dataset_kind: DatasetKind):
         """Update the bins by the image properties."""
-        dataset = context.get_data_by_kind(dataset_kind)
         images = batch.images
         predictions = batch.predictions
         labels = batch.labels
@@ -102,7 +102,8 @@ def update(self, context: Context, batch: Batch, dataset_kind: DatasetKind):
             # Check if enough data to infer bins
             if len(samples_for_bin) >= self.number_of_samples_to_infer_bins:
                 # Create the bins and metrics, and divide all cached data into the bins
-                self._state['bins'] = self._create_bins_and_metrics(samples_for_bin, dataset)
+                self._state['bins'] = self._create_bins_and_metrics(samples_for_bin,
+                                                                    context.get_data_by_kind(dataset_kind))
                 # Remove the samples cache which are no longer needed (free the memory)
                 del samples_for_bin
 
@@ -278,11 +279,19 @@ def _add_to_fitting_bin(bins: t.List[t.Dict], property_value, label, prediction)
         if single_bin['start'] <= property_value < single_bin['stop']:
             single_bin['count'] += 1
             for metric in single_bin['metrics'].values():
-                # Since this is a single prediction and label need to wrap in tensor
-                metric.update((torch.unsqueeze(prediction, 0), torch.unsqueeze(label, 0)))
+                # Since this is a single prediction and label, wrap each in a tensor/list so the metric
+                # receives the expected batch shape
+                metric.update((_wrap_torch_or_list(prediction), _wrap_torch_or_list(label)))
             return
 
 
+def _wrap_torch_or_list(value):
+    """Unsqueeze the value if it is a tensor, or wrap it in a list otherwise."""
+    if isinstance(value, torch.Tensor):
+        return torch.unsqueeze(value, 0)
+    return [value]
+
+
 def _range_string(start, stop, precision):
     start = '[' + format_number(start, precision) if not np.isinf(start) else '(-inf'
     stop = format_number(stop, precision) if not np.isinf(stop) else 'inf'
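
For context on the _wrap_torch_or_list helper added above: _add_to_fitting_bin feeds one sample at a time into each metric, so the single prediction or label has to become a batch of size one before metric.update is called. A small standalone sketch; the helper body is copied from the diff, while the inputs and printed results are hypothetical:

import torch

def _wrap_torch_or_list(value):
    """Unsqueeze the value if it is a tensor, or wrap it in a list otherwise."""
    if isinstance(value, torch.Tensor):
        return torch.unsqueeze(value, 0)
    return [value]

prediction = torch.tensor([0.1, 0.7, 0.2])    # per-class scores for one image
label = 4                                     # a plain, non-tensor label

print(_wrap_torch_or_list(prediction).shape)  # torch.Size([1, 3]), a batch of one
print(_wrap_torch_or_list(label))             # [4], wrapped in a list instead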
12 changes: 9 additions & 3 deletions deepchecks/vision/metrics_utils/metrics.py
@@ -10,6 +10,7 @@
 #
 """Module for defining metrics for the vision module."""
 import typing as t
+from copy import copy
 
 import numpy as np
 import pandas as pd
@@ -67,11 +68,16 @@ def get_scorers_list(
     task_type = dataset.task_type
 
     if alternative_scorers:
-        # Validate that each alternative scorer is a correct type
-        for _, met in alternative_scorers.items():
+        # For alternative scorers we create a copy, since suites run checks in parallel and the same
+        # metric instance can't be shared between several checks.
+        scorers = {}
+        for name, met in alternative_scorers.items():
+            # Validate that each alternative scorer is a correct type
             if not isinstance(met, Metric):
                 raise DeepchecksValueError('alternative_scorers should contain metrics of type ignite.Metric')
-        scorers = alternative_scorers
+            met.reset()
+            scorers[name] = copy(met)
+        return scorers
     elif task_type == TaskType.CLASSIFICATION:
         scorers = get_default_classification_scorers()
     elif task_type == TaskType.OBJECT_DETECTION:
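
The copy-and-reset pattern introduced in get_scorers_list, shown as a standalone sketch; build_scorers and the metric name are hypothetical, not part of the deepchecks API:

from copy import copy
from ignite.metrics import Precision

alternative_scorers = {'precision': Precision()}

def build_scorers(alternative_scorers):
    scorers = {}
    for name, met in alternative_scorers.items():
        met.reset()                # drop any state accumulated by a previous run
        scorers[name] = copy(met)  # give this run its own instance
    return scorers

scorers_for_check_a = build_scorers(alternative_scorers)
scorers_for_check_b = build_scorers(alternative_scorers)
assert scorers_for_check_a['precision'] is not scorers_for_check_b['precision']

Resetting before copying mirrors the diff: the stored template metric is cleared first, so none of the copies handed to individual checks starts with leftover state.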
4 changes: 2 additions & 2 deletions tests/vision/checks/performance/class_performance_test.py
@@ -99,8 +99,8 @@ def test_mnist_alt(mnist_dataset_train, mnist_dataset_test, mock_trained_mnist,
     r_row = result.value.loc[result.value['Metric'] == 'r'].sort_values(by='Value', ascending=False).iloc[0]
     # Assert
     assert_that(len(result.value), equal_to(8))
-    assert_that(p_row['Value'], close_to(0.975, 0.001))
-    assert_that(r_row['Value'], close_to(0.985, 0.001))
+    assert_that(p_row['Value'], close_to(.984, 0.001))
+    assert_that(r_row['Value'], close_to(0.988, 0.001))
 
 
 def test_coco_best(coco_train_visiondata, coco_test_visiondata, mock_trained_yolov5_object_detection, device):
4 changes: 2 additions & 2 deletions tests/vision/checks/performance/robustness_report_test.py
@@ -65,8 +65,8 @@ def test_coco_and_condition(coco_train_visiondata, mock_trained_yolov5_object_de
     # Assert
     assert_that(result.value, has_entries({
         'Hue Saturation Value': has_entries({
-            'AP': has_entries(score=close_to(0.348, 0.001), diff=close_to(-0.107, 0.001)),
-            'AR': has_entries(score=close_to(0.376, 0.001), diff=close_to(-0.092, 0.001))
+            'AP': has_entries(score=close_to(0.348, 0.01), diff=close_to(-0.104, 0.01)),
+            'AR': has_entries(score=close_to(0.376, 0.01), diff=close_to(-0.075, 0.01))
         }),
     }))
     assert_that(result.conditions_results, has_items(