-
Notifications
You must be signed in to change notification settings - Fork 247
/
abstract_property_outliers.py
377 lines (331 loc) · 19.3 KB
/
abstract_property_outliers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module contains AbstractPropertyOutliers check."""
import string
import typing as t
import warnings
from abc import abstractmethod
from collections import defaultdict
from numbers import Number
from secrets import choice
import numpy as np
import pandas as pd
from deepchecks.core import CheckResult, DatasetKind
from deepchecks.core.errors import DeepchecksProcessError, NotEnoughSamplesError
from deepchecks.utils.outliers import iqr_outliers_range
from deepchecks.utils.strings import format_number
from deepchecks.vision.base_checks import SingleDatasetCheck
from deepchecks.vision.context import Context
from deepchecks.vision.utils.image_functions import draw_image
from deepchecks.vision.utils.vision_properties import PropertiesInputType
from deepchecks.vision.vision_data import TaskType, VisionData
from deepchecks.vision.vision_data.batch_wrapper import BatchWrapper
# Public API of this module: only the abstract base check is exported.
__all__ = ['AbstractPropertyOutliers']
class AbstractPropertyOutliers(SingleDatasetCheck):
    """Find outliers samples with respect to the given properties.

    The check computes several properties and then computes the number of outliers for each property.
    The check uses `IQR <https://en.wikipedia.org/wiki/Interquartile_range#Outliers>`_ to detect outliers out of the
    single dimension properties.

    Parameters
    ----------
    properties_list : List[Dict[str, Any]], default: None
        List of properties. Replaces the default deepchecks properties.
        Each property is a dictionary with keys ``'name'`` (str), ``method`` (Callable) and ``'output_type'`` (str),
        representing attributes of said method. 'output_type' must be one of:

        - ``'numerical'`` - for continuous ordinal outputs.
        - ``'categorical'`` - for discrete, non-ordinal outputs. These can still be numbers,
          but these numbers do not have inherent value.
        - ``'class_id'`` - for properties that return the class_id. This is used because these
          properties are later matched with the ``VisionData.label_map``, if one was given.
          Properties of this type are skipped by this check (a warning is issued).

        For more on image / label properties, see the guide about :ref:`vision__properties_guide`.
    property_input_type : PropertiesInputType , default: PropertiesInputType.IMAGES
        The type of input to the properties, required for caching the results after first calculation.
    n_show_top : int , default: 3
        number of outliers to show from each direction (upper limit and bottom limit)
    iqr_percentiles : Tuple[int, int] , default: (25, 75)
        Two percentiles which define the IQR range
    iqr_scale : float, default : 1.5
        The scale to multiply the IQR range for the outliers detection
    draw_label_on_image : bool, default : True
        Whether to draw the label on the image displayed or not.
    min_samples : int , default: 10
        Minimum number of samples required to calculate IQR. If there are not enough non-null samples for a specific
        property, the check will skip it. If no property collected at least ``min_samples`` values (nulls included),
        the check will raise a NotEnoughSamplesError.
    n_samples : Optional[int] , default: 10000
        Stored on the instance as ``n_samples``. It is not read anywhere in this class; presumably the check runner
        uses it to cap the number of processed samples - TODO confirm.
    """

    def __init__(self,
                 properties_list: t.Optional[t.List[t.Dict[str, t.Any]]] = None,
                 property_input_type: PropertiesInputType = PropertiesInputType.IMAGES,
                 n_show_top: int = 3,
                 iqr_percentiles: t.Tuple[int, int] = (25, 75),
                 iqr_scale: float = 1.5,
                 draw_label_on_image: bool = True,
                 min_samples: int = 10,
                 n_samples: t.Optional[int] = 10000,
                 **kwargs):
        super().__init__(**kwargs)
        self.properties_list = properties_list
        self.property_input_type = property_input_type
        self.iqr_percentiles = iqr_percentiles
        self.iqr_scale = iqr_scale
        self.n_show_top = n_show_top
        self.min_samples = min_samples
        self.n_samples = n_samples
        self._draw_label_on_image = draw_label_on_image
        self._properties_results = None

    def initialize_run(self, context: Context, dataset_kind: DatasetKind):
        """Initialize the properties state."""
        data = context.get_data_by_kind(dataset_kind)
        # Property name -> list with one entry per sample; every entry is a list of that sample's values
        self._properties_results = defaultdict(list)
        # Property name -> dict with keys 'images', 'property_values', 'labels' of the most extreme samples seen
        self._lowest_property_value_images = defaultdict(list)
        self._highest_property_value_images = defaultdict(list)
        # Image identifiers in the order the samples were seen (one per sample)
        self._images_uuid = []

        # An empty / missing user list falls back to the subclass defaults
        if not self.properties_list:
            self.properties_list = self.get_default_properties(data)
        # Guard against a subclass returning None before iterating over the list
        if self.properties_list is not None:
            if any(p['output_type'] == 'class_id' for p in self.properties_list):
                warnings.warn('Properties that have class_id as output_type will be skipped.')
            self.properties_list = [p for p in self.properties_list if p['output_type'] != 'class_id']

    def update(self, context: Context, batch: BatchWrapper, dataset_kind: DatasetKind):
        """Aggregate image properties from batch."""
        batch_properties = batch.vision_properties(self.properties_list, self.property_input_type)
        data = context.get_data_by_kind(dataset_kind)

        # Identifiers are recorded once per batch (not once per property) so that the sample indices computed in
        # `compute` line up with `self._images_uuid` no matter how many properties are calculated.
        self._images_uuid.extend(batch.numpy_image_identifiers)

        # If the label is single value per image, wrap it in order to work on a fixed structure
        numpy_labels = batch.numpy_labels
        if numpy_labels is not None and data.task_type == TaskType.CLASSIFICATION:
            labels = [[label_per_image] for label_per_image in numpy_labels]
        else:
            labels = numpy_labels
        images = batch.numpy_images

        for prop_name, property_values in batch_properties.items():
            _ensure_property_shape(property_values, len(batch), prop_name)
            self._cache_property_values_and_images(images, labels, list(property_values), prop_name)

    def compute(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
        """Compute final result.

        Returns a CheckResult whose value maps every property name either to a dict with the keys
        ``'outliers_identifiers'``, ``'lower_limit'`` and ``'upper_limit'``, or to a string explaining why the
        property was skipped.

        Raises
        ------
        NotEnoughSamplesError
            If every property collected fewer than ``min_samples`` values (null values are counted here).
        """
        data = context.get_data_by_kind(dataset_kind)
        check_result = {}
        self._images_uuid = np.asarray(self._images_uuid)

        # Flatten with reshape(-1) instead of squeeze(): squeeze turns a single value into a 0-d array which
        # supports neither len() nor iteration.
        flat_values_per_property = {
            name: np.hstack(values).astype(float).reshape(-1)
            for name, values in self._properties_results.items()
        }
        if all(len(flat_values) < self.min_samples for flat_values in flat_values_per_property.values()):
            raise NotEnoughSamplesError(f'Need at least {self.min_samples} non-null samples to calculate IQR outliers.')

        for name, flat_values in flat_values_per_property.items():
            values_lengths_cumsum = np.cumsum(np.array([len(v) for v in self._properties_results[name]]))
            # None values became NaN in the float conversion above
            valid_values = flat_values[~np.isnan(flat_values)]
            if len(valid_values) < self.min_samples:
                check_result[name] = 'Not enough non-null samples to calculate outliers.'
                continue

            lower_limit, upper_limit = iqr_outliers_range(valid_values, self.iqr_percentiles, self.iqr_scale)
            # Outlier positions are looked up in the unfiltered array: the cumulative lengths used below were
            # built from unfiltered data, so positions in a null-filtered array would point at the wrong image.
            # Comparisons with NaN are False, hence null values are never reported as outliers.
            with np.errstate(invalid='ignore'):
                outlier_values_idx = np.flatnonzero((flat_values < lower_limit) | (flat_values > upper_limit))
            outlier_img_idx = np.unique([_sample_index_from_flatten_index(values_lengths_cumsum, outlier_index)
                                         for outlier_index in outlier_values_idx])
            outlier_img_identifiers = self._images_uuid[outlier_img_idx] if len(outlier_img_idx) > 0 else []

            check_result[name] = {
                'outliers_identifiers': outlier_img_identifiers,
                # Clip the reported range to the values that were actually observed
                'lower_limit': max(lower_limit, valid_values.min()),
                'upper_limit': min(upper_limit, valid_values.max()),
            }

        display = self._build_display(check_result, data) if context.with_display else None
        return CheckResult(check_result, display=display)

    def _build_display(self, check_result: t.Dict[str, t.Any], data) -> t.List:
        """Build the display items: one HTML section per property with outliers and a summary of the rest."""
        html_sections = []
        skipped_properties = []
        skipped_reasons = []
        for property_name, info in check_result.items():
            # If info is string it means the property was skipped, the string is the reason
            if isinstance(info, str):
                skipped_properties.append(property_name)
                skipped_reasons.append(info)
            elif len(info['outliers_identifiers']) == 0:
                skipped_properties.append(property_name)
                skipped_reasons.append('No outliers found.')
            else:
                images_and_values = self._get_property_outlier_images(property_name,
                                                                      info['lower_limit'], info['upper_limit'],
                                                                      data)
                # Random id of alphabetic characters, used as prefix of the CSS classes of this section
                sid = ''.join(choice(string.ascii_uppercase) for _ in range(6))
                values_combine = ''.join(f'<div class="{sid}-item">{format_number(x[0])}</div>'
                                         for x in images_and_values)
                images_combine = ''.join(f'<div class="{sid}-item">{x[1]}</div>'
                                         for x in images_and_values)
                html_sections.append(HTML_TEMPLATE.format(
                    prop_name=property_name,
                    values=values_combine,
                    images=images_combine,
                    count=len(info['outliers_identifiers']),
                    n_of_images=len(images_and_values),
                    lower_limit=format_number(info['lower_limit']),
                    upper_limit=format_number(info['upper_limit']),
                    id=sid
                ))

        display = [''.join(html_sections)]
        if skipped_properties:
            # Series of property names indexed by the reason, grouped into one row per reason
            no_outliers = pd.Series(skipped_properties, index=skipped_reasons, dtype=object)
            grouped = no_outliers.groupby(level=0).unique().str.join(', ')
            grouped_df = pd.DataFrame(grouped, columns=['Properties'])
            grouped_df['More Info'] = grouped_df.index
            grouped_df = grouped_df[['More Info', 'Properties']]
            display.append('<h5><b>Properties With No Outliers Found</b></h5>')
            styler = grouped_df.style
            # Styler.hide_index() is the fallback for pandas versions without Styler.hide()
            display.append(styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index())
        return display

    def _get_property_outlier_images(self, prop_name: str, lower_limit: float, upper_limit: float,
                                     vision_data) -> t.List[t.Tuple[float, str]]:
        """Get outlier images and their values for provided property.

        Cached lowest values that fall below ``lower_limit`` come first, followed by cached highest values that
        exceed ``upper_limit``. Each item is a ``(value, image)`` tuple where image is the result of ``draw_image``.
        """
        result = []
        cached_extremes = (
            (self._lowest_property_value_images[prop_name], lambda v: v < lower_limit),
            (self._highest_property_value_images[prop_name], lambda v: v > upper_limit),
        )
        for cached, is_outlier in cached_extremes:
            for idx, value in enumerate(cached['property_values']):
                value = value[0] if isinstance(value, t.Sequence) else value  # for property per bbox, value is a list
                if not is_outlier(value):
                    continue
                image_thumbnail = draw_image(image=cached['images'][idx],
                                             label=cached['labels'][idx],
                                             task_type=vision_data.task_type,
                                             draw_label=self._draw_label_on_image,
                                             label_map=vision_data.label_map)
                result.append((value, image_thumbnail))
        return result

    @abstractmethod
    def get_default_properties(self, data: VisionData):
        """Return default properties to run in the check."""

    def _cache_property_values_and_images(self, images: t.List, labels: t.List, property_values: t.List,
                                          property_name: str):
        """Update the _lowest_property_value_images, _highest_property_value_images dicts based on new batch.

        All the values of the batch are also appended to ``_properties_results`` for the outlier calculation.
        """
        if len(property_values) == 0:
            return
        is_property_per_label = isinstance(property_values[0], (np.ndarray, t.Sequence))

        # Update full property values cache for outlier calculation
        if is_property_per_label:
            self._properties_results[property_name].extend(property_values)
        else:
            self._properties_results[property_name].extend([[x] for x in property_values])

        # In case there are no images or no labels put none instead and do not display images / labels.
        # Inputs are copied into plain lists so that `+` below always concatenates.
        batch_size = len(property_values)
        images = [None] * batch_size if images is None else list(images)
        if labels is None:
            labels = [[None]] * batch_size if is_property_per_label else [None] * batch_size
        else:
            labels = list(labels)
        property_values = list(property_values)

        # Each cache is merged only with the new batch: the overall lowest (highest) values are always found
        # among the previous lowest (highest) values and the new ones, and a sample that is stored in both caches
        # is not duplicated among the candidates.
        for cache, pick_largest in ((self._lowest_property_value_images, False),
                                    (self._highest_property_value_images, True)):
            candidate_images, candidate_labels, candidate_values = images, labels, property_values
            if property_name in cache:
                previous = cache[property_name]
                candidate_images = previous['images'] + images
                candidate_labels = previous['labels'] + labels
                candidate_values = previous['property_values'] + property_values
            cache[property_name] = self._select_extreme_samples(candidate_images, candidate_labels,
                                                                candidate_values, is_property_per_label,
                                                                pick_largest)

    def _select_extreme_samples(self, images: t.List, labels: t.List, property_values: t.List,
                                is_property_per_label: bool, pick_largest: bool) -> t.Dict[str, t.List]:
        """Return the cache entry of the ``n_show_top`` smallest (or largest) non-null property values."""
        if is_property_per_label:
            # Property is per label: flatten the list of lists, remembering which sample each value belongs to
            values_lengths_cumsum = np.cumsum(np.array([len(v) for v in property_values]))
            flat_values = np.hstack(property_values).astype(float)
            flat_labels = [item for sublist in labels for item in sublist]
            ranking_values = flat_values
        else:
            flat_values, flat_labels = property_values, labels
            ranking_values = np.array([np.nan if pd.isnull(v) else v for v in property_values], dtype=float)

        # Null values (None / NaN) are never candidates for display
        not_null_indices = np.flatnonzero(~np.isnan(ranking_values))
        n_top = self.n_show_top
        if n_top <= 0:
            chosen_values_idx = not_null_indices[:0]
        elif len(not_null_indices) <= n_top:
            chosen_values_idx = not_null_indices
        else:
            candidates = ranking_values[not_null_indices]
            if pick_largest:
                positions = np.argpartition(candidates, -n_top)[-n_top:]
            else:
                positions = np.argpartition(candidates, n_top)[:n_top]
            chosen_values_idx = not_null_indices[positions]

        if is_property_per_label:
            chosen_img_idx = [_sample_index_from_flatten_index(values_lengths_cumsum, x) for x in chosen_values_idx]
        else:
            chosen_img_idx = chosen_values_idx

        return {
            'images': [images[x] for x in chosen_img_idx],
            'property_values': [[flat_values[x]] if is_property_per_label else flat_values[x]
                                for x in chosen_values_idx],
            'labels': [[flat_labels[x]] if is_property_per_label else flat_labels[x] for x in chosen_values_idx],
        }
def _ensure_property_shape(property_values, data_len, prop_name):
"""Validate the result of the property."""
if len(property_values) != data_len:
raise DeepchecksProcessError(f'Properties are expected to return value per image but instead got'
f' {len(property_values)} values for {data_len} images for property '
f'{prop_name}')
# If the first item is list validate all items are list of numbers
if isinstance(property_values[0], t.Sequence):
if any((not isinstance(x, t.Sequence) for x in property_values)):
raise DeepchecksProcessError(f'Property result is expected to be either all lists or all scalars but'
f' got mix for property {prop_name}')
if any((not _is_list_of_numbers(x) for x in property_values)):
raise DeepchecksProcessError(f'For outliers, properties are expected to be only numeric types but'
f' found non-numeric value for property {prop_name}')
# If first value is not list, validate all items are numeric
elif not _is_list_of_numbers(property_values):
raise DeepchecksProcessError(f'For outliers, properties are expected to be only numeric types but'
f' found non-numeric value for property {prop_name}')
def _is_list_of_numbers(l):
return not any(i is not None and not isinstance(i, Number) for i in l)
def _sample_index_from_flatten_index(cumsum_lengths, flatten_index) -> int:
# The cumulative sum lengths is holding the cumulative sum of properties per image, so the first index which value
# is greater than the flatten index, is the image index.
# for example if the sums lengths is [1, 6, 11, 13, 16, 20] and the flatten index = 6, it means this property
# belong to the third image which is index = 2.
return np.argwhere(cumsum_lengths > flatten_index)[0][0]
# HTML snippet (str.format style) with the placeholders `prop_name` and `message`.
# NOTE(review): not referenced by any code visible in this file - confirm whether it is still used elsewhere.
NO_IMAGES_TEMPLATE = """
<h3><b>Property "{prop_name}"</b></h3>
<div>{message}</div>
"""
# HTML section (str.format style) rendered by `AbstractPropertyOutliers.compute` for every property that has
# outliers. Placeholders: `id` (prefix of the CSS class names), `prop_name`, `count`, `lower_limit`,
# `upper_limit`, `values` and `images`. Literal CSS braces are doubled so that str.format leaves them intact.
HTML_TEMPLATE = """
<style>
.{id}-container {{
overflow-x: auto;
display: flex;
flex-direction: column;
gap: 10px;
}}
.{id}-row {{
display: flex;
flex-direction: row;
align-items: center;
gap: 10px;
}}
.{id}-item {{
display: flex;
min-width: 200px;
position: relative;
word-wrap: break-word;
align-items: center;
justify-content: center;
}}
.{id}-title {{
font-family: "Open Sans", verdana, arial, sans-serif;
color: #2a3f5f
}}
/* A fix for jupyter widget which doesn't have width defined on HTML widget */
.widget-html-content {{
width: -moz-available; /* WebKit-based browsers will ignore this. */
width: -webkit-fill-available; /* Mozilla-based browsers will ignore this. */
width: fill-available;
}}
</style>
<h5><b>Property "{prop_name}"</b></h5>
<div>
Total number of outliers: {count}
</div>
<div>
Non-outliers range: {lower_limit} to {upper_limit}
</div>
<div class="{id}-container">
<div class="{id}-row">
<h5 class="{id}-item">{prop_name}</h5>
{values}
</div>
<div class="{id}-row">
<h5 class="{id}-item">Image</h5>
{images}
</div>
</div>
"""