-
Notifications
You must be signed in to change notification settings - Fork 246
/
under_annotated_segments.py
308 lines (266 loc) · 17.2 KB
/
under_annotated_segments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module of the under annotated segments check."""
from typing import Dict, List, Tuple, Union
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from deepchecks import ConditionCategory, ConditionResult
from deepchecks.core import CheckResult
from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp.utils.weak_segments import get_relevant_data_table
from deepchecks.utils.abstracts.weak_segment_abstract import WeakSegmentAbstract
from deepchecks.utils.strings import format_percent
from deepchecks.utils.typing import Hashable
# Names exported by ``from <module> import *``: the two concrete checks defined in this module.
# The shared ``UnderAnnotatedSegments`` base class is not listed.
__all__ = ['UnderAnnotatedMetaDataSegments', 'UnderAnnotatedPropertySegments']
# Row count used when sampling the encoded data for the segment scatter plots.
MAX_SAMPLES_IN_FIGURE = 1000
class UnderAnnotatedSegments(SingleDatasetCheck, WeakSegmentAbstract):
    """Check for under annotated data segments.

    Shared implementation behind the property based and the metadata based checks. Every sample gets a
    score of 1 when it has a label and 0 when its label is missing; the mean of that score over a segment
    is reported as the segment's "Annotation Ratio".

    Parameters
    ----------
    segment_by : str
        Kind of data table the segments are built from. Passed to ``get_relevant_data_table`` as
        ``data_type`` and interpolated into the user facing messages.
    columns : Union[Hashable, List[Hashable], None]
        Columns to use, forwarded to ``get_relevant_data_table``.
    ignore_columns : Union[Hashable, List[Hashable], None]
        Columns to ignore, forwarded to ``get_relevant_data_table``.
    n_top_features : int
        Forwarded to ``get_relevant_data_table``.
    segment_minimum_size_ratio : float
        Stored on the instance; not read in this class (presumably consumed by the inherited segment
        search - verify against ``WeakSegmentAbstract``).
    n_samples : int
        Sample size requested from the dataset before the check runs.
    categorical_aggregation_threshold : float
        Stored on the instance; not read in this class (presumably consumed by the inherited categorical
        encoding - verify against ``WeakSegmentAbstract``).
    n_to_show : int
        Maximal number of segments to draw in the display.
    **kwargs
        Forwarded to the parent constructor.
    """

    def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], None],
                 ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int,
                 segment_minimum_size_ratio: float, n_samples: int,
                 categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
        super().__init__(**kwargs)
        self.segment_by = segment_by
        self.columns = columns
        self.ignore_columns = ignore_columns
        self.n_top_features = n_top_features
        self.segment_minimum_size_ratio = segment_minimum_size_ratio
        self.n_samples = n_samples
        self.n_to_show = n_to_show
        self.categorical_aggregation_threshold = categorical_aggregation_threshold

    def run_logic(self, context: Context, dataset_kind) -> CheckResult:
        """Run check.

        Returns
        -------
        CheckResult
            The value is whatever ``_generate_check_result_value`` builds from the weak segments; the
            display holds an explanatory message followed by the per-segment scatter plots.

        Raises
        ------
        DeepchecksProcessError
            If the segment search returns no segments.
        """
        # Both task types are rejected up front by the context.
        context.raise_if_token_classification_task(self)
        context.raise_if_multi_label_task(self)

        text_data = context.get_data_by_kind(dataset_kind)
        text_data = text_data.sample(self.n_samples, random_state=context.random_state)

        features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by,
                                                         columns=self.columns, ignore_columns=self.ignore_columns,
                                                         n_top_features=self.n_top_features)

        # 1 for a sample that has a label, 0 for a sample whose label is missing.
        score_per_sample = pd.Series([1 - pd.isna(x) for x in text_data.label], index=features.index)
        encoded_dataset = self._target_encode_categorical_features_fill_na(features, score_per_sample,
                                                                           cat_features)

        avg_score = round(score_per_sample.mean(), 3)
        weak_segments = self._weak_segments_search(data=encoded_dataset.features_columns,
                                                   score_per_sample=score_per_sample,
                                                   scorer_name='Annotation Ratio')

        if len(weak_segments) == 0:
            raise DeepchecksProcessError('Check was unable to find under annotated segments. This is expected if '
                                         'your data is well annotated. If this is not the case, try increasing '
                                         f'n_samples or supply more {self.segment_by}.')

        check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score)
        display_msg = f'Showcasing intersections of {self.segment_by} that result in the most ' \
                      f'under annotated segments.<br> The ' \
                      'full list of under annotated segments can be observed in the check result value. '
        display_fig = self._generate_scatter_plot_display(encoded_dataset.features_columns, score_per_sample,
                                                          text_data.text, weak_segments, avg_score)
        return CheckResult(value=check_result_value, display=[display_msg, display_fig])

    @staticmethod
    def _get_box_boundaries(feature_data: pd.Series, segment: Tuple[float, float]) -> Tuple[float, float]:
        """Return the segment bounds, replacing any non-finite bound with the data's NaN-ignoring min/max."""
        lower = segment[0] if np.isfinite(segment[0]) else np.nanmin(feature_data)
        upper = segment[1] if np.isfinite(segment[1]) else np.nanmax(feature_data)
        return lower, upper

    def _generate_scatter_plot_display(self, encoded_data: pd.DataFrame, is_annotated: pd.Series,
                                       text: np.ndarray, weak_segments: pd.DataFrame,
                                       avg_score: float) -> DisplayMap:
        """Build one scatter plot tab per displayed segment.

        Parameters
        ----------
        encoded_data : pd.DataFrame
            Encoded feature table; a copy is taken, so the caller's frame is left untouched.
        is_annotated : pd.Series
            Per sample score, compared against 1 (annotated) and 0 (not annotated).
        text : np.ndarray
            Sample texts, attached to the points as hover text.
        weak_segments : pd.DataFrame
            Segments to draw. The columns read here are 'Feature1', 'Feature2', 'Feature1 Range',
            'Feature2 Range', '% of Data' and 'Annotation Ratio'.
        avg_score : float
            Annotation ratio over the whole data, shown in each figure title.

        Returns
        -------
        DisplayMap
            Maps a tab title to ``[figure, message]``.
        """
        display_tabs = {}
        if weak_segments.shape[0] > self.n_to_show:
            weak_segments = weak_segments.iloc[:self.n_to_show, :]

        # Work on a copy: helper columns are added and categorical columns are overwritten below,
        # and none of that should leak into the frame owned by the caller.
        encoded_data = encoded_data.copy()
        encoded_data['text'] = text

        # Handle categorical features: spread the encoded values so points of one category do not overlap.
        # The jitter is drawn from NumPy's global random state.
        jitter = 0.25
        for feature in self.encoder_mapping:
            encoded_data[feature] = encoded_data[feature] + np.random.uniform(-jitter, jitter, len(encoded_data))

        # virtual col is used when we have only one feature controlling the segment
        encoded_data['virtual_col'] = np.random.uniform(-jitter, jitter, len(encoded_data))

        # DataFrame.sample raises when asked for more rows than exist, so cap the request.
        n_rows_in_figure = min(MAX_SAMPLES_IN_FIGURE, len(encoded_data))
        sampled_data = encoded_data.sample(n_rows_in_figure, random_state=42)
        # The masks are built from the full series; pandas aligns them to the sampled rows by index label.
        annotated_data = sampled_data[is_annotated == 1]
        not_annotated_data = sampled_data[is_annotated == 0]

        for _, row in weak_segments.iterrows():
            fig = go.Figure()
            feature_1, feature_2 = row['Feature1'], row['Feature2']
            feature_1_lower, feature_1_upper = self._get_box_boundaries(encoded_data[feature_1], row['Feature1 Range'])
            if feature_2 != '':  # segment by two features
                feature_2_lower, feature_2_upper = self._get_box_boundaries(encoded_data[feature_2],
                                                                            row['Feature2 Range'])
                hover_template = '<b>' + feature_1 + '<b>: %{x}<br><b>' + feature_2 + \
                                 '<b>: %{y}<br><b>text<b>: %{text}<br><b>Annotated<b>: '
                tab_title = f'{feature_1} vs {feature_2}'
                range_f1 = self._format_partition_vec_for_display([feature_1_lower, feature_1_upper], feature_1, ', ')
                range_f2 = self._format_partition_vec_for_display([feature_2_lower, feature_2_upper], feature_2, ', ')
                msg = f'Under annotated segment contains samples with {feature_1} in ' \
                      f'{range_f1[0]} and {feature_2} in {range_f2[0]}.'
            else:  # segment by one feature
                feature_2 = 'virtual_col'
                # Stretch the box slightly beyond the jittered points so it encloses all of them vertically.
                feature_2_lower = encoded_data['virtual_col'].min() * 1.3
                feature_2_upper = encoded_data['virtual_col'].max() * 1.3
                hover_template = '<b>' + feature_1 + '<b>: %{x}<br><b>text<b>: %{text}<br><b>Annotated<b>: '
                tab_title = feature_1
                range_f1 = self._format_partition_vec_for_display([feature_1_lower, feature_1_upper], feature_1, ', ')
                msg = f'Under annotated segment contains samples with {feature_1} in {range_f1[0]}.'
                # The y axis carries no information here, so its ticks are hidden.
                fig.update_yaxes(range=[feature_2_lower * 1.2, feature_2_upper * 1.2], tickvals=[], ticktext=[])

            # Add box trace to the legend using lines
            fig.add_trace(go.Scatter(
                x=[feature_1_lower, feature_1_lower, feature_1_upper, feature_1_upper, feature_1_lower],
                y=[feature_2_lower, feature_2_upper, feature_2_upper, feature_2_lower, feature_2_lower],
                mode='lines',
                line=dict(color='rgb(121, 100, 255)', width=3, dash='dot'),
                name='Under Annotated Segment',
                fill='toself', fillcolor='rgb(121, 100, 255)', opacity=0.4,
                legendgroup='box'))

            # Add annotated scatter plot
            fig.add_trace(go.Scatter(x=annotated_data[feature_1], y=annotated_data[feature_2],
                                     mode='markers', opacity=0.7,
                                     marker=dict(symbol='circle', color='lightgreen', size=10,
                                                 line=dict(color='black', width=1)),
                                     hovertemplate=hover_template + 'True',
                                     text=annotated_data['text'],
                                     name='Annotated Samples'))

            # Add not annotated scatter plot
            fig.add_trace(go.Scatter(x=not_annotated_data[feature_1], y=not_annotated_data[feature_2],
                                     mode='markers', opacity=0.7,
                                     marker=dict(symbol='x', color='red', size=10, line=dict(color='black', width=1)),
                                     hovertemplate=hover_template + 'False',
                                     text=not_annotated_data['text'],
                                     name='Not Annotated Samples'))

            # Update figure layout
            fig.update_layout(title=dict(
                text=f'Under Annotated Segment ({row["% of Data"]}% of Data)<br><sub>Annotation ratio in segment '
                     f'is {format_percent(row["Annotation Ratio"])} (vs. {format_percent(avg_score)}'
                     f' in whole data)</sub>',
                font=dict(size=24)),
                xaxis_title=feature_1, yaxis_title=feature_2 if feature_2 != 'virtual_col' else '',
                autosize=False, width=1000, height=600,
                font=dict(size=14),
                plot_bgcolor='rgba(245, 245, 245, 1)',
                xaxis=dict(gridcolor='rgba(200, 200, 200, 0.5)',
                           zerolinecolor='rgba(200, 200, 200, 0.5)'),
                yaxis=dict(gridcolor='rgba(200, 200, 200, 0.5)',
                           zerolinecolor='rgba(200, 200, 200, 0.5)'))

            # Use original categorical values in the axis ticks
            if feature_1 in self.encoder_mapping:
                mapping = self.encoder_mapping[feature_1]
                fig.update_xaxes(title=feature_1, tickvals=mapping['encoded_value'].to_list(),
                                 ticktext=mapping['original_category'].to_list())
            if feature_2 in self.encoder_mapping:
                mapping = self.encoder_mapping[feature_2]
                fig.update_yaxes(title=feature_2, tickvals=mapping['encoded_value'].to_list(),
                                 ticktext=mapping['original_category'].to_list())

            display_tabs[tab_title] = [fig, f'Check ran on {encoded_data.shape[0]} data samples.<br>{msg}']

        return DisplayMap(display_tabs)

    def add_condition_segments_annotation_ratio_greater_than(self, threshold: float = 0.70):
        """Add condition - check that the annotation ratio in all segments is above the provided threshold.

        Parameters
        ----------
        threshold : float , default: 0.70
            Minimal annotation ratio required; the condition passes only when the score read from the
            check result is strictly greater than this value.
        """
        def condition(result: Dict) -> ConditionResult:
            # First cell of the weak segments table. Presumably the table is sorted so that this is the
            # annotation ratio of the weakest segment - confirm against _generate_check_result_value.
            weakest_segment_score = result['weak_segments_list'].iloc[0, 0]
            msg = f'Most under annotated segment has annotation ratio of {format_percent(weakest_segment_score)}.'
            if weakest_segment_score > threshold:
                return ConditionResult(ConditionCategory.PASS, msg)
            return ConditionResult(ConditionCategory.FAIL, msg)

        return self.add_condition(f'In all segments annotation ratio should be greater than '
                                  f'{format_percent(threshold)}.', condition)
class UnderAnnotatedPropertySegments(UnderAnnotatedSegments):
    """Search for under annotated data segments, where segments are defined by text properties.

    Use this check to spot parts of the data in which an unusually small share of the samples is labeled.
    Segments are built from the text properties - features extracted from the text itself, such as
    "language" and "number of words". For more on properties, see the `NLP Properties Guide
    <https://docs.deepchecks.com/stable/nlp/usage_guides/nlp_properties.html>`_.

    The search works by training several simple tree based models that try to predict, from the properties
    of a sample, whether that sample has a label. The reported segments come from analyzing the leaves of
    the trained trees.

    Parameters
    ----------
    properties : Union[Hashable, List[Hashable]] , default: None
        Properties to check. When None, every property that is not ignored is checked.
    ignore_properties : Union[Hashable, List[Hashable]] , default: None
        Properties to leave out. When None, the selection is governed by ``properties`` alone.
    n_top_properties : int , default: 10
        How many properties take part in the segment search. The top properties are chosen by feature
        importance.
    segment_minimum_size_ratio: float , default: 0.05
        Smallest allowed segment, as a ratio of the data: only segments of
        size >= segment_minimum_size_ratio * data_size are searched for.
    n_samples : int , default: 10_000
        Maximum number of samples to use for this check.
    n_to_show : int , default: 3
        Number of segments with the weakest performance to show.
    categorical_aggregation_threshold : float , default: 0.05
        Within each categorical column, categories whose frequency is below this threshold are merged
        into an "Other" category.
    """

    def __init__(self,
                 properties: Union[Hashable, List[Hashable], None] = None,
                 ignore_properties: Union[Hashable, List[Hashable], None] = None,
                 n_top_properties: int = 10,
                 segment_minimum_size_ratio: float = 0.05,
                 n_samples: int = 10_000,
                 categorical_aggregation_threshold: float = 0.05,
                 n_to_show: int = 3,
                 **kwargs):
        # Translate the property-flavoured argument names into the generic ones of the base check.
        super().__init__(
            segment_by='properties',
            columns=properties,
            ignore_columns=ignore_properties,
            n_top_features=n_top_properties,
            segment_minimum_size_ratio=segment_minimum_size_ratio,
            n_samples=n_samples,
            categorical_aggregation_threshold=categorical_aggregation_threshold,
            n_to_show=n_to_show,
            **kwargs
        )
class UnderAnnotatedMetaDataSegments(UnderAnnotatedSegments):
    """Search for under annotated data segments, where segments are defined by metadata.

    Use this check to spot parts of the data in which an unusually small share of the samples is labeled.
    Segments are built from the metadata - data that is not part of the text but is related to it,
    such as "user_id" and "user_age". For more on metadata, see the `NLP Metadata Guide
    <https://docs.deepchecks.com/stable/nlp/usage_guides/nlp_metadata.html>`_.

    The search works by training several simple tree based models that try to predict, from the metadata
    of a sample, whether that sample has a label. The reported segments come from analyzing the leaves of
    the trained trees.

    Parameters
    ----------
    columns : Union[Hashable, List[Hashable]] , default: None
        Columns to check. When None, every column that is not ignored is checked.
    ignore_columns : Union[Hashable, List[Hashable]] , default: None
        Columns to leave out. When None, the selection is governed by ``columns`` alone.
    n_top_columns : int , default: 10
        How many columns take part in the segment search. The top columns are chosen by feature
        importance.
    segment_minimum_size_ratio: float , default: 0.05
        Smallest allowed segment, as a ratio of the data: only segments of
        size >= segment_minimum_size_ratio * data_size are searched for.
    n_samples : int , default: 10_000
        Maximum number of samples to use for this check.
    n_to_show : int , default: 3
        Number of segments with the weakest performance to show.
    categorical_aggregation_threshold : float , default: 0.05
        Within each categorical column, categories whose frequency is below this threshold are merged
        into an "Other" category.
    """

    def __init__(self,
                 columns: Union[Hashable, List[Hashable], None] = None,
                 ignore_columns: Union[Hashable, List[Hashable], None] = None,
                 n_top_columns: int = 10,
                 segment_minimum_size_ratio: float = 0.05,
                 n_samples: int = 10_000,
                 categorical_aggregation_threshold: float = 0.05,
                 n_to_show: int = 3,
                 **kwargs):
        # Only n_top_columns needs renaming; everything else maps one to one onto the base check.
        super().__init__(
            segment_by='metadata',
            columns=columns,
            ignore_columns=ignore_columns,
            n_top_features=n_top_columns,
            segment_minimum_size_ratio=segment_minimum_size_ratio,
            n_samples=n_samples,
            categorical_aggregation_threshold=categorical_aggregation_threshold,
            n_to_show=n_to_show,
            **kwargs
        )