/
conflicting_labels.py
141 lines (122 loc) · 5.56 KB
/
conflicting_labels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""module contains Data Duplicates check."""
from typing import List, Union
import pandas as pd
from typing_extensions import TypedDict
from deepchecks.core import CheckResult
from deepchecks.tabular import Context, SingleDatasetCheck
from deepchecks.utils.abstracts.conflicting_labels import ConflictingLabelsAbstract
from deepchecks.utils.dataframes import cast_categorical_to_object_dtype
from deepchecks.utils.strings import format_list
from deepchecks.utils.typing import Hashable
__all__ = ['ConflictingLabels']
class ResultValue(TypedDict):
    """Typed shape of the value carried by the ConflictingLabels check result."""

    # Fraction (0..1) of samples whose exact feature values appear with more than one label.
    percent_of_conflicting_samples: float
    # One inner list per conflicting feature-group, holding the indices of its samples.
    samples_indices: List[List[int]]
class ConflictingLabels(SingleDatasetCheck, ConflictingLabelsAbstract):
    """Find samples which have the exact same features' values but different labels.

    Parameters
    ----------
    columns : Union[Hashable, List[Hashable]] , default: None
        List of columns to check, if none given checks
        all columns Except ignored ones.
    ignore_columns : Union[Hashable, List[Hashable]] , default: None
        List of columns to ignore, if none given checks
        based on columns variable.
    n_to_show : int , default: 5
        number of most common ambiguous samples to show.
    n_samples : int , default: 10_000_000
        number of samples to use for this check.
    random_state : int, default: 42
        random seed for all check internals.
    """

    def __init__(
        self,
        columns: Union[Hashable, List[Hashable], None] = None,
        ignore_columns: Union[Hashable, List[Hashable], None] = None,
        n_to_show: int = 5,
        n_samples: int = 10_000_000,
        random_state: int = 42,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.columns = columns
        self.ignore_columns = ignore_columns
        self.n_to_show = n_to_show
        self.n_samples = n_samples
        self.random_state = random_state

    def run_logic(self, context: Context, dataset_kind) -> CheckResult:
        """Run check.

        Returns
        -------
        CheckResult
            Value of result is a dictionary that contains percentage of ambiguous samples
            and list of samples with conflicting labels. Display shows 'n_to_show' most
            ambiguous labels with their samples.
        """
        dataset = context.get_data_by_kind(dataset_kind)
        # Conflicting labels are only meaningful for classification tasks.
        context.assert_classification_task()
        dataset = dataset.sample(self.n_samples, random_state=self.random_state
                                 ).select(self.columns, self.ignore_columns, keep_label=True)
        features = dataset.features
        label_name = dataset.label_name

        # HACK:
        # pandas have bug with groupby on category dtypes,
        # so until it fixed, change dtypes manually
        df = cast_categorical_to_object_dtype(dataset.data.copy())

        # Preserve the original index in a column so it survives the groupby
        # and can be shown in the display.
        index_col_name = '_dc_index'
        df[index_col_name] = df.index

        # Group samples that share the exact same feature values; every other
        # column (label, index) is aggregated into a list per group.
        group_unique_data = df.groupby(features, dropna=False).agg(list)
        # Number of samples in each feature-group.
        group_unique_data['count'] = group_unique_data[index_col_name].apply(len)
        # Largest groups first, so the display surfaces the most common conflicts.
        group_unique_data = group_unique_data.sort_values(by='count', ascending=False)

        num_ambiguous = 0
        ambiguous_label_name = 'Observed Labels'
        indices_name = 'Instances'
        samples = []
        display_samples = []

        for group_index, group_data in group_unique_data.iterrows():
            ambiguous_labels = set(group_data[label_name])
            # A single observed label means the group is unambiguous - skip it.
            if len(ambiguous_labels) == 1:
                continue
            num_ambiguous += group_data['count']
            samples.append(group_data[index_col_name])

            if context.with_display is True:
                index_names = group_unique_data.index.names
                # Grouping by several features yields a tuple group key, a single
                # feature yields a scalar - normalize both to a sequence so that
                # zip(...) pairs each feature name with its own value. (Checking
                # only `list` would wrap the tuple and break multi-feature display.)
                if not isinstance(group_index, (list, tuple)):
                    group_index = [group_index]
                display_sample = dict(zip(index_names, group_index))
                # Using tuple since it's hashable
                display_sample[ambiguous_label_name] = tuple(ambiguous_labels)
                display_sample[indices_name] = format_list(group_data[index_col_name])
                display_samples.append(display_sample)

        if len(display_samples) == 0:
            display = None
        else:
            display = pd.DataFrame.from_records(display_samples[:self.n_to_show])
            display.set_index([ambiguous_label_name, indices_name], inplace=True)
            display = [
                'Each row in the table shows an example of a data sample '
                'and its observed labels as found in the dataset. '
                f'Showing top {display.shape[0]} of {len(display_samples)}',
                display
            ]

        # Guard against an empty dataset to avoid ZeroDivisionError.
        total_samples = dataset.n_samples
        percent_conflicting = num_ambiguous / total_samples if total_samples else 0

        return CheckResult(
            display=display,
            value=ResultValue(
                percent_of_conflicting_samples=percent_conflicting,
                samples_indices=samples,
            )
        )