/
datasets_size_comparison.py
127 lines (104 loc) · 4.6 KB
/
datasets_size_comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Datasets size comparison check module."""
import typing as t
import pandas as pd
from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.tabular import Context, TrainTestCheck
# Names exported by ``from <this module> import *``.
__all__ = ['DatasetsSizeComparison']
from deepchecks.utils.strings import format_number
# Type variable bound to the check class; the ``add_condition_*`` methods
# annotate both ``self`` and their return value with it.
T = t.TypeVar('T', bound='DatasetsSizeComparison')
class DatasetsSizeComparison(TrainTestCheck):
    """Verify test dataset size comparing it to the train dataset size."""

    def run_logic(self, context: Context) -> CheckResult:
        """Run check.

        Parameters
        ----------
        context : Context
            run context; its ``train`` and ``test`` attributes are measured with ``len``.

        Returns
        -------
        CheckResult
            with value of type dict.
            Value contains two keys, 'Train' - size of the train dataset
            and 'Test' - size of the test dataset.
            The display is a single-row pandas.DataFrame (index 'Size') built from the same dict.

        Raises
        ------
        DeepchecksValueError
            if not dataset instances were provided.
            if datasets are empty.
            NOTE(review): nothing in this method raises directly; presumably these
            errors come from ``context.train`` / ``context.test`` - confirm.
        """
        train_dataset = context.train
        test_dataset = context.test
        sizes = {'Train': len(train_dataset), 'Test': len(test_dataset)}
        display = pd.DataFrame(sizes, index=['Size'])
        return CheckResult(
            value=sizes,
            display=display
        )

    def add_condition_test_size_greater_or_equal(self: T, value: int = 100) -> T:
        """Add condition verifying that size of the test dataset is greater or equal to threshold.

        Parameters
        ----------
        value : int , default: 100
            minimal allowed test dataset size.

        Returns
        -------
        Self
            current instance of the DatasetsSizeComparison check.
        """

        def condition(check_result: dict) -> ConditionResult:
            details = f'Test dataset contains {check_result["Test"]} samples'
            # "Greater or equal": a test set of exactly ``value`` samples must pass,
            # so only sizes strictly below the threshold fail.
            if check_result['Test'] >= value:
                category = ConditionCategory.PASS
            else:
                category = ConditionCategory.FAIL
            return ConditionResult(category, details)

        return self.add_condition(
            name=f'Test dataset size is greater or equal to {value}',
            condition_func=condition
        )

    def add_condition_test_train_size_ratio_greater_than(self: T, ratio: float = 0.01) -> T:
        """Add condition verifying that test-train size ratio is greater than threshold.

        Parameters
        ----------
        ratio : float , default: 0.01
            minimal allowed test-train ratio (exclusive: a ratio equal to it fails).

        Returns
        -------
        Self
            current instance of the DatasetsSizeComparison check.
        """

        def condition(check_result: dict) -> ConditionResult:
            # NOTE(review): raises ZeroDivisionError when the train size is 0;
            # presumably empty datasets are rejected before this point - confirm.
            test_train_ratio = check_result['Test'] / check_result['Train']
            details = f'Test-Train size ratio is {format_number(test_train_ratio)}'
            category = ConditionCategory.PASS if test_train_ratio > ratio else ConditionCategory.FAIL
            return ConditionResult(category, details)

        return self.add_condition(
            name=f'Test-Train size ratio is greater than {ratio}',
            condition_func=condition
        )

    def add_condition_train_dataset_greater_or_equal_test(self: T) -> T:
        """Add condition verifying that train dataset is greater or equal to test dataset.

        Returns
        -------
        Self
            current instance of the DatasetsSizeComparison check.
        """

        def condition(check_result: dict) -> ConditionResult:
            diff = check_result['Train'] - check_result['Test']
            if diff < 0:
                # NOTE(review): ``diff`` is negative in this branch, so the message
                # shows a negative sample count; kept as-is to preserve the output.
                details = f'Train dataset is smaller than test dataset by {diff} samples'
                category = ConditionCategory.FAIL
            elif diff == 0:
                details = f'Train and test datasets both have {check_result["Train"]} samples'
                category = ConditionCategory.PASS
            else:
                details = f'Train dataset is larger than test dataset by +{diff} samples'
                category = ConditionCategory.PASS
            return ConditionResult(category, details)

        return self.add_condition(
            name='Train dataset is greater or equal to test dataset',
            condition_func=condition
        )