-
Notifications
You must be signed in to change notification settings - Fork 0
/
identifiers.py
373 lines (317 loc) · 13.4 KB
/
identifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
'''
This module provides the DiffChecker class, an integrated QA performer for pd.DataFrame objects.
'''
import sys
import os
import logging
from importlib import reload
import pickle
import pandas as pd
import numpy as np
sys.path.append('../')
from mlqa import checkers as ch
class DiffChecker():
    '''Integrated QA performer on pd.DataFrame with logging functionality.

    It only works in numerical columns.

    Args:
        qa_level (str): quick set for QA level, can be one of
            ['loose', 'mid', 'strict']
        logger (str or logging.Logger): 'print' for print only, every other
            str creates a file for logging. using external logging.Logger
            object is highly recommended, i.e. logger=<mylogger>.
        qa_log_level (int): qa message logging level, falls back to 30 when
            not given
        log_info (bool): `True` if method calls or arguments also need to be
            logged

    Raises:
        ValueError: if `qa_level` is not one of the known levels

    Notes:
        Although `DiffChecker <identifiers.html#identifiers.DiffChecker>`_ is
        able to create a `Logger <https://docs.python.org/3/library/logging.html#logging.Logger>`_
        object by just passing a file name (i.e. `logger='mylog.log'`), creating
        the `Logger <https://docs.python.org/3/library/logging.html#logging.Logger>`_
        object externally then passing accordingly (i.e. `logger=<mylogger>`)
        is highly recommended.

    Example:
        Basic usage:

        >>> dc = DiffChecker()
        >>> dc.fit(pd.DataFrame({'mean_col':[1, 2]*50, 'na_col':[None]*50+[1]*50}))
        >>> dc.check(pd.DataFrame({'mean_col':[.99, 2.1]*50, 'na_col':[None]*70+[1]*30}))
        True
        >>> dc.set_threshold(0.1)
        >>> dc.check(pd.DataFrame({'mean_col':[.99, 2.1]*50, 'na_col':[None]*70+[1]*30}))
        False

        Quick set for `qa_level`:

        >>> dc = DiffChecker()
        >>> dc.threshold
        0.5
        >>> dc = DiffChecker(qa_level='mid')
        >>> dc.threshold
        0.2
        >>> dc = DiffChecker(qa_level='strict')
        >>> dc.threshold
        0.1

        Logger can also be initiated:

        >>> dc = DiffChecker(logger='mylog.log')
        >>> dc.fit(pd.DataFrame({'mean_col':[1, 2]*50, 'na_col':[None]*50+[1]*50}))
        >>> dc.set_threshold(0.1)
        >>> dc.check(pd.DataFrame({'mean_col':[1, 1.5]*50, 'na_col':[None]*70+[1]*30}))
        False

    '''

    # Class-level defaults, kept so that instances created without
    # `__init__` (e.g. previously pickled objects) still expose these
    # attributes. `__init__` shadows each of them with per-instance objects.
    stats = []                      # statistic names / callables to compare
    threshold = 0.0                 # global relative tolerance
    threshold_df = pd.DataFrame()   # per (stat, column) tolerance overrides
    df_fit_stats = pd.DataFrame()   # statistics computed by `fit()`

    def __init__(
            self,
            qa_level='loose',
            logger=None,
            qa_log_level=None,
            log_info=False
        ):
        # Give every instance its own containers so that nothing is ever
        # shared (and mutated) through the class-level defaults above.
        self.stats = []
        self.threshold = 0.0
        self.threshold_df = pd.DataFrame()
        self.df_fit_stats = pd.DataFrame()

        # Class logger reloads logging module in each call not to create
        # conflict, this is okay as long as this is the only logger in the
        # environment. Having external logger is highly recommended in all
        # other cases.
        if logger == 'print':
            logging.shutdown()
            reload(logging)
            logging.basicConfig(
                format='%(asctime)-15s %(message)s',
                level='DEBUG')
            self.logger = logging.getLogger('DiffCheckerLogIdToPrint')
        elif isinstance(logger, str):
            logging.shutdown()
            reload(logging)
            handler = logging.FileHandler(logger, mode='w+')
            handler.setFormatter(logging.Formatter(
                fmt='%(levelname)s|%(asctime)s|%(message)s'))
            self.logger = logging.getLogger('DiffCheckerLogIdToDump')
            self.logger.setLevel(logging.DEBUG)
            self.logger.addHandler(handler)
        else:
            # if external logger provided (or None for no logging)
            self.logger = logger

        # 30 is the numeric value of logging.WARNING
        self.log_level = qa_log_level or 30
        self.log_info = log_info

        # Presets: which statistics are compared and the relative tolerance
        # that is applied to each of them.
        qa_levels = {
            'loose':{
                'stats':['mean', ch.na_rate],
                'threshold':.5
            },
            'mid':{
                'stats':['mean', 'std', ch.na_rate],
                'threshold':.2
            },
            'strict':{
                'stats':['mean', 'std', 'count', 'min', 'max', ch.na_rate],
                'threshold':.1
            }
        }
        if qa_level not in qa_levels:
            raise ValueError('`qa_level` not right, choose one of {}'\
                .format(qa_levels.keys()))
        self.set_stats(qa_levels[qa_level]['stats'])
        self.set_threshold(qa_levels[qa_level]['threshold'])

    @staticmethod
    def _stat_name(stat):
        '''Returns the row label of a statistic: the string itself, or
        `__name__` of a callable.'''
        return stat if isinstance(stat, str) else stat.__name__

    @staticmethod
    def _as_threshold(value):
        '''Converts `value` to a non-negative float threshold.

        Raises:
            AssertionError: if the converted value is negative or NaN. The
                exception type is kept for backward compatibility, but it is
                raised explicitly so the validation survives `python -O`.
        '''
        th = float(value)
        if not th >= 0:
            raise AssertionError(
                'threshold must be a non-negative number, got {!r}'\
                .format(value))
        return th

    def _require_stats(self):
        '''Ensures `self.stats` is a non-empty list.

        Raises:
            AssertionError: otherwise (raised explicitly so the check is not
                stripped under `python -O`).
        '''
        if not (isinstance(self.stats, list) and len(self.stats) >= 1):
            raise AssertionError('`self.stats` must be a non-empty list')

    def set_stats(self, funcs):
        '''Sets statistic functions list to check by.

        Args:
            funcs (list): list of functions and/or function names,
                e.g. [np.sum, 'mean']

        Raises:
            ValueError: if called after `fit()`
            TypeError: if `funcs` is not a list

        See Also:
            `add_stat <#identifiers.DiffChecker.add_stat>`_: just to add one

        '''
        if not self.df_fit_stats.empty:
            raise ValueError('self.stats cannot be altered after `fit()` call')
        if not isinstance(funcs, list):
            raise TypeError('`funcs` must be a list')
        self._method_init_logger(locals())
        # Store a copy: `add_stat` appends in place and must not modify the
        # list object owned by the caller.
        self.stats = list(funcs)

    def add_stat(self, func):
        '''Appends a statistic function into the existing list (i.e. `stats <#identifiers.DiffChecker.stats>`_).

        Args:
            func (func): function name (e.g. np.sum or 'mean')

        Raises:
            ValueError: if called after `fit()` or `func` is already set
            TypeError: if `func` is neither str nor callable

        See Also:
            `set_stats <#identifiers.DiffChecker.set_stats>`_: to reset all

        '''
        if not self.df_fit_stats.empty:
            raise ValueError('self.stats cannot be altered after `fit()` call')
        if not (isinstance(func, str) or callable(func)):
            raise TypeError('`func` must be str or callable')
        if func in self.stats:
            raise ValueError('`func` is already in `self.stats`')
        self._method_init_logger(locals())
        self.stats.append(func)

    def set_threshold(self, threshold):
        '''Sets threshold for statistic-column pairs.

        Args:
            threshold (float or dict): can be used to set for all or column
                statistic pairs.

        Raises:
            ValueError: for a dict given before `fit()`, or naming an unknown
                column or statistic
            AssertionError: if a threshold is negative or NaN

        Example:
            >>> dc = DiffChecker()
            >>> dc.set_stats(['mean', 'max'])
            >>> dc.set_threshold(0.1) # to reset all thresholds
            >>> print(dc.threshold)
            0.1
            >>> dc.fit(pd.DataFrame({'col1':[1, 2, 3, 4], 'col2':[0]*4}))
            >>> dc.set_threshold({'col1':0.2, 'col2':0.1}) # to set in column level
            >>> print(dc.threshold_df)
                  col1  col2
            mean   0.2   0.1
            max    0.2   0.1
            >>> dc.set_threshold({'col1':{'mean':0.3}}) # to set in column-stat level
            >>> print(dc.threshold_df)
                  col1  col2
            mean   0.3   0.1
            max    0.2   0.1

        '''
        self._method_init_logger(locals())
        if not isinstance(threshold, dict):
            # a scalar replaces the global threshold
            self.threshold = self._as_threshold(threshold)
            return

        if self.df_fit_stats.empty:
            raise ValueError('call `fit()` first for column level threshold')
        for col, v1 in threshold.items():
            if col not in self.df_fit_stats.columns:
                raise ValueError('{} not found in fitted DataFrame'\
                    .format(col))
            if isinstance(v1, dict):
                # column-stat level: {'col': {'stat': value}}
                for stat, v2 in v1.items():
                    if stat not in self.df_fit_stats.index:
                        raise ValueError(
                            "'{0}' not set as stat, available stats are {1}"\
                            .format(stat, self.df_fit_stats.index.tolist()))
                    self.threshold_df.loc[stat, col] = self._as_threshold(v2)
            else:
                # column level: the same value for every stat of the column
                self.threshold_df.loc[:, col] = self._as_threshold(v1)

    def fit(self, df):
        '''Fits given `df`.

        Based on given `df` and `stats <#identifiers.DiffChecker.stats>`_ attribute, this method constructs
        `df_fit_stats <#identifiers.DiffChecker.df_fit_stats>`_ attribute to store column statistics. This is later to
        be used by `check <#identifiers.DiffChecker.check>`_ method. Only works
        in numerical columns. Any column-level threshold set before is reset.

        Args:
            df (pd.DataFrame): data to be fit

        Raises:
            AssertionError: if `stats` is not a non-empty list
            TypeError: if `df` is not a pd.DataFrame

        Example:
            >>> dc = DiffChecker()
            >>> dc.set_stats(['mean', 'max'])
            >>> dc.fit(pd.DataFrame({'col1':[1, 2, 3, 4], 'col2':[0]*4}))
            >>> print(dc.df_fit_stats)
                  col1  col2
            mean   2.5   0.0
            max    4.0   0.0

        '''
        self._require_stats()
        if not isinstance(df, pd.DataFrame):
            raise TypeError('`df` must be a pd.DataFrame')
        self._method_init_logger(locals())

        # Build into a local frame first, so a failing aggregation does not
        # leave the object half fitted.
        fit_stats = pd.DataFrame()
        for col in df.columns:
            if not pd.api.types.is_numeric_dtype(df[col]):
                continue
            for stat in self.stats:
                fit_stats.loc[self._stat_name(stat), col] = df[col].agg(stat)

        # Same shape as the fitted stats, all NaN: NaN means "no override,
        # use the global `threshold`" (see `check`).
        thresholds = fit_stats.copy()
        thresholds.loc[:, :] = np.nan  # `np.NaN` was removed in NumPy 2.0

        self.df_fit_stats = fit_stats
        self.threshold_df = thresholds

    def check(self, df_to_check, columns=None, columns_to_exclude=None):
        '''Checks given `df_to_check` based on fitted `df` stats.

        For each column stat pairs, it checks if stat is in given threshold by
        utilizing `qa_array_statistics <checkers.html#checkers.qa_array_statistics>`_.
        If any stat qa fails, returns `False`, `True` otherwise.

        Args:
            df_to_check (pd.DataFrame): data to check
            columns (None or list): if given, only these columns will be
                considered for qa
            columns_to_exclude (None or list): columns to exclude from qa

        Returns:
            bool: is QA passed or not

        Raises:
            AssertionError: if `stats` is not a non-empty list
            TypeError: if an argument has a wrong type
            ValueError: if both `columns` and `columns_to_exclude` are given

        Example:
            >>> dc = DiffChecker()
            >>> dc.set_threshold(0.2)
            >>> dc.set_stats(['mean', 'max', np.sum])
            >>> dc.fit(pd.DataFrame({'col1':[1, 2, 3, 4], 'col2':[1]*4}))
            >>> dc.check(pd.DataFrame({'col1':[1, 2, 3, 4], 'col2':[0]*4}))
            False
            >>> dc.check(pd.DataFrame({'col1':[1, 2.1, 3.2, 4.2], 'col2':[1.1]*4}))
            True

        '''
        self._require_stats()
        if not isinstance(df_to_check, pd.DataFrame):
            raise TypeError('`df_to_check` must be a pd.DataFrame')
        if columns is not None and columns_to_exclude is not None:
            raise ValueError('only one must be given, '
                             '`columns` or `columns_to_exclude`')
        if columns is not None:
            if not isinstance(columns, list):
                raise TypeError('`columns` must be a list')
        if columns_to_exclude is not None:
            if not isinstance(columns_to_exclude, list):
                raise TypeError('`columns_to_exclude` must be a list')
        self._method_init_logger(locals())

        # Filter while keeping the fitted column order, so checks (and their
        # log lines) always run in a deterministic order.
        cols_to_check = self.df_fit_stats.columns.tolist()
        if columns:
            cols_to_check = [c for c in cols_to_check if c in columns]
        if columns_to_exclude:
            cols_to_check = [c for c in cols_to_check
                             if c not in columns_to_exclude]

        # Every pair is evaluated (no short-circuit) so that each individual
        # check gets the chance to be reported through the logger.
        qa_results = []
        for col in cols_to_check:
            for stat in self.stats:
                stat_name = self._stat_name(stat)
                th = self.threshold_df.loc[stat_name, col]
                if pd.isna(th):
                    # no column/stat specific override, use the global one
                    th = self.threshold
                val = self.df_fit_stats.loc[stat_name, col]
                # relative tolerance band around the fitted value
                tol = abs(val)*th
                result = ch.qa_array_statistics(
                    df_to_check[col],
                    {stat:[val-tol, val+tol]},
                    logger=self.logger,
                    log_level=self.log_level,
                    name=col)
                qa_results.append(result)
        return all(qa_results)

    def to_pickle(self, path='DiffChecker.pkl'):
        '''Pickle (serialize) object to a file.

        The logger is not part of the pickled object (its `logger` attribute
        is stored as `None`); the logger of the live object is left intact.

        Args:
            path (str): file path where the pickled object will be stored

        Example:
            To save a `*.pkl` file:

            >>> dc1 = DiffChecker()
            >>> dc1.fit(pd.DataFrame({'col1':[1, 2, 3, 4], 'col2':[0]*4}))
            >>> dc1.to_pickle(path='DiffChecker.pkl')

            To load the same object later:

            >>> import pickle
            >>> with open('DiffChecker.pkl', 'rb') as pkl_file:
            ...     dc2 = pickle.load(pkl_file)
            >>> os.remove('DiffChecker.pkl')

        '''
        self._method_init_logger(locals())
        live_logger = self.logger
        # Detach the logger only for the duration of the dump, then put it
        # back so this object keeps logging afterwards.
        self.logger = None
        try:
            with open(path, 'wb') as output:
                pickle.dump(self, output, -1)  # -1: highest protocol
        finally:
            self.logger = live_logger

    def _method_init_logger(self, args, exclude=('self',)):
        '''Logs method initiation with given arguments.

        Nothing is logged unless a logger is set and `log_info` is `True`.
        The method name is taken from the calling frame, so this must be
        called directly from the method being reported.

        Args:
            args (dict): local arguments, i.e. `locals()`
            exclude (list or tuple): arguments to exclude, e.g. `self`

        '''
        if not (self.logger and self.log_info):
            return
        method_name = sys._getframe(1).f_code.co_name
        self.logger.info("{} initiated.".format(method_name))
        for k, v in args.items():
            if k not in exclude:
                # values are truncated to 100 chars to keep log lines short
                self.logger.info(method_name+' locals: '+k+'='+str(v)[:100])
if __name__ == "__main__":
    # Executed as a script: run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod
    testmod()