preprocessing.py
import warnings
from warnings import warn

import numpy as np
import pandas as pd
from dateutil.parser import ParserError
from joblib import hash as joblib_hash
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.validation import check_is_fitted

# matches plain decimal notation like "1.5", "-2." or "+.5",
# but not exponent notation like "1e5"
_FLOAT_REGEX = r"^[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))$"
_FLOAT_MATCHING_CACHE = {}
_MIXED_TYPE_WARNINGS = {}
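
# Illustrative behavior of _FLOAT_REGEX (a sketch; the values are made up):
#
#     pd.Series(["1.5", "-2.", "+.5", "1e5", "abc"]).str.match(_FLOAT_REGEX)
#     # -> True, True, True, False, False
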

def _float_matching(X_col, return_safe_col=False):
    is_floaty = X_col.str.match(_FLOAT_REGEX)
    # entries that weren't strings produce NaN in str.match
    not_strings = is_floaty.isna()
    if not_strings.any():
        rest = X_col[not_strings]
        all_castable = False
        try:
            # if we can convert them all to float we're done
            rest.astype(float)
            is_floaty[not_strings] = True
            all_castable = True
        except ValueError:
            pass
        if not all_castable:
            if X_col.name not in _MIXED_TYPE_WARNINGS:
                warn(f'Mixed types in column {X_col.name}')
                _MIXED_TYPE_WARNINGS[X_col.name] = True
            # make everything string
            rest = rest.astype(str)
            rest_is_floaty = _float_matching(rest)
            is_floaty[not_strings] = rest_is_floaty
            if return_safe_col:
                X_col = X_col.copy()
                X_col[not_strings] = rest
    if is_floaty.dtype != bool:
        is_floaty = is_floaty.astype(bool)
    if return_safe_col:
        return is_floaty, X_col
    else:
        return is_floaty

def _float_matching_fetch(X, col, return_safe_col=False):
    """Retrieve _float_matching for X[col] from cache or function call.

    If not present in cache, stores function call results into cache.
    Uses the column name and a joblib hash of the column as cache key.
    """
    hash_key = f'{col}-{joblib_hash(X[col])}'
    if hash_key in _FLOAT_MATCHING_CACHE:
        floats, X_col = _FLOAT_MATCHING_CACHE[hash_key]
    else:
        floats, X_col = _float_matching(X[col], return_safe_col=True)
        _FLOAT_MATCHING_CACHE[hash_key] = floats, X_col
    if return_safe_col:
        return floats, X_col
    else:
        return floats

class DirtyFloatCleaner(BaseEstimator, TransformerMixin):
    """Convert object columns that are mostly floats to float columns.

    Non-float values are one-hot encoded; the float part is kept in a
    column suffixed with ``_dabl_continuous``.
    """
    # should this error if the inputs are not string?
    def fit(self, X, y=None):
        # FIXME clean float columns will make this fail
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a dataframe. Convert or call `clean`.")
        encoders = {}
        for col in X.columns:
            floats, X_col = _float_matching_fetch(
                X, col, return_safe_col=True)
            # FIXME sparse
            if (~floats).any():
                encoders[col] = OneHotEncoder(sparse=False,
                                              handle_unknown='ignore').fit(
                    pd.DataFrame(X_col[~floats]))
            else:
                encoders[col] = None
        self.encoders_ = encoders
        self.columns_ = X.columns
        return self

    def transform(self, X):
        if not (self.columns_ == X.columns).all():
            raise ValueError("transform requires the same columns as fit")
        result = []
        for col in self.columns_:
            floats, X_col = _float_matching_fetch(
                X, col, return_safe_col=True)
            nofloats = ~floats
            X_new_col = X_col.copy()
            X_new_col[nofloats] = np.nan
            X_new_col = X_new_col.astype(float)
            enc = self.encoders_[col]
            if enc is None:
                if nofloats.any():
                    warnings.warn(
                        "Found non-floats {} in float column. It's "
                        "recommended to call 'clean' on the whole dataset "
                        "before splitting into training and test set.".format(
                            X.loc[nofloats, col].unique()))
                X_new_col = X_new_col.rename("{}_dabl_continuous".format(col))
                result.append(X_new_col)
                continue
            cats = pd.DataFrame(0, index=X.index,
                                columns=enc.get_feature_names([str(col)]))
            if nofloats.any():
                cats.loc[nofloats, :] = enc.transform(pd.DataFrame(
                    X_col[nofloats]))
            cats["{}_dabl_continuous".format(col)] = X_new_col
            result.append(cats)
        return pd.concat(result, axis=1)

    def get_feature_names(self, input_features=None):
        feature_names = []
        for col in self.columns_:
            enc = self.encoders_[col]
            if enc is not None:
                feature_names.extend(enc.get_feature_names([str(col)]))
            feature_names.append("{}_dabl_continuous".format(col))
        return feature_names
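
# A minimal usage sketch for DirtyFloatCleaner (the column name
# "measurement" is made up; assumes the sklearn version this module
# targets, where OneHotEncoder still has `sparse` and `get_feature_names`):
#
#     df = pd.DataFrame({"measurement": ["1.5", "2.3", "N/A", "4.0"] * 10})
#     cleaned = DirtyFloatCleaner().fit_transform(df)
#     # -> a one-hot column for "N/A" plus "measurement_dabl_continuous"
#     #    holding the parsed floats (NaN where the value was "N/A")
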

def guess_ordinal(values):
    # compare against http://proceedings.mlr.press/v70/valera17a/valera17a.pdf
    # there's some ways to guess month, day, week, year
    # but even if we have that, is that ordinal or categorical?
    # worst hack in the history of probability distributions, maybe ever
    # we compute second derivatives on the histogram. If they look smoother
    # than the shuffled histograms, we assume order is meaningful
    # why second derivatives? Why absolute norms? Why 1.5? good questions!
    if values.min() < 0:
        # we assume that negative numbers imply an ordering, not categories
        # probably needs testing
        return True
    if values.max() > 100000:
        # really large numbers are probably identifiers.
        # also bincount will throw a memory error.
        return False
    counts = np.bincount(values)

    def norm(x):
        return np.abs(np.diff(np.diff(x))).sum()

    grad_norm = norm(counts)
    # shuffle 100 times
    grad_norm_shuffled = np.mean([
        norm(counts[np.random.permutation(len(counts))]) for i in range(100)])
    return grad_norm * 1.5 < grad_norm_shuffled
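
# Sketch of how the heuristic above behaves (probabilistic, not
# guaranteed): a smooth unimodal histogram has a small second-difference
# norm that shuffling the bins inflates, while arbitrary spiky codes look
# about the same after shuffling, so the 1.5 margin is not reached.
#
#     guess_ordinal(np.random.binomial(20, 0.5, size=1000))   # likely True
#     guess_ordinal(np.random.choice([0, 9, 23], size=1000))  # likely False
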

def _string_is_date(series):
    try:
        # cheap check on the first ten entries before parsing everything
        pd.to_datetime(series[:10])
    except (ParserError, pd.errors.OutOfBoundsDatetime, ValueError,
            TypeError, OverflowError):
        return False
    try:
        pd.to_datetime(series)
    except (ParserError, pd.errors.OutOfBoundsDatetime, ValueError,
            TypeError, OverflowError):
        return False
    return True

def _find_string_floats(X, dirty_float_threshold):
    if not isinstance(X, pd.DataFrame):
        # FIXME workaround to accept series
        X = pd.DataFrame(X)
    is_float = X.apply(_float_matching)
    clean_float_string = is_float.all()
    # remove the 5 most common string values before checking
    # whether the rest is float
    # FIXME 5 hardcoded!!
    dirty_float = pd.Series(False, index=X.columns, dtype=bool)
    for col in X.columns:
        if clean_float_string[col]:
            # already know it's clean
            continue
        X_col = X[col]
        common_distinct_values = X_col.value_counts()[:5].index
        is_common = X_col.isin(common_distinct_values) | X_col.isna()
        if is_float.loc[~is_common, col].mean() > dirty_float_threshold:
            dirty_float[col] = True
    return clean_float_string, dirty_float

def _float_col_is_int(series):
    # test on a small subset for speed
    # yes, a recursive call would be one line shorter.
    if series[:10].isna().any():
        return False
    if (series[:10] != series[:10].astype(int)).any():
        return False
    if series.isna().any():
        return False
    if (series != series.astype(int)).any():
        return False
    return True

_FLOAT_TYPES = ['floating', 'mixed-integer-float', 'decimal']
_INTEGER_TYPES = ['integer']
_DATE_TYPES = ['datetime64', 'datetime', 'date',
               'timedelta64', 'timedelta', 'time', 'period']
# FIXME we should be able to do better for mixed-integer
_OBJECT_TYPES = ['string', 'bytes', 'mixed', 'mixed-integer']
_CATEGORICAL_TYPES = ['categorical', 'boolean']

def _type_detection_int(series, max_int_cardinality='auto'):
    n_distinct_values = series.nunique()
    if n_distinct_values == len(series):
        # could be an index
        if series.iloc[0] == 0:
            if (series == np.arange(len(series))).all():
                # definitely an index
                return 'useless'
        elif series.iloc[0] == 1:
            if (series == np.arange(1, len(series) + 1)).all():
                # definitely an index
                return 'useless'
    if n_distinct_values > max_int_cardinality:
        return 'continuous'
    elif n_distinct_values <= 5:
        # weird hack / edge case
        return 'categorical'
    else:
        return 'low_card_int'

def _type_detection_float(series, max_int_cardinality='auto'):
    if _float_col_is_int(series):
        return _type_detection_int(
            series, max_int_cardinality=max_int_cardinality)
    return 'continuous'

def _type_detection_object(series, *, dirty_float_threshold,
                           max_int_cardinality='auto'):
    clean_float_string, dirty_float = _find_string_floats(
        series, dirty_float_threshold)
    if dirty_float.any():
        return 'dirty_float'
    elif clean_float_string.any():
        return _type_detection_float(
            series.astype(float), max_int_cardinality=max_int_cardinality)
    if _string_is_date(series):
        return 'date'
    if series.nunique() <= max_int_cardinality:
        return 'categorical'
    return "free_string"

def detect_type_series(series, *, dirty_float_threshold=0.9,
                       max_int_cardinality='auto',
                       near_constant_threshold=0.95, target_col=None):
    n_distinct_values = series.nunique()
    if series.isna().mean() > 0.99:
        return 'useless'
    # near-constant detection; the distinct-count check is a fast pass
    # that avoids computing value_counts on high-cardinality columns
    count = series.count()
    if n_distinct_values == 1:
        return 'useless'
    if (n_distinct_values < (1 - near_constant_threshold) * count
            and series.name != target_col):
        if series.value_counts().max() > near_constant_threshold * count:
            return 'useless'
    if n_distinct_values == 2:
        return 'categorical'
    inferred_type = pd.api.types.infer_dtype(series)
    if inferred_type in _DATE_TYPES:
        return 'date'
    elif inferred_type in _CATEGORICAL_TYPES:
        return 'categorical'
    elif inferred_type in _FLOAT_TYPES:
        return _type_detection_float(
            series, max_int_cardinality=max_int_cardinality)
    elif inferred_type in _INTEGER_TYPES:
        return _type_detection_int(
            series, max_int_cardinality=max_int_cardinality)
    elif inferred_type in _OBJECT_TYPES:
        return _type_detection_object(
            series, max_int_cardinality=max_int_cardinality,
            dirty_float_threshold=dirty_float_threshold
        )
    else:
        raise ValueError(
            "Unexpected inferred dtype {!r} for column {!r}".format(
                inferred_type, series.name))

def detect_types(X, type_hints=None, max_int_cardinality='auto',
                 dirty_float_threshold=.9,
                 near_constant_threshold=0.95, target_col=None,
                 verbose=0):
    """Detect types of dataframe columns.

    Columns are labeled as one of the following types:
    'continuous', 'categorical', 'low_card_int', 'dirty_float',
    'free_string', 'date', 'useless'

    Pandas categorical variables, strings and integers of low cardinality,
    and columns with only two distinct values are labeled as categorical.
    Integers of high cardinality are labeled as continuous.
    Integers of intermediate cardinality are labeled as 'low_card_int'.
    Float variables that sometimes take string values are labeled
    'dirty_float'.
    String variables with many unique values are labeled 'free_string'
    (and currently not processed by dabl).
    Date types are labeled as 'date' (and currently not processed by dabl).
    Anything that is constant, nearly constant, detected as an integer index,
    or doesn't match any of the above categories is labeled 'useless'.

    Parameters
    ----------
    X : dataframe
        Input data.

    max_int_cardinality : int or 'auto', default='auto'
        Maximum number of distinct integers for an integer column
        to be considered categorical. 'auto' is ``max(42, n_samples/100)``.
        Integers are also always considered as continuous variables.
        FIXME not true any more?

    dirty_float_threshold : float, default=.9
        The fraction of float-parseable values (after removing the 5 most
        common values) required for a column to be labeled 'dirty_float';
        below this, the column is treated as categorical or free string.

    target_col : string, int or None
        Specifies the target column in the data, if any.
        Target columns are never dropped.

    verbose : int
        How verbose to be.

    Returns
    -------
    res : dataframe, shape (n_columns, 7)
        Boolean dataframe of detected types. Rows are columns in input X,
        columns are possible types (see above).
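
    Examples
    --------
    A sketch on toy data (the labels depend on the data)::

        X = pd.DataFrame({'a': np.random.rand(100),
                          'b': np.random.choice(['x', 'y', 'z'], 100)})
        types = detect_types(X)
        # types.loc['a', 'continuous'] and types.loc['b', 'categorical']
        # are expected to be True here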
"""
# TODO detect top coding
# TODO subsample large datsets? one level up?
if not isinstance(X, pd.DataFrame):
raise TypeError("X is not a dataframe. Convert or call `clean`.")
if not X.index.is_unique:
raise ValueError("Non-unique index found. Reset index or call clean.")
duplicated = X.columns.duplicated()
if duplicated.any():
raise ValueError("Duplicate Columns: {}".format(
X.columns[duplicated]))
if type_hints is None:
type_hints = dict()
n_samples, _ = X.shape
if max_int_cardinality == "auto":
max_int_cardinality = max(42, n_samples / 100)
if n_samples <= 42:
# this is pretty hacky
max_int_cardinality = n_samples // 2
types_series = X.apply(lambda col: detect_type_series(
col, max_int_cardinality=max_int_cardinality,
near_constant_threshold=near_constant_threshold,
target_col=target_col, dirty_float_threshold=dirty_float_threshold))
for t in type_hints:
if t in X.columns:
types_series[t] = type_hints[t]
known_types = ['continuous', 'dirty_float', 'low_card_int', 'categorical',
'date', 'free_string', 'useless']
if X.empty:
return pd.DataFrame(columns=known_types, dtype=bool)
res = pd.DataFrame({t: types_series == t for t in known_types})
assert (X.columns == res.index).all()
assert np.all(res.sum(axis=1) == 1)
assert (types_series == res.idxmax(axis=1)).all()
if verbose >= 1:
print("Detected feature types:")
print(res.sum())
return res

def _apply_type_hints(X, type_hints):
    if type_hints is not None:
        # use type hints to convert columns
        # to possibly avoid some work.
        # means we need to copy X though.
        X = X.copy()
        for k, v in type_hints.items():
            if v == "continuous":
                X[k] = X[k].astype(float)
            elif v == "categorical":
                X[k] = X[k].astype('category')
            elif v == 'useless' and k in X.columns:
                X = X.drop(k, axis=1)
    return X

def _select_cont(X):
    return X.columns.str.endswith("_dabl_continuous")

def clean(X, type_hints=None, return_types=False,
          target_col=None, verbose=0):
    """Public clean interface.

    Drops useless columns, converts dirty float columns, and makes
    categorical columns consistent for further processing.

    Parameters
    ----------
    X : dataframe
        Input data to clean.

    type_hints : dict or None
        If dict, provide type information for columns.
        Keys are column names, values are types as provided by detect_types.

    return_types : bool, default=False
        Whether to return the inferred types.

    target_col : string, int or None
        If not None, specifies a target column in the data.
        Target columns are never dropped.

    verbose : int, default=0
        Verbosity control.

    Returns
    -------
    X_clean : dataframe
        The cleaned data.

    types : dataframe of type indicators
        Types of the cleaned columns; only returned if
        ``return_types=True``.
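
    Examples
    --------
    A sketch on toy data::

        X = pd.DataFrame({'num': np.random.rand(100),
                          'cat': np.random.choice(['a', 'b', 'c'], 100)})
        X_clean, types = clean(X, return_types=True)
        # 'cat' is converted to a pandas categorical column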
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X = _apply_type_hints(X, type_hints=type_hints)
if not X.index.is_unique:
warn("Index not unique, resetting index!", UserWarning)
X = X.reset_index(drop=True)
types_p = types = detect_types(X, type_hints=type_hints, verbose=verbose,
target_col=target_col)
# drop useless columns
X = X.loc[:, ~types.useless].copy()
types = types.loc[~types.useless, :]
for col in types.index[types.categorical]:
X[col] = X[col].astype('category', copy=False)
if types['dirty_float'].any():
# don't use ColumnTransformer that can't return dataframe yet
X_df = DirtyFloatCleaner().fit_transform(
X.loc[:, types['dirty_float']])
X = pd.concat([X.loc[:, ~types.dirty_float], X_df], axis=1)
# we should know what these are but maybe running this again is fine?
types_df = detect_types(X_df)
types = pd.concat([types[~types.dirty_float], types_df])
# discard dirty float targets that cant be converted to float
if target_col is not None and types_p['dirty_float'][target_col]:
warn("Discarding dirty_float targets that cannot be converted "
"to float.", UserWarning)
X = X.dropna(subset=["{}_dabl_continuous".format(target_col)])
X = X.rename(columns={"{}_dabl_continuous".format(
target_col): "{}".format(target_col)})
types = types.rename(index={"{}_dabl_continuous".format(
target_col): "{}".format(target_col)})
# deal with low cardinality ints
# TODO ?
# ensure that the indicator variables are also marked as categorical
# we could certainly do this nicer, but at this point calling
# detect_types shouldn't be expensive any more
# though if we have actual string columns that are free strings... hum
for col in types.index[types.categorical]:
# ensure categories are strings, otherwise imputation might fail
col_as_cat = X[col].astype('category', copy=False)
if col_as_cat.cat.categories.astype("str").is_unique:
# the world is good: converting to string keeps categories unique
X[col] = col_as_cat.cat.rename_categories(
lambda x: str(x))
else:
# we can't have nice things and need to convert to string
# before making categories (again)
warn("Duplicate categories of different types in column "
"{} considered equal {}".format(
col, col_as_cat.cat.categories))
X[col] = X[col].astype(str).astype('category', copy=False)
if return_types:
return X, types
return X

class EasyPreprocessor(BaseEstimator, TransformerMixin):
    """A simple preprocessor.

    Detects variable types and encodes everything as floats
    for use with sklearn.

    Applies one-hot encoding, missing value imputation and scaling.

    Parameters
    ----------
    scale : boolean, default=True
        Whether to scale continuous data.

    force_imputation : bool, default=True
        Whether to create imputers even if no training data is missing.

    verbose : int, default=0
        Control output verbosity.

    types : dataframe of type indicators or None, default=None
        Column types as returned by detect_types; inferred from the
        training data if None.

    Attributes
    ----------
    ct_ : ColumnTransformer
        Main container for all transformations.

    columns_ : pandas Index
        Columns of the training data.

    dtypes_ : Series of dtypes
        Dtypes of the training data columns.

    types_ : dataframe of type indicators
        Inferred input types, as returned by detect_types.
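
    Examples
    --------
    Typical use, as a sketch (the estimator and the training data are
    placeholders)::

        from sklearn.linear_model import LogisticRegression
        from sklearn.pipeline import make_pipeline

        pipe = make_pipeline(EasyPreprocessor(), LogisticRegression())
        pipe.fit(X_train, y_train)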
"""
    def __init__(self, scale=True, force_imputation=True, verbose=0,
                 types=None):
        self.verbose = verbose
        self.scale = scale
        self.types = types
        self.force_imputation = force_imputation
    def fit(self, X, y=None):
        """Fit the preprocessor.

        Detects column types (unless provided) and fits imputation,
        encoding and scaling transformers on the training data.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.

        y : None
            There is no need of a target in a transformer, yet the
            pipeline API requires this parameter.

        Returns
        -------
        self : object
            Returns self.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.columns_ = X.columns
        self.dtypes_ = X.dtypes
        if self.types is None:
            # FIXME some sanity check?
            types = detect_types(X, verbose=self.verbose)
        else:
            types = self.types

        types = types.copy()
        # low card int encoded as categorical and continuous for now:
        types.loc[types.low_card_int, 'continuous'] = True
        types.loc[types.low_card_int, 'categorical'] = True

        # go over variable blocks
        # check for missing values
        # scale etc
        steps_categorical = []
        if (self.force_imputation
                or X.loc[:, types.categorical].isna().any(axis=None)):
            steps_categorical.append(
                SimpleImputer(strategy='most_frequent', add_indicator=True))
        steps_categorical.append(
            OneHotEncoder(categories='auto', handle_unknown='ignore',
                          sparse=False))
        pipe_categorical = make_pipeline(*steps_categorical)

        steps_continuous = []
        if (self.force_imputation
                or X.loc[:, types.continuous].isna().any(axis=None)
                or types['dirty_float'].any()):
            # we could skip the imputer here, but if there's dirty
            # floats, they'll have NaN, and we reuse the cont pipeline
            steps_continuous.append(SimpleImputer(strategy='median'))
        if self.scale:
            steps_continuous.append(StandardScaler())
        # if X.loc[:, types['continuous']].isnull().values.any():
        # FIXME doesn't work if missing values only in dirty column
        pipe_continuous = make_pipeline(*steps_continuous)

        # FIXME only have one imputer/standard scaler in all
        # (right now copied in dirty floats and floats)
        pipe_dirty_float = make_pipeline(
            DirtyFloatCleaner(),
            make_column_transformer(
                (pipe_continuous, _select_cont), remainder="passthrough"))
        # construct column transformer
        transformer_cols = []
        if types['continuous'].any():
            transformer_cols.append(('continuous',
                                     pipe_continuous, types['continuous']))
        if types['categorical'].any():
            transformer_cols.append(('categorical',
                                     pipe_categorical, types['categorical']))
        if types['dirty_float'].any():
            # FIXME we're not really handling this here any more? (yes we are)
            transformer_cols.append(('dirty_float',
                                     pipe_dirty_float, types['dirty_float']))

        if not len(transformer_cols):
            raise ValueError("No feature columns found")
        self.ct_ = ColumnTransformer(transformer_cols, sparse_threshold=.1)

        self.ct_.fit(X)

        self.input_shape_ = X.shape
        self.types_ = types
        # Return the transformer
        return self
    def get_feature_names(self):
        # this can go soon hopefully
        feature_names = []
        for name, trans, cols in self.ct_.transformers_:
            if name == "continuous":
                # there should be no all-NaN columns in the imputer
                if (trans.steps[0][0] == "simpleimputer"
                        and np.isnan(trans.steps[0][1].statistics_).any()):
                    raise ValueError(
                        "So unexpected! Looks like the imputer dropped "
                        "some all-NaN columns. Try calling 'clean' on "
                        "your data first.")
                feature_names.extend(cols.index[cols])
            elif name == 'categorical':
                # this is the categorical pipe, extract one hot encoder
                ohe = trans.steps[-1][1]
                imputer = trans.steps[0][1]
                ohe_cols = cols[cols].index
                added_cols = ohe_cols[imputer.indicator_.features_].map(
                    lambda x: '{}_imputed'.format(x))
                ohe_cols = ohe_cols.to_list()
                ohe_cols.extend(added_cols)
                feature_names.extend(ohe.get_feature_names(ohe_cols))
            elif name == "remainder":
                assert trans == "drop"
            elif name == "dirty_float":
                raise ValueError(
                    "Can't compute feature names when handling dirty floats. "
                    "Call 'clean' as a workaround")
            else:
                raise ValueError(
                    "Can't compute feature names for {}".format(name))
        return feature_names
    def transform(self, X):
        """Transform the input by imputing, encoding and scaling.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_transformed : array of shape = [n_samples, n_transformed_features]
            The input with categorical variables one-hot encoded and
            continuous variables imputed and scaled.
        """
        # Check that fit had been called
        with warnings.catch_warnings():
            # fix when requiring sklearn 0.22
            # check_is_fitted will not have arguments any more
            warnings.filterwarnings('ignore', category=FutureWarning)
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            check_is_fitted(self, ['ct_'])
        return self.ct_.transform(X)