/
_table_vectorizer.py
831 lines (729 loc) · 34.8 KB
/
_table_vectorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
"""
Implements the TableVectorizer: a preprocessor to automatically apply
transformers/encoders to different types of data, without the need to
manually categorize them beforehand, or construct complex Pipelines.
"""
import warnings
from typing import Dict, List, Literal, Optional, Tuple, Union
from warnings import warn
import numpy as np
import pandas as pd
import sklearn
from pandas._libs.tslibs.parsing import guess_datetime_format
from pandas.core.dtypes.base import ExtensionDtype
from sklearn import __version__ as sklearn_version
from sklearn.base import TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.deprecation import deprecated
from sklearn.utils.validation import check_is_fitted
from dirty_cat import DatetimeEncoder, GapEncoder
from dirty_cat._utils import parse_version
# Required for ignoring lines too long in the docstrings
# flake8: noqa: E501
def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> Optional[str]:
    """Guess a single datetime format that fits every sampled value of a column.

    Parameters
    ----------
    date_column : :obj:`~pandas.Series`
        A column of dates, as strings.
    n_trials : int, default=100
        Number of rows to use to infer the date format.

    Returns
    -------
    str or None
        The date format inferred from the column.
        If no format could be inferred, returns None.
    """
    if len(date_column) == 0:
        return None

    # The sampling fraction is computed on the full column length,
    # the sample itself is drawn from the non-missing values only.
    sampling_fraction = min(n_trials / len(date_column), 1)
    sampled_dates = date_column.dropna().sample(
        frac=sampling_fraction, random_state=42
    )

    # Guess a format for each sampled row, once reading ambiguous dates
    # month-first and once day-first.
    with warnings.catch_warnings():
        # pandas warns when dayfirst is not strictly applied
        warnings.simplefilter("ignore")
        month_first_formats = sampled_dates.apply(
            lambda value: guess_datetime_format(value)
        )
        day_first_formats = sampled_dates.apply(
            lambda value: guess_datetime_format(value, dayfirst=True),
        )

    # A single unparsable row is enough to give up.
    if month_first_formats.isnull().any() or day_first_formats.isnull().any():
        return None

    # Even with dayfirst=True a month-first format can be inferred,
    # so what matters is whether each guess is the same for all the rows.
    month_first_is_unique = month_first_formats.nunique() == 1
    day_first_is_unique = day_first_formats.nunique() == 1

    if month_first_is_unique:
        chosen_format = month_first_formats.iloc[0]
        other_format = day_first_formats.iloc[0] if day_first_is_unique else None
        if day_first_is_unique and chosen_format != other_format:
            # Two distinct formats both fit every row: tell the user which
            # one wins. The message text is kept verbatim (hence the
            # unindented lines inside the literal).
            warnings.warn(
                f"""
Both {chosen_format} and {other_format} are valid
formats for the dates in column {date_column.name}.
Format {chosen_format} will be used.
""",
                UserWarning,
                stacklevel=2,
            )
        return chosen_format

    if day_first_is_unique:
        # only this format works for all the rows
        return day_first_formats.iloc[0]

    # more than two different formats were found
    # TODO: maybe we could deal with this case
    return None
def _has_missing_values(df: Union[pd.DataFrame, pd.Series]) -> bool:
    """Return True if `df` contains at least one missing value, False otherwise.

    Parameters
    ----------
    df : :obj:`~pandas.DataFrame` or :obj:`~pandas.Series`
        The data to inspect.

    Returns
    -------
    bool
        Whether any cell of `df` is considered missing by pandas.
    """
    # Reduce over the cell values. Iterating a DataFrame (as the builtin
    # `any` would) yields its column labels, not its cells, so the check
    # has to go through the underlying boolean array.
    return bool(df.isnull().to_numpy().any())
def _replace_false_missing(
    df: Union[pd.DataFrame, pd.Series]
) -> Union[pd.DataFrame, pd.Series]:
    """Turn textual placeholders for missing data into real ``np.nan``.

    "False missing" values are strings that designate a missing value
    without having the corresponding type (for instance ``"N/A"``).
    They are converted to ``np.nan``, and so are ``None`` and strings
    made only of whitespace.
    """
    # Should not replace "missing" (the string used for imputation in
    # categorical features).
    # Markers taken from pandas.io.parsers (version 1.1.4)
    pandas_na_strings = [
        "null",
        "",
        "1.#QNAN",
        "#NA",
        "nan",
        "#N/A N/A",
        "-1.#QNAN",
        "<NA>",
        "-1.#IND",
        "-nan",
        "n/a",
        "-NaN",
        "1.#IND",
        "NULL",
        "NA",
        "N/A",
        "#N/A",
        "NaN",
    ]
    additional_markers = [None, "?", "..."]

    without_markers = df.replace(pandas_na_strings + additional_markers, np.nan)
    # Whitespace-only strings are handled separately, with a regex.
    return without_markers.replace(r"^\s+$", np.nan, regex=True)
def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Series:
    """Impute the missing values of a string/categorical column.

    Parameters
    ----------
    ser : :obj:`~pandas.Series`
        A column holding string (or categorical) data.
    value : str, default="missing"
        The placeholder written in place of each missing value.

    Returns
    -------
    :obj:`~pandas.Series`
        The column with "false missing" markers and real missing values
        all replaced by `value`.
    """
    ser = _replace_false_missing(ser)
    # A categorical column only accepts values among its categories, so the
    # placeholder must be registered before filling.
    # `isinstance` on the dtype replaces `pd.api.types.is_categorical_dtype`,
    # which is deprecated in recent pandas versions.
    if isinstance(ser.dtype, pd.CategoricalDtype) and (
        value not in ser.cat.categories
    ):
        ser = ser.cat.add_categories([value])
    return ser.fillna(value=value)
# Accepted values for the per-type transformer parameters of TableVectorizer:
# a transformer instance, one of the special strings, or None.
OptionalTransformer = Union[
    TransformerMixin,
    Literal["drop", "remainder", "passthrough"],
    None,
]
class TableVectorizer(ColumnTransformer):
"""Automatically transform a heterogeneous dataframe to a numerical array.
Easily transforms a heterogeneous data table
(such as a :obj:`~pandas.DataFrame`) to a numerical array for machine
learning. For this it transforms each column depending on its data type.
It provides a simplified interface for the
:class:`~sklearn.compose.ColumnTransformer`; more documentation of
attributes and functions are available in its doc.
.. versionadded:: 0.2.0
Parameters
----------
cardinality_threshold : int, default=40
Two lists of features will be created depending on this value: strictly
under this value, the low cardinality categorical features, and above or
equal, the high cardinality categorical features.
Different transformers will be applied to these two groups,
defined by the parameters `low_card_cat_transformer` and
`high_card_cat_transformer` respectively.
Note: currently, missing values are counted as a single unique value
(so they count in the cardinality).
low_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
Transformer used on categorical/string features with low cardinality
(threshold is defined by `cardinality_threshold`).
Can either be a transformer object instance
(e.g. :class:`~sklearn.preprocessing.OneHotEncoder`),
a :class:`~sklearn.pipeline.Pipeline` containing the preprocessing steps,
'drop' for dropping the columns,
'remainder' for applying `remainder`,
'passthrough' to return the unencoded columns,
or `None` to use the default transformer
(:class:`~sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore", drop="if_binary")`).
Features classified under this category are imputed based on the
strategy defined with `impute_missing`.
high_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
Transformer used on categorical/string features with high cardinality
(threshold is defined by `cardinality_threshold`).
Can either be a transformer object instance
(e.g. :class:`~dirty_cat.GapEncoder`),
a :class:`~sklearn.pipeline.Pipeline` containing the preprocessing steps,
'drop' for dropping the columns,
'remainder' for applying `remainder`,
'passthrough' to return the unencoded columns,
or `None` to use the default transformer
(:class:`~dirty_cat.GapEncoder(n_components=30)`).
Features classified under this category are imputed based on the
strategy defined with `impute_missing`.
numerical_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
Transformer used on numerical features.
Can either be a transformer object instance
(e.g. :class:`~sklearn.preprocessing.StandardScaler`),
a :class:`~sklearn.pipeline.Pipeline` containing the preprocessing steps,
'drop' for dropping the columns,
'remainder' for applying `remainder`,
'passthrough' to return the unencoded columns,
or `None` to use the default transformer (here nothing, so 'passthrough').
Features classified under this category are not imputed at all
(regardless of `impute_missing`).
datetime_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
Transformer used on datetime features.
Can either be a transformer object instance
(e.g. :class:`~dirty_cat.DatetimeEncoder`),
a :class:`~sklearn.pipeline.Pipeline` containing the preprocessing steps,
'drop' for dropping the columns,
'remainder' for applying `remainder`,
'passthrough' to return the unencoded columns,
or `None` to use the default transformer (:class:`~dirty_cat.DatetimeEncoder()`).
Features classified under this category are not imputed at all
(regardless of `impute_missing`).
auto_cast : bool, optional, default=True
If set to `True`, will try to convert each column to the best possible
data type (dtype).
impute_missing : {'auto', 'force', 'skip'}, default='auto'
When to impute missing values in categorical (textual) columns.
'auto' will impute missing values if it is considered appropriate
(we are using an encoder that does not support missing values and/or
specific versions of pandas, numpy and scikit-learn).
'force' will impute missing values in all categorical columns.
'skip' will not impute at all.
When imputed, missing values are replaced by the string 'missing'.
As imputation logic for numerical features can be quite intricate,
it is left to the user to manage.
See also attribute :attr:`~dirty_cat.TableVectorizer.imputed_columns_`.
remainder : {'drop', 'passthrough'} or Transformer, default='passthrough'
By specifying `remainder='drop'`, only the specified columns in
`transformers` are transformed and combined in the output, and the
non-specified columns are dropped.
By default (`remainder='passthrough'`), all remaining columns that
were not specified in `transformers` will be automatically passed
through. This subset of columns is concatenated with the output of
the transformers.
By setting `remainder` to be an estimator, the remaining
non-specified columns will use the `remainder` estimator. The
estimator must support :term:`fit` and :term:`transform`.
Note that using this feature requires that the DataFrame columns
input at :term:`fit` and :term:`transform` have identical order.
sparse_threshold : float, default=0.3
If the output of the different transformers contains sparse matrices,
these will be stacked as a sparse matrix if the overall density is
lower than this value. Use `sparse_threshold=0` to always return dense.
When the transformed output consists of all dense data, the stacked
result will be dense, and this keyword will be ignored.
n_jobs : int, optional
Number of jobs to run in parallel.
``None`` (the default) means 1 unless in a
:obj:`joblib.parallel_backend` context.
``-1`` means using all processors.
transformer_weights : dict, optional
Multiplicative weights for features per transformer. The output of the
transformer is multiplied by these weights. Keys are transformer names,
values the weights.
verbose : bool, default=False
If True, the time elapsed while fitting each transformer will be
printed as it is completed.
Attributes
----------
transformers_ : list of 3-tuples (str, Transformer or str, list of str)
The collection of fitted transformers as tuples of
(name, fitted_transformer, column). `fitted_transformer` can be an
estimator, 'drop', or 'passthrough'. In case there were no columns
selected, this will be an unfitted transformer.
If there are remaining columns, the final element is a tuple of the
form:
('remainder', transformer, remaining_columns) corresponding to the
`remainder` parameter. If there are remaining columns, then
``len(transformers_)==len(transformers)+1``, otherwise
``len(transformers_)==len(transformers)``.
columns_ : :obj:`~pandas.Index`
The fitted array's columns. They are applied to the data passed
to the `transform` method.
types_ : dict mapping of str to type
A mapping of inferred types per column.
Key is the column name, value is the inferred dtype.
Exists only if `auto_cast=True`.
imputed_columns_ : list of str
The list of columns in which we imputed the missing values.
See Also
--------
:class:`dirty_cat.GapEncoder` :
Encodes dirty categories (strings) by constructing latent topics with continuous encoding.
:class:`dirty_cat.MinHashEncoder` :
Encode string columns as a numeric array with the minhash method.
:class:`dirty_cat.SimilarityEncoder` :
Encode string columns as a numeric array with n-gram string similarity.
Notes
-----
The column order of the input data is not guaranteed to be the same
as the output data (returned by :func:`~TableVectorizer.transform`).
This is due to the way the :class:`~sklearn.compose.ColumnTransformer`
works.
However, the output column order will always be the same for different
calls to :func:`~TableVectorizer.transform` on a same fitted
:class:`TableVectorizer` instance.
For example, if input data has columns ['name', 'job', 'year'], then output
columns might be shuffled, e.g. ['job', 'year', 'name'], but every call
to :func:`~TableVectorizer.transform` on this instance will return this
order.
Examples
--------
Fit a :class:`TableVectorizer` on an example dataset:
>>> from dirty_cat.datasets import fetch_employee_salaries
>>> ds = fetch_employee_salaries()
>>> ds.X.head(3)
gender department department_name division assignment_category employee_position_title underfilled_job_title date_first_hired year_first_hired
0 F POL Department of Police MSB Information Mgmt and Tech Division Records... Fulltime-Regular Office Services Coordinator NaN 09/22/1986 1986
1 M POL Department of Police ISB Major Crimes Division Fugitive Section Fulltime-Regular Master Police Officer NaN 09/12/1988 1988
2 F HHS Department of Health and Human Services Adult Protective and Case Management Services Fulltime-Regular Social Worker IV NaN 11/19/1989 1989
>>> tv = TableVectorizer()
>>> tv.fit(ds.X)
Now, we can inspect the transformers assigned to each column:
>>> tv.transformers_
[
('datetime', DatetimeEncoder(), ['date_first_hired']),
('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
['gender', 'department', 'department_name', 'assignment_category']),
('high_card_cat', GapEncoder(n_components=30),
['division', 'employee_position_title', 'underfilled_job_title']),
('remainder', 'passthrough', ['year_first_hired'])
]
"""
transformers_: List[Tuple[str, Union[str, TransformerMixin], List[str]]]
columns_: pd.Index
types_: Dict[str, type]
imputed_columns_: List[str]
# Override required parameters
_required_parameters = []
def __init__(
    self,
    *,
    cardinality_threshold: int = 40,
    low_card_cat_transformer: OptionalTransformer = None,
    high_card_cat_transformer: OptionalTransformer = None,
    numerical_transformer: OptionalTransformer = None,
    datetime_transformer: OptionalTransformer = None,
    auto_cast: bool = True,
    impute_missing: Literal["auto", "force", "skip"] = "auto",
    # The next parameters are inherited from ColumnTransformer
    remainder: Union[
        Literal["drop", "passthrough"], TransformerMixin
    ] = "passthrough",
    sparse_threshold: float = 0.3,
    n_jobs: Optional[int] = None,
    transformer_weights: Optional[Dict[str, float]] = None,
    verbose: bool = False,
):
    """Store the (keyword-only) parameters as attributes of the same name.

    Nothing is validated or computed here; see the class docstring for the
    meaning of each parameter.
    """
    # The parent is initialised with an empty transformer list:
    # `self.transformers` is (re)built in `fit_transform`.
    super().__init__(transformers=[])

    self.cardinality_threshold = cardinality_threshold
    self.low_card_cat_transformer = low_card_cat_transformer
    self.high_card_cat_transformer = high_card_cat_transformer
    self.numerical_transformer = numerical_transformer
    self.datetime_transformer = datetime_transformer
    self.auto_cast = auto_cast
    self.impute_missing = impute_missing

    # Set after the parent constructor ran, so the values given here are
    # the ones kept on the instance.
    self.remainder = remainder
    self.sparse_threshold = sparse_threshold
    self.n_jobs = n_jobs
    self.transformer_weights = transformer_weights
    self.verbose = verbose
def _more_tags(self):
    """
    Used internally by sklearn to ease the estimator checks.

    Returns
    -------
    dict
        Estimator tags overriding scikit-learn's defaults.
    """
    # `allow_nan` is a boolean tag: missing values are accepted in the input.
    return {"allow_nan": True}
def _clone_transformers(self):
    """Resolve each user-supplied transformer into its fitted-time counterpart.

    For every transformer parameter (`low_card_cat_transformer`,
    `high_card_cat_transformer`, `numerical_transformer`,
    `datetime_transformer`), an attribute of the same name with a trailing
    underscore is set. Its value is:

    - a clone of the parameter if it is a transformer instance
      (so that the instance passed by the user is not altered),
    - the default transformer if the parameter is `None`,
    - `self.remainder` if the parameter is the string 'remainder',
    - the parameter itself otherwise.

    Note: typos are not detected here, they are left in and are detected
    down the line in :func:`~sklearn.compose.ColumnTransformer.fit_transform`.
    """

    def resolve(user_choice, build_default):
        if isinstance(user_choice, sklearn.base.TransformerMixin):
            return clone(user_choice)
        if user_choice is None:
            return build_default()
        if user_choice == "remainder":
            return self.remainder
        return user_choice

    # The default low-cardinality encoder depends on the scikit-learn
    # version. It is built here (not inside `resolve`) so that the warning
    # below is emitted from this method's frame.
    default_low_card_encoder = None
    if self.low_card_cat_transformer is None:
        if parse_version(sklearn_version) >= parse_version("1.0"):
            # sklearn is lenient and lets us use both
            # `handle_unknown="ignore"` and `drop="if_binary"`
            # at the same time
            # TODO change to "infrequent_if_exists" when we bump sklearn min version to 1.1
            default_low_card_encoder = OneHotEncoder(
                drop="if_binary", handle_unknown="ignore"
            )
        else:
            # sklearn is not lenient, and does not let us use both
            # `handle_unknown="ignore"` and `drop="if_binary"`
            # at the same time, so we use `handle_unknown="error"` instead
            default_low_card_encoder = OneHotEncoder(
                drop="if_binary", handle_unknown="error"
            )
            warn(
                f"You are using scikit-learn={sklearn_version}. "
                "Upgrade to scikit-learn>=1.0 to use "
                "handle_unknown='ignore'. "
                "Otherwise, pass a OneHotEncoder with drop=None as "
                "low_card_cat_transformer. "
                "Using handle_unknown='error' in low_card_cat_transformer.",
                stacklevel=2,
            )

    self.low_card_cat_transformer_ = resolve(
        self.low_card_cat_transformer, lambda: default_low_card_encoder
    )
    self.high_card_cat_transformer_ = resolve(
        self.high_card_cat_transformer, lambda: GapEncoder(n_components=30)
    )
    self.numerical_transformer_ = resolve(
        self.numerical_transformer, lambda: "passthrough"
    )
    self.datetime_transformer_ = resolve(
        self.datetime_transformer, lambda: DatetimeEncoder()
    )

    # TODO: check that the provided transformers are valid
def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame:
    """Takes a dataframe and tries to convert its columns to their best possible data type.

    The dtype retained for each column is recorded in `self.types_`.

    Parameters
    ----------
    X : :obj:`~pandas.DataFrame` of shape (n_samples, n_features)
        The data to be transformed.

    Returns
    -------
    :obj:`~pandas.DataFrame`
        The same :obj:`~pandas.DataFrame`, with its columns cast to their
        best possible data type.
    """
    # We replace in all columns regardless of their type,
    # as we might have some false missing
    # in numerical columns for instance.
    X = _replace_false_missing(X)

    # Handle missing values
    for col in X.columns:
        # Convert pandas' NaN value (pd.NA) to numpy NaN value (np.nan)
        # because the former tends to raise all kind of issues when dealing
        # with scikit-learn (as of version 0.24).
        if _has_missing_values(X[col]):
            # Some numerical dtypes like Int64 or Float64 only support
            # pd.NA, so they must be converted to np.float64 before.
            if pd.api.types.is_numeric_dtype(X[col]):
                X[col] = X[col].astype(np.float64)
            # Assign the result back: an in-place `fillna` on the selection
            # `X[col]` is a chained operation that is not guaranteed to
            # update `X` (it never does under pandas' Copy-on-Write).
            X[col] = X[col].fillna(value=np.nan)

    # Convert to the best possible data type
    self.types_ = {}
    for col in X.columns:
        if not pd.api.types.is_datetime64_any_dtype(X[col]):
            # we don't want to cast datetime64
            try:
                X[col] = pd.to_numeric(X[col], errors="raise")
            except (ValueError, TypeError):
                # Only try to convert to datetime
                # if the variable isn't numeric.
                datetime_format = _infer_date_format(X[col])
                if datetime_format is not None:
                    X[col] = pd.to_datetime(
                        X[col], errors="raise", format=datetime_format
                    )
        # Cast pandas dtypes to numpy dtypes
        # for earlier versions of sklearn. FIXME: which ?
        if isinstance(X[col].dtype, ExtensionDtype):
            try:
                X[col] = X[col].astype(X[col].dtype.type, errors="ignore")
            except (TypeError, ValueError):
                # Best effort: keep the extension dtype when it cannot be
                # converted.
                pass
        self.types_[col] = X[col].dtype
    return X
def _apply_cast(self, X: pd.DataFrame) -> pd.DataFrame:
    """Takes a dataframe, and applies the best data types learnt during fitting.

    Does the same thing as `_auto_cast`, but applies learnt info
    (`self.types_` and `self.imputed_columns_`).

    Parameters
    ----------
    X : :obj:`~pandas.DataFrame` of shape (n_samples, n_features)
        The data to be transformed. It is modified and returned.

    Returns
    -------
    :obj:`~pandas.DataFrame`
        `X`, with missing values normalized, the columns imputed during
        fitting imputed again, and each column cast to its learnt dtype.
    """
    for col in X.columns:
        X[col] = _replace_false_missing(X[col])
        if _has_missing_values(X[col]):
            # Nullable numerical dtypes must go through np.float64 to be
            # able to hold np.nan.
            if pd.api.types.is_numeric_dtype(X[col]):
                X[col] = X[col].astype(np.float64)
            # Assign the result back: an in-place `fillna` on the selection
            # `X[col]` is a chained operation that is not guaranteed to
            # update `X` (it never does under pandas' Copy-on-Write).
            X[col] = X[col].fillna(value=np.nan)

    for col in self.imputed_columns_:
        X[col] = _replace_missing_in_cat_col(X[col])

    for col, dtype in self.types_.items():
        # if categorical, add the new categories to prevent
        # them to be encoded as nan
        if isinstance(dtype, pd.CategoricalDtype):
            known_categories = dtype.categories
            # Missing values are left out: a category cannot be null.
            new_categories = pd.unique(X[col].dropna())
            dtype = pd.CategoricalDtype(
                categories=known_categories.union(new_categories)
            )
            # Re-assigning an existing key is safe while iterating.
            self.types_[col] = dtype
        # Replace the whole column: `X.loc[:, col] = ...` writes into the
        # existing column on recent pandas and may keep its old dtype.
        X[col] = X[col].astype(dtype)
    return X
def fit_transform(self, X, y=None):
    """Fit all transformers, transform the data, and concatenate the results.

    In practice, it (1) converts features to their best possible types
    if `auto_cast=True`, (2) classify columns based on their data type,
    (3) replaces "false missing" (see :func:`_replace_false_missing`),
    and imputes categorical columns depending on `impute_missing`, and
    finally, transforms `X`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data, of which specified subsets are used to fit the
        transformers.
    y : array-like of shape (n_samples,), optional
        Targets for supervised learning.

    Returns
    -------
    {array-like, sparse matrix} of shape (n_samples, sum_n_components)
        Hstack of results of transformers. sum_n_components is the
        sum of n_components (output dimension) over transformers. If
        any result is a sparse matrix, everything will be converted to
        sparse matrices.

    Raises
    ------
    ValueError
        If `impute_missing` is not one of 'auto', 'force' or 'skip'.
    """
    if self.impute_missing not in ("skip", "force", "auto"):
        raise ValueError(
            "Invalid value for `impute_missing`, expected any of "
            "{'auto', 'force', 'skip'}, "
            f"got {self.impute_missing!r}. "
        )

    self._clone_transformers()

    # Convert to pandas DataFrame if not already.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    else:
        # Create a copy to avoid altering the original data.
        X = X.copy()

    self.columns_ = X.columns

    # If auto_cast is True, we'll find and apply the best possible type
    # to each column.
    # We'll keep the results in order to apply the types in `transform`.
    if self.auto_cast:
        X = self._auto_cast(X)

    # Select columns by dtype
    numeric_columns = X.select_dtypes(
        include=[
            "int",
            "float",
            np.float64,
            np.float32,
            np.float16,
            np.int64,
            np.int32,
            np.int16,
            np.uint64,
            np.uint32,
            np.uint16,
        ]
    ).columns.to_list()
    categorical_columns = X.select_dtypes(
        include=["string", "object", "category"]
    ).columns.to_list()
    datetime_columns = X.select_dtypes(
        include=["datetime", "datetimetz"]
    ).columns.to_list()

    # Classify categorical columns by cardinality
    n_unique_values = {col: X[col].nunique() for col in categorical_columns}
    low_card_cat_columns = [
        col
        for col in categorical_columns
        if n_unique_values[col] < self.cardinality_threshold
    ]
    high_card_cat_columns = [
        col
        for col in categorical_columns
        if n_unique_values[col] >= self.cardinality_threshold
    ]

    # Next part: construct the transformers
    # Create the list of all the transformers.
    # Every entry uses the resolved (trailing underscore) transformer set
    # by `_clone_transformers`, so that `None` (default) and 'remainder'
    # are handled for all column types, numeric ones included.
    all_transformers: List[Tuple[str, OptionalTransformer, List[str]]] = [
        ("numeric", self.numerical_transformer_, numeric_columns),
        ("datetime", self.datetime_transformer_, datetime_columns),
        ("low_card_cat", self.low_card_cat_transformer_, low_card_cat_columns),
        ("high_card_cat", self.high_card_cat_transformer_, high_card_cat_columns),
    ]
    # We will now filter this list, by keeping only the ones with:
    # - at least one column
    # - a valid encoder or string (filter out if None)
    self.transformers = [
        (name, enc, cols)
        for name, enc, cols in all_transformers
        if len(cols) > 0 and enc is not None
    ]

    self.imputed_columns_ = []
    if self.impute_missing != "skip":
        # First, replace false missing
        # This is technically redundant with the call made in `_auto_cast`,
        # but we do it again anyway.
        X = _replace_false_missing(X)

        # Then, impute if suiting
        if _has_missing_values(X):
            if self.impute_missing == "force":
                for col in X.columns:
                    # Only impute categorical columns
                    if col in categorical_columns:
                        X[col] = _replace_missing_in_cat_col(X[col])
                        self.imputed_columns_.append(col)

            elif self.impute_missing == "auto":
                # Only a OneHotEncoder from scikit-learn < 0.24 triggers
                # the imputation of its columns.
                old_sklearn = parse_version(sklearn_version) < parse_version("0.24")
                for _, trans, cols in all_transformers:
                    if not (isinstance(trans, OneHotEncoder) and old_sklearn):
                        continue
                    for col in cols:
                        # Only impute categorical columns
                        if col in categorical_columns:
                            X[col] = _replace_missing_in_cat_col(X[col])
                            self.imputed_columns_.append(col)

    # If there was missing values imputation, we cast the DataFrame again,
    # as pandas gives different types depending on whether a column has
    # missing values or not.
    if self.imputed_columns_ and self.auto_cast:
        X = self._auto_cast(X)

    if self.verbose:
        print(f"[TableVectorizer] Assigned transformers: {self.transformers}")

    X_enc = super().fit_transform(X, y)

    # For the "remainder" columns, the `ColumnTransformer` `transformers_`
    # attribute contains the index instead of the column name,
    # so we convert the values to the appropriate column names
    # if there is less than 20 columns in the remainder.
    for i, (name, enc, cols) in enumerate(self.transformers_):
        if name == "remainder" and len(cols) < 20:
            # In this case, "cols" is a list of ints (the indices)
            self.transformers_[i] = (name, enc, [self.columns_[j] for j in cols])

    return X_enc
def transform(self, X) -> np.ndarray:
    """Transform `X` by applying the fitted transformers on the columns.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to be transformed.

    Returns
    -------
    {array-like, sparse matrix} of shape (n_samples, sum_n_components)
        Hstack of results of transformers. sum_n_components is the
        sum of n_components (output dimension) over transformers. If
        any result is a sparse matrix, everything will be converted to
        sparse matrices.

    Raises
    ------
    ValueError
        If `X` does not have as many columns as the data seen during fit.
    """
    check_is_fitted(self, attributes=["columns_"])

    # Convert first, so that array-likes without a `shape` attribute
    # (e.g. a list of lists) are accepted, as they are in `fit_transform`.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    else:
        # Create a copy to avoid altering the original data.
        X = X.copy()

    if X.shape[1] != len(self.columns_):
        raise ValueError(
            "Passed array does not match column count of "
            f"array seen during fit. Got {X.shape[1]} "
            f"columns, expected {len(self.columns_)}"
        )

    # Columns are relabelled with the fitted names only when every label
    # differs (typically: `X` was not a DataFrame and has default labels).
    # NOTE(review): a DataFrame whose columns are all in a different order
    # is relabelled too, not reordered - confirm this is intended.
    if (X.columns != self.columns_).all():
        X.columns = self.columns_

    if self.auto_cast:
        X = self._apply_cast(X)

    return super().transform(X)
def get_feature_names_out(self, input_features=None) -> List[str]:
    """Return clean feature names.

    Feature names are formatted like:
    "<column_name>_<value>" if encoded by
    :class:`~sklearn.preprocessing.OneHotEncoder` or alike,
    (e.g. "job_title_Police officer"), or "<column_name>" otherwise.

    Parameters
    ----------
    input_features : None
        Unused, only here for compatibility.

    Returns
    -------
    list of str
        Feature names. If the clean names cannot be matched one-to-one
        with the names given by the parent class, a warning is issued and
        the parent's names are returned instead.
    """
    # scikit-learn renamed `get_feature_names` to `get_feature_names_out`
    # in version 1.0: pick the spelling matching the installed version.
    legacy_sklearn = parse_version(sklearn_version) < parse_version("1.0")

    if legacy_sklearn:
        default_names = super().get_feature_names()
    else:
        default_names = super().get_feature_names_out()

    clean_names = []
    for _name, transformer, columns, _ in self._iter(fitted=True):
        if isinstance(transformer, str):
            if transformer == "drop":
                # Dropped columns produce no feature.
                continue
            if transformer == "passthrough":
                # Positional indices are mapped back to column names.
                if all(isinstance(column, int) for column in columns):
                    columns = [self.columns_[idx] for idx in columns]
                clean_names.extend(columns)
                continue
        if legacy_sklearn:
            produced_names = transformer.get_feature_names(columns)
        else:
            produced_names = transformer.get_feature_names_out(columns)
        clean_names.extend(produced_names)

    if len(default_names) == len(clean_names):
        return clean_names

    warn("Could not extract clean feature names; returning defaults. ")
    return list(default_names)
def get_feature_names(self, input_features=None) -> List[str]:
    """Return clean feature names. Compatibility method for sklearn < 1.0.

    Use :func:`~TableVectorizer.get_feature_names_out` instead.

    Parameters
    ----------
    input_features : None
        Unused, only here for compatibility.

    Returns
    -------
    list of str
        Feature names.
    """
    # From scikit-learn 1.0 onwards this spelling is deprecated: warn,
    # then delegate to the new method in every case.
    sklearn_is_pre_1_0 = parse_version(sklearn_version) < parse_version("1.0")
    if not sklearn_is_pre_1_0:
        deprecation_message = (
            "Following the changes in scikit-learn 1.0, "
            "get_feature_names is deprecated. "
            "Use get_feature_names_out instead. "
        )
        warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return self.get_feature_names_out(input_features)
@deprecated("Use TableVectorizer instead.")
class SuperVectorizer(TableVectorizer):
    """Deprecated name of TableVectorizer, kept for backward compatibility."""