From ad06172c6b19dde83e2e937904b64a2d87fe01f0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 26 Apr 2022 18:53:43 +0800 Subject: [PATCH] Refactor pandas dataframe handling. (#7843) --- python-package/xgboost/data.py | 133 ++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 53 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 00d47599fe73..47c41d994d8b 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -6,7 +6,7 @@ import json import warnings import os -from typing import Any, Tuple, Callable, Optional, List, Union, Iterator +from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type import numpy as np @@ -21,8 +21,6 @@ CAT_T = "c" # meta info that can be a matrix instead of vector. -# For now it's base_margin for multi-class, but it can be extended to label once we have -# multi-output. _matrix_meta = {"base_margin", "label"} @@ -253,41 +251,19 @@ def _invalid_dataframe_dtype(data: Any) -> None: raise ValueError(msg) -# pylint: disable=too-many-locals -def _transform_pandas_df( +def _pandas_feature_info( data: DataFrame, + meta: Optional[str], + feature_names: FeatureNames, + feature_types: FeatureTypes, enable_categorical: bool, - feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, - meta: Optional[str] = None, - meta_type: Optional[str] = None, -) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]: +) -> Tuple[FeatureNames, FeatureTypes]: import pandas as pd from pandas.api.types import ( is_sparse, is_categorical_dtype, - is_integer_dtype, - is_bool_dtype, ) - nullable_alias = {"Int16", "Int32", "Int64"} - - # dtype: pd.core.arrays.numeric.NumericDtype - def is_nullable_dtype(dtype: Any) -> bool: - is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias - # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`. 
- is_bool = is_bool_dtype(dtype) and dtype.name == "boolean" - return is_int or is_bool - - if not all( - dtype.name in _pandas_dtype_mapper - or is_sparse(dtype) - or is_nullable_dtype(dtype) - or (is_categorical_dtype(dtype) and enable_categorical) - for dtype in data.dtypes - ): - _invalid_dataframe_dtype(data) - # handle feature names if feature_names is None and meta is None: if isinstance(data.columns, pd.MultiIndex): @@ -300,43 +276,94 @@ def is_nullable_dtype(dtype: Any) -> bool: # handle feature types if feature_types is None and meta is None: feature_types = [] - for i, dtype in enumerate(data.dtypes): + for dtype in data.dtypes: if is_sparse(dtype): feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) elif is_categorical_dtype(dtype) and enable_categorical: feature_types.append(CAT_T) else: feature_types.append(_pandas_dtype_mapper[dtype.name]) + return feature_names, feature_types - # handle category codes. - transformed = pd.DataFrame() - # Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented - if ( - enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes) - ) or any(is_nullable_dtype(dtype) for dtype in data.dtypes): - for i, dtype in enumerate(data.dtypes): - if is_categorical_dtype(dtype): - # pandas uses -1 as default missing value for categorical data - transformed[data.columns[i]] = ( - data[data.columns[i]] - .cat.codes.astype(np.float32) - .replace(-1.0, np.NaN) - ) - elif is_nullable_dtype(dtype): - # Converts integer to float NaN - transformed[data.columns[i]] = data[data.columns[i]].astype(np.float32) - else: - transformed[data.columns[i]] = data[data.columns[i]] + + +def is_nullable_dtype(dtype: Any) -> bool: + """Whether dtype is a pandas nullable type.""" + from pandas.api.types import is_integer_dtype, is_bool_dtype + # dtype: pd.core.arrays.numeric.NumericDtype + nullable_alias = {"Int16", "Int32", "Int64"} + is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias + # 
np.bool has alias `bool`, while pd.BooleanDtype has `boolean`. + is_bool = is_bool_dtype(dtype) and dtype.name == "boolean" + return is_int or is_bool + + +def _pandas_cat_null(data: DataFrame) -> DataFrame: + from pandas.api.types import is_categorical_dtype + # handle category codes and nullable. + cat_columns = [ + col + for col, dtype in zip(data.columns, data.dtypes) + if is_categorical_dtype(dtype) + ] + nul_columns = [ + col for col, dtype in zip(data.columns, data.dtypes) if is_nullable_dtype(dtype) + ] + if cat_columns or nul_columns: + # Avoid transformation due to: PerformanceWarning: DataFrame is highly + # fragmented + transformed = data.copy() else: transformed = data + if cat_columns: + # DF doesn't have the cat attribute, so we use apply here + transformed[cat_columns] = ( + transformed[cat_columns] + .apply(lambda x: x.cat.codes) + .astype(np.float32) + .replace(-1.0, np.NaN) + ) + if nul_columns: + transformed[nul_columns] = transformed[nul_columns].astype(np.float32) + + return transformed + + +def _transform_pandas_df( + data: DataFrame, + enable_categorical: bool, + feature_names: FeatureNames = None, + feature_types: FeatureTypes = None, + meta: Optional[str] = None, + meta_type: Optional[str] = None, +) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]: + from pandas.api.types import ( + is_sparse, + is_categorical_dtype, + ) + + if not all( + dtype.name in _pandas_dtype_mapper + or is_sparse(dtype) + or is_nullable_dtype(dtype) + or (is_categorical_dtype(dtype) and enable_categorical) + for dtype in data.dtypes + ): + _invalid_dataframe_dtype(data) + + feature_names, feature_types = _pandas_feature_info( + data, meta, feature_names, feature_types, enable_categorical + ) + + transformed = _pandas_cat_null(data) + if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") - dtype = meta_type if meta_type else np.float32 - arr = transformed.values + dtype: 
Union[Type[np.floating], str] = meta_type if meta_type else np.float32 + arr: np.ndarray = transformed.values if meta_type: - arr = arr.astype(meta_type) + arr = arr.astype(dtype) return arr, feature_names, feature_types