From 5309c7789839e91703a43795292200edaa003107 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 11 May 2024 01:48:39 +0800 Subject: [PATCH] Fixes for the latest pandas. --- python-package/xgboost/data.py | 40 ++++++++++++++++++-------------- tests/python/test_with_pandas.py | 24 +++++++++++-------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index bae96051e90c..28ee57cb768b 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -370,10 +370,8 @@ def pandas_feature_info( if feature_names is None and meta is None: if isinstance(data.columns, pd.MultiIndex): feature_names = [" ".join([str(x) for x in i]) for i in data.columns] - elif isinstance(data.columns, (pd.Index, pd.RangeIndex)): - feature_names = list(map(str, data.columns)) else: - feature_names = data.columns.format() + feature_names = list(data.columns.map(str)) # handle feature types if feature_types is None and meta is None: @@ -865,6 +863,22 @@ def _is_cudf_df(data: DataType) -> bool: return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame") +def _get_cudf_cat_predicate() -> Callable[[Any], bool]: + try: + from cudf import CategoricalDtype + + def is_categorical_dtype(dtype: Any) -> bool: + return isinstance(dtype, CategoricalDtype) + + except ImportError: + try: + from cudf.api.types import is_categorical_dtype # type: ignore + except ImportError: + from cudf.utils.dtypes import is_categorical_dtype # type: ignore + + return is_categorical_dtype + + def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: """Extract CuDF __cuda_array_interface__. This is special as it returns a new list of data and a list of array interfaces. The data is list of categorical codes that @@ -872,11 +886,7 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: array interface is finished. """ - try: - from cudf.api.types import is_categorical_dtype - except ImportError: - from cudf.utils.dtypes import is_categorical_dtype - + is_categorical_dtype = _get_cudf_cat_predicate() interfaces = [] def append(interface: dict) -> None: @@ -908,12 +918,13 @@ def _transform_cudf_df( feature_types: Optional[FeatureTypes], enable_categorical: bool, ) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]: + try: - from cudf.api.types import is_bool_dtype, is_categorical_dtype + from cudf.api.types import is_bool_dtype except ImportError: - from cudf.utils.dtypes import is_categorical_dtype from pandas.api.types import is_bool_dtype + is_categorical_dtype = _get_cudf_cat_predicate() # Work around https://github.com/dmlc/xgboost/issues/10181 if _is_cudf_ser(data): if is_bool_dtype(data.dtype): @@ -941,15 +952,8 @@ def _transform_cudf_df( feature_names = [data.name] elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"): feature_names = [" ".join([str(x) for x in i]) for i in data.columns] - elif ( - lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex") - or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index") - # Unique to cuDF, no equivalence in pandas 1.3.3 - or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index") - ): - feature_names = list(map(str, data.columns)) else: - feature_names = data.columns.format() + feature_names = list(data.columns.map(str)) # handle feature types if feature_types is None: diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index 8194f5947c16..27be831d3f88 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -280,10 +280,12 @@ def test_pandas_sparse(self): } ) y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows))) - dtrain = xgb.DMatrix(X, y) + with pytest.warns(UserWarning, match="Sparse arrays from pandas"): + dtrain = xgb.DMatrix(X, y) booster = xgb.train({}, dtrain, num_boost_round=4) - predt_sparse = booster.predict(xgb.DMatrix(X)) - predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense())) + with pytest.warns(UserWarning, match="Sparse arrays from pandas"): + predt_sparse = booster.predict(xgb.DMatrix(X)) + predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense())) np.testing.assert_allclose(predt_sparse, predt_dense) def test_pandas_label( @@ -572,14 +574,16 @@ def test_pandas_sparse_column_split(self): y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows))) def verify_pandas_sparse(): - dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL) + with pytest.warns(UserWarning, match="Sparse arrays from pandas"): + dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL) booster = xgb.train({}, dtrain, num_boost_round=4) - predt_sparse = booster.predict( - xgb.DMatrix(X, data_split_mode=DataSplitMode.COL) - ) - predt_dense = booster.predict( - xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL) - ) + with pytest.warns(UserWarning, match="Sparse arrays from pandas"): + predt_sparse = booster.predict( + xgb.DMatrix(X, data_split_mode=DataSplitMode.COL) + ) + predt_dense = booster.predict( + xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL) + ) np.testing.assert_allclose(predt_sparse, predt_dense) tm.run_with_rabit(world_size=3, test_fn=verify_pandas_sparse)