Fixes for the latest pandas. (#10266)

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
dmlc · May 12, 2024 · d81e319
1 parent 5e816e6
commit d81e319
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 28 deletions.
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
@@ -370,10 +370,8 @@ def pandas_feature_info(
     if feature_names is None and meta is None:
         if isinstance(data.columns, pd.MultiIndex):
             feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
-        elif isinstance(data.columns, (pd.Index, pd.RangeIndex)):
-            feature_names = list(map(str, data.columns))
         else:
-            feature_names = data.columns.format()
+            feature_names = list(data.columns.map(str))
 
     # handle feature types
     if feature_types is None and meta is None:
@@ -865,18 +863,30 @@ def _is_cudf_df(data: DataType) -> bool:
     return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
 
 
+def _get_cudf_cat_predicate() -> Callable[[Any], bool]:
+    try:
+        from cudf import CategoricalDtype
+
+        def is_categorical_dtype(dtype: Any) -> bool:
+            return isinstance(dtype, CategoricalDtype)
+
+    except ImportError:
+        try:
+            from cudf.api.types import is_categorical_dtype  # type: ignore
+        except ImportError:
+            from cudf.utils.dtypes import is_categorical_dtype  # type: ignore
+
+    return is_categorical_dtype
+
+
 def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
     """Extract CuDF __cuda_array_interface__.  This is special as it returns a new list
     of data and a list of array interfaces.  The data is list of categorical codes that
     caller can safely ignore, but have to keep their reference alive until usage of
     array interface is finished.
 
     """
-    try:
-        from cudf.api.types import is_categorical_dtype
-    except ImportError:
-        from cudf.utils.dtypes import is_categorical_dtype
-
+    is_categorical_dtype = _get_cudf_cat_predicate()
     interfaces = []
 
     def append(interface: dict) -> None:
@@ -908,12 +918,13 @@ def _transform_cudf_df(
     feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
 ) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
+
     try:
-        from cudf.api.types import is_bool_dtype, is_categorical_dtype
+        from cudf.api.types import is_bool_dtype
     except ImportError:
-        from cudf.utils.dtypes import is_categorical_dtype
         from pandas.api.types import is_bool_dtype
 
+    is_categorical_dtype = _get_cudf_cat_predicate()
     # Work around https://github.com/dmlc/xgboost/issues/10181
     if _is_cudf_ser(data):
         if is_bool_dtype(data.dtype):
@@ -941,15 +952,8 @@ def _transform_cudf_df(
             feature_names = [data.name]
         elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
             feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
-        elif (
-            lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
-            or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
-            # Unique to cuDF, no equivalence in pandas 1.3.3
-            or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
-        ):
-            feature_names = list(map(str, data.columns))
         else:
-            feature_names = data.columns.format()
+            feature_names = list(data.columns.map(str))
 
     # handle feature types
     if feature_types is None:

diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
@@ -280,10 +280,12 @@ def test_pandas_sparse(self):
             }
         )
         y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
-        dtrain = xgb.DMatrix(X, y)
+        with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
+            dtrain = xgb.DMatrix(X, y)
         booster = xgb.train({}, dtrain, num_boost_round=4)
-        predt_sparse = booster.predict(xgb.DMatrix(X))
-        predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
+        with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
+            predt_sparse = booster.predict(xgb.DMatrix(X))
+            predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
         np.testing.assert_allclose(predt_sparse, predt_dense)
 
     def test_pandas_label(
@@ -572,14 +574,16 @@ def test_pandas_sparse_column_split(self):
         y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
 
         def verify_pandas_sparse():
-            dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL)
+            with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
+                dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL)
             booster = xgb.train({}, dtrain, num_boost_round=4)
-            predt_sparse = booster.predict(
-                xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
-            )
-            predt_dense = booster.predict(
-                xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL)
-            )
+            with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
+                predt_sparse = booster.predict(
+                    xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
+                )
+                predt_dense = booster.predict(
+                    xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL)
+                )
             np.testing.assert_allclose(predt_sparse, predt_dense)
 
         tm.run_with_rabit(world_size=3, test_fn=verify_pandas_sparse)