[pyspark] Make QDM optional based on cuDF check (#8471) (#8556)

Co-authored-by: WeichenXu <weichen.xu@databricks.com>
dmlc · Dec 6, 2022 · 59c54e3 · 59c54e3
1 parent 60a8c8e
commit 59c54e3
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 1 deletion.
diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
@@ -43,6 +43,7 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
     pandas_concat = None
     PANDAS_INSTALLED = False
 
+
 # sklearn
 try:
     from sklearn.base import BaseEstimator as XGBModelBase
@@ -72,6 +73,22 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
     XGBStratifiedKFold = None
 
 
+_logger = logging.getLogger(__name__)
+
+
+def is_cudf_available() -> bool:
+    """Return whether the cuDF package can be imported in this environment.
+
+    Returns ``False`` when no ``cudf`` module spec is found, or when importing
+    ``cudf`` raises ``ImportError`` (the failure is logged, not re-raised).
+    Returns ``True`` when the import succeeds.
+    """
+    # Cheap check first: skip the import attempt when no spec can be found.
+    if importlib.util.find_spec("cudf") is None:
+        return False
+    try:
+        # A spec can exist while the import itself still fails, so actually
+        # import the package to confirm it is usable.
+        import cudf
+
+        return True
+    except ImportError:
+        # Log the failure with its traceback and report cuDF as unavailable.
+        _logger.exception("Importing cuDF failed, use DMatrix instead of QDM")
+        return False
+
+
 class XGBoostLabelEncoder(LabelEncoder):
     """Label encoder with JSON serialization methods."""
 

diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
@@ -32,6 +32,7 @@
     ShortType,
 )
 from scipy.special import expit, softmax  # pylint: disable=no-name-in-module
+from xgboost.compat import is_cudf_available
 from xgboost.core import Booster
 from xgboost.training import train as worker_train
 
@@ -759,7 +760,8 @@ def _fit(self, dataset):
             k: v for k, v in train_call_kwargs_params.items() if v is not None
         }
         dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
-        use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
+
+        use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
 
         def _train_booster(pandas_df_iter):
             """Takes in an RDD partition and outputs a booster for that partition after
@@ -773,6 +775,15 @@ def _train_booster(pandas_df_iter):
 
             gpu_id = None
 
+            # If cuDF is not installed, use DMatrix instead of QDM, because
+            # without cuDF, DMatrix performs better than QDM.
+            # Note: `is_cudf_available` is checked on the Spark worker side because
+            # a Spark worker might have a different Python environment from the driver.
+            if use_gpu:
+                use_qdm = use_hist and is_cudf_available()
+            else:
+                use_qdm = use_hist
+
             if use_qdm and (booster_params.get("max_bin", None) is not None):
                 dmatrix_kwargs["max_bin"] = booster_params["max_bin"]