From 59c54e361b5795b84efab161c5c37abbdb8c5aab Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 7 Dec 2022 03:19:35 +0800 Subject: [PATCH] [pyspark] Make QDM optional based on cuDF check (#8471) (#8556) Co-authored-by: WeichenXu --- python-package/xgboost/compat.py | 17 +++++++++++++++++ python-package/xgboost/spark/core.py | 13 ++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 275b6621064d..fab734a01361 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -43,6 +43,7 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: pandas_concat = None PANDAS_INSTALLED = False + # sklearn try: from sklearn.base import BaseEstimator as XGBModelBase @@ -72,6 +73,22 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: XGBStratifiedKFold = None +_logger = logging.getLogger(__name__) + + +def is_cudf_available() -> bool: + """Check cuDF package available or not""" + if importlib.util.find_spec("cudf") is None: + return False + try: + import cudf + + return True + except ImportError: + _logger.exception("Importing cuDF failed, use DMatrix instead of QDM") + return False + + class XGBoostLabelEncoder(LabelEncoder): """Label encoder with JSON serialization methods.""" diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index eb1f4e7dfff8..caa6e3cd0931 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -32,6 +32,7 @@ ShortType, ) from scipy.special import expit, softmax # pylint: disable=no-name-in-module +from xgboost.compat import is_cudf_available from xgboost.core import Booster from xgboost.training import train as worker_train @@ -759,7 +760,8 @@ def _fit(self, dataset): k: v for k, v in train_call_kwargs_params.items() if v is not None } dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None} - use_qdm = 
booster_params.get("tree_method", None) in ("hist", "gpu_hist") + + use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist") def _train_booster(pandas_df_iter): """Takes in an RDD partition and outputs a booster for that partition after @@ -773,6 +775,15 @@ def _train_booster(pandas_df_iter): gpu_id = None + # If cuDF is not installed, use DMatrix instead of QDM, + # because without cuDF, DMatrix performs better than QDM. + # Note: `is_cudf_available` is checked on the Spark worker side because + # the Spark worker might have a different Python environment from the driver side. + if use_gpu: + use_qdm = use_hist and is_cudf_available() + else: + use_qdm = use_hist + if use_qdm and (booster_params.get("max_bin", None) is not None): dmatrix_kwargs["max_bin"] = booster_params["max_bin"]