[breaking] Remove duplicated predict functions.
* Rename all `data` to `X`.
trivialfis committed Jan 12, 2021
1 parent f2f7dd8 commit cf7660a
Showing 2 changed files with 52 additions and 81 deletions.
115 changes: 41 additions & 74 deletions python-package/xgboost/sklearn.py
@@ -687,10 +687,16 @@ def fit(self, X, y, *, sample_weight=None, base_margin=None,
 
         return self
 
-    def predict(self, data, output_margin=False, ntree_limit=None,
-                validate_features=True, base_margin=None):
+    def predict(
+        self,
+        X,
+        output_margin=False,
+        ntree_limit=None,
+        validate_features=True,
+        base_margin=None
+    ):
         """
-        Predict with `data`.
+        Predict with `X`.
 
         .. note:: This function is not thread safe.
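
Because the first parameter is renamed from `data` to `X`, any caller that passed it
by keyword breaks, while positional calls keep working. A minimal migration sketch
(`model` and `X_test` are placeholder names, not from this diff):

    # before: model.predict(data=X_test)   # raises TypeError after this commit
    preds = model.predict(X_test)          # positional call is unaffected
    preds = model.predict(X=X_test)        # or use the new keyword
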
@@ -704,7 +710,7 @@ def predict(self, data, output_margin=False, ntree_limit=None,
         Parameters
         ----------
-        data : array_like
+        X : array_like
             Data to predict with
         output_margin : bool
             Whether to output the raw untransformed margin value.
@@ -723,16 +729,21 @@ def predict(self, data, output_margin=False, ntree_limit=None,
         prediction : numpy array
 
         """
         # pylint: disable=missing-docstring,invalid-name
-        test_dmatrix = DMatrix(data, base_margin=base_margin,
+        test_dmatrix = DMatrix(X, base_margin=base_margin,
                                missing=self.missing, nthread=self.n_jobs)
         # get ntree_limit to use - if none specified, default to
         # best_ntree_limit if defined, otherwise 0.
         if ntree_limit is None:
-            ntree_limit = getattr(self, "best_ntree_limit", 0)
-        return self.get_booster().predict(test_dmatrix,
-                                          output_margin=output_margin,
-                                          ntree_limit=ntree_limit,
-                                          validate_features=validate_features)
+            try:
+                ntree_limit = self.best_ntree_limit
+            except AttributeError:
+                ntree_limit = 0
+        return self.get_booster().predict(
+            test_dmatrix,
+            output_margin=output_margin,
+            ntree_limit=ntree_limit,
+            validate_features=validate_features
+        )
 
     def apply(self, X, ntree_limit=0):
         """Return the predicted leaf every tree for each sample.
@@ -1048,50 +1059,21 @@ def fit(self, X, y, *, sample_weight=None, base_margin=None,
             'Fit gradient boosting model',
             'Fit gradient boosting classifier', 1)
 
-    def predict(self, data, output_margin=False, ntree_limit=None,
-                validate_features=True, base_margin=None):
-        """
-        Predict with `data`.
-
-        .. note:: This function is not thread safe.
-
-          For each booster object, predict can only be called from one thread.
-          If you want to run prediction using multiple thread, call
-          ``xgb.copy()`` to make copies of model object and then call
-          ``predict()``.
-
-          .. code-block:: python
-
-            preds = bst.predict(dtest, ntree_limit=num_round)
-
-        Parameters
-        ----------
-        data : array_like
-            Feature matrix.
-        output_margin : bool
-            Whether to output the raw untransformed margin value.
-        ntree_limit : int
-            Limit number of trees in the prediction; defaults to
-            best_ntree_limit if defined (i.e. it has been trained with early
-            stopping), otherwise 0 (use all trees).
-        validate_features : bool
-            When this is True, validate that the Booster's and data's
-            feature_names are identical. Otherwise, it is assumed that the
-            feature_names are the same.
-
-        Returns
-        -------
-        prediction : numpy array
-        """
-        test_dmatrix = DMatrix(data, base_margin=base_margin,
-                               missing=self.missing, nthread=self.n_jobs)
-        if ntree_limit is None:
-            ntree_limit = getattr(self, "best_ntree_limit", 0)
-        class_probs = self.get_booster().predict(
-            test_dmatrix,
+    def predict(
+        self,
+        X,
+        output_margin=False,
+        ntree_limit=None,
+        validate_features=True,
+        base_margin=None
+    ):
+        class_probs = super().predict(
+            X=X,
             output_margin=output_margin,
             ntree_limit=ntree_limit,
-            validate_features=validate_features)
+            validate_features=validate_features,
+            base_margin=base_margin
+        )
         if output_margin:
             # If output_margin is active, simply return the scores
             return class_probs
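
With the duplicate body gone, the DMatrix construction and the ntree_limit handling
live in exactly one place, XGBModel.predict, and the classifier only layers class
decoding on top. A simplified sketch of that delegation pattern (toy classes, not
the real ones):

    import numpy as np

    class Model:
        def predict(self, X):
            # stand-in for the shared DMatrix construction and booster call
            return np.asarray(X).sum(axis=1)

    class Classifier(Model):
        def predict(self, X):
            scores = super().predict(X)      # reuse the single implementation
            return (scores > 0).astype(int)  # classifier-specific decoding
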
@@ -1136,13 +1118,13 @@ def predict_proba(self, X, ntree_limit=None, validate_features=False,
         a numpy array of shape array-like of shape (n_samples, n_classes) with the
         probability of each data example being of a given class.
         """
-        test_dmatrix = DMatrix(X, base_margin=base_margin,
-                               missing=self.missing, nthread=self.n_jobs)
-        if ntree_limit is None:
-            ntree_limit = getattr(self, "best_ntree_limit", 0)
-        class_probs = self.get_booster().predict(test_dmatrix,
-                                                 ntree_limit=ntree_limit,
-                                                 validate_features=validate_features)
+        class_probs = super().predict(
+            X=X,
+            output_margin=False,
+            ntree_limit=ntree_limit,
+            validate_features=validate_features,
+            base_margin=base_margin
+        )
         return _cls_predict_proba(self.objective, class_probs, np.vstack)
 
     def evals_result(self):
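
Routing predict_proba through the same parent method removes the second hand-rolled
DMatrix construction; callers see no change. A short usage sketch (assumes a fitted
XGBClassifier named `clf` and a feature matrix `X_test`):

    proba = clf.predict_proba(X_test)   # shape (n_samples, n_classes)
    labels = clf.predict(X_test)        # class ids derived from the same scores
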
@@ -1510,18 +1492,3 @@ def fit(
         self.evals_result = evals_result
 
         return self
-
-    def predict(self, data, output_margin=False,
-                ntree_limit=0, validate_features=True, base_margin=None):
-
-        test_dmatrix = DMatrix(data, base_margin=base_margin,
-                               missing=self.missing)
-        if ntree_limit is None:
-            ntree_limit = getattr(self, "best_ntree_limit", 0)
-
-        return self.get_booster().predict(test_dmatrix,
-                                          output_margin=output_margin,
-                                          ntree_limit=ntree_limit,
-                                          validate_features=validate_features)
-
-    predict.__doc__ = XGBModel.predict.__doc__
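
This override belonged to XGBRanker, which now simply inherits XGBModel.predict.
Two incidental fixes are visible in the removed code: ntree_limit defaulted to 0, so
the `if ntree_limit is None` branch could never fire, and the DMatrix was built
without nthread=self.n_jobs; the inherited method handles both. Callers are
unaffected apart from the data-to-X rename (assumes a fitted XGBRanker named
`ranker`):

    scores = ranker.predict(X_test)  # now dispatches to XGBModel.predict
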
18 changes: 11 additions & 7 deletions python-package/xgboost/training.py
@@ -114,7 +114,10 @@ def _train_internal(params, dtrain,
 
         num_groups = int(config['learner']['learner_model_param']['num_class'])
         num_groups = 1 if num_groups == 0 else num_groups
-        bst.best_ntree_limit = ((bst.best_iteration + 1) * num_parallel_tree * num_groups)
+        bst.set_attr(
+            best_ntree_limit=str((bst.best_iteration + 1) * num_parallel_tree * num_groups)
+        )
+        bst.best_ntree_limit = int(bst.attr("best_ntree_limit"))
 
     # Copy to serialise and unserialise booster to reset state and free
     # training memory
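
Booster attributes are stored as strings, which is presumably why best_ntree_limit
is now persisted through set_attr (so it survives serialisation) and converted back
with int() for the Python-side convenience field. A hedged sketch of the round trip
(assumes a trained Booster named `bst`; the value 24 is arbitrary):

    bst.set_attr(best_ntree_limit="24")        # attribute values must be strings
    limit = int(bst.attr("best_ntree_limit"))  # read back and restore the integer
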
@@ -148,15 +151,16 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         Activates early stopping. Validation metric needs to improve at least once in
         every **early_stopping_rounds** round(s) to continue training.
         Requires at least one item in **evals**.
-        The method returns the model from the last iteration (not the best one).
-        If there's more than one item in **evals**, the last entry will be used
-        for early stopping.
+        The method returns the model from the last iteration (not the best one). Use
+        custom callback or model slicing if the best model is desired.
+        If there's more than one item in **evals**, the last entry will be used for early
+        stopping.
         If there's more than one metric in the **eval_metric** parameter given in
         **params**, the last metric will be used for early stopping.
         If early stopping occurs, the model will have three additional fields:
-        ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``.
-        (Use ``bst.best_ntree_limit`` to get the correct value if
-        ``num_parallel_tree`` and/or ``num_class`` appears in the parameters)
+        ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. (Use
+        ``bst.best_ntree_limit`` to get the correct value if ``num_parallel_tree`` and/or
+        ``num_class`` appears in the parameters)
     evals_result: dict
         This dictionary stores the evaluation results of all the items in watchlist.
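
A short usage sketch of the fields described above (assumes xgboost imported as
`xgb`, a params dict, and DMatrix objects `dtrain`/`dvalid`):

    bst = xgb.train(params, dtrain, num_boost_round=100,
                    evals=[(dvalid, "validation")],
                    early_stopping_rounds=10)
    # cap prediction at the best round found by early stopping
    preds = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit)
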