LightGBM - GBDT model earlystop bug fix #3614

Merged · 15 commits · Nov 8, 2023

Changes from all commits
80 changes: 55 additions & 25 deletions deepchem/models/gbdt_models/gbdt_model.py
@@ -40,10 +40,19 @@ def __init__(self,
early_stopping_rounds: int, optional (default 50)
Activates early stopping. Validation metric needs to improve at least once
in every early_stopping_rounds round(s) to continue training.
eval_metric: Union[str, Callbale]
eval_metric: Union[str, Callable]
If string, it should be a built-in evaluation metric to use.
If callable, it should be a custom evaluation metric, see official note for more details.
"""

try:
import xgboost
import lightgbm
except:
raise ModuleNotFoundError(
"XGBoost or LightGBM modules not found. This function requires these modules to be installed."
)

if model_dir is not None:
if not os.path.exists(model_dir):
os.makedirs(model_dir)
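
For context, the parameters documented in the docstring above are supplied when wrapping a scikit-learn-style booster in GBDTModel. A minimal usage sketch, assuming xgboost is installed; the "rmse" metric string, sample counts, and hyperparameter values are illustrative rather than taken from this PR:

import numpy as np
import xgboost
import deepchem as dc

# Toy regression data; sizes are arbitrary.
X = np.random.rand(100, 10)
y = np.random.rand(100)
dataset = dc.data.NumpyDataset(X, y)

# eval_metric may be a built-in metric name or a callable; after this PR,
# early_stopping_rounds must be at least 1.
xgb_model = xgboost.XGBRegressor(n_estimators=50, random_state=0)
model = dc.models.GBDTModel(xgb_model,
                            early_stopping_rounds=5,
                            eval_metric="rmse")
model.fit(dataset)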
@@ -55,27 +64,41 @@ def __init__(self,
self.early_stopping_rounds = early_stopping_rounds
self.model_type = self._check_model_type()

if self.early_stopping_rounds <= 0:
raise ValueError("Early Stopping Rounds cannot be less than 1.")

if self.model.__class__.__name__.startswith('XGB'):
Review comment (Member): Let's add a unit test for callbacks
Reply (Contributor Author): Yes, I'll do that

self.callbacks = [
xgboost.callback.EarlyStopping(
rounds=self.early_stopping_rounds)
]
elif self.model.__class__.__name__.startswith('LGBM'):
self.callbacks = [
lightgbm.early_stopping(
stopping_rounds=self.early_stopping_rounds),
]

if eval_metric is None:
if self.model_type == 'classification':
self.eval_metric: Optional[Union[str, Callable]] = 'auc'
elif self.model_type == 'regression':
self.eval_metric = 'mae'
if self.model_type == "classification":
self.eval_metric: Optional[Union[str, Callable]] = "auc"
elif self.model_type == "regression":
self.eval_metric = "mae"
else:
self.eval_metric = eval_metric
else:
self.eval_metric = eval_metric

def _check_model_type(self) -> str:
class_name = self.model.__class__.__name__
if class_name.endswith('Classifier'):
return 'classification'
elif class_name.endswith('Regressor'):
return 'regression'
elif class_name == 'NoneType':
return 'none'
if class_name.endswith("Classifier"):
return "classification"
elif class_name.endswith("Regressor"):
return "regression"
elif class_name == "NoneType":
return "none"
else:
raise ValueError(
'{} is not a supported model instance.'.format(class_name))
"{} is not a supported model instance.".format(class_name))

def fit(self, dataset: Dataset):
"""Fits GDBT model with all data.
@@ -98,7 +121,7 @@ def fit(self, dataset: Dataset):

seed = self.model.random_state
stratify = None
if self.model_type == 'classification':
if self.model_type == "classification":
stratify = y

# Find optimal n_estimators based on original learning_rate and early_stopping_rounds
Expand All @@ -107,17 +130,21 @@ def fit(self, dataset: Dataset):
test_size=0.2,
random_state=seed,
stratify=stratify)
self.model.fit(X_train,
y_train,
early_stopping_rounds=self.early_stopping_rounds,
eval_metric=self.eval_metric,
eval_set=[(X_test, y_test)])
self.model.fit(
X_train,
y_train,
callbacks=self.callbacks,
eval_metric=self.eval_metric,
eval_set=[(X_test, y_test)],
)

# retrain model to whole data using best n_estimators * 1.25
if self.model.__class__.__name__.startswith('XGB'):
estimated_best_round = np.round(self.model.best_iteration * 1.25)
estimated_best_round = np.round(
(self.model.best_iteration + 1) * 1.25)
else:
estimated_best_round = np.round(self.model.best_iteration_ * 1.25)

self.model.n_estimators = np.int64(estimated_best_round)
self.model.fit(X, y, eval_metric=self.eval_metric)
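
The retraining step above scales the early-stopped round count by 1.25 before refitting on the full dataset. XGBoost reports best_iteration as a zero-based index, so the actual number of boosting rounds is best_iteration + 1, whereas LightGBM's best_iteration_ already counts rounds from 1; that is what the added "+ 1" on the XGBoost branch compensates for. A tiny arithmetic sketch with illustrative values:

import numpy as np

# Suppose early stopping settled on 40 boosting rounds.
xgb_best_iteration = 39   # zero-based index reported by XGBoost
lgbm_best_iteration = 40  # one-based round count reported by LightGBM

# After the fix, both branches estimate the same retraining budget.
assert np.round((xgb_best_iteration + 1) * 1.25) == 50
assert np.round(lgbm_best_iteration * 1.25) == 50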

Expand All @@ -139,11 +166,13 @@ def fit_with_eval(self, train_dataset: Dataset, valid_dataset: Dataset):
if len(y_train.shape) != 1 or len(y_valid.shape) != 1:
raise ValueError("GDBT model doesn't support multi-output(task)")

self.model.fit(X_train,
y_train,
early_stopping_rounds=self.early_stopping_rounds,
eval_metric=self.eval_metric,
eval_set=[(X_valid, y_valid)])
self.model.fit(
X_train,
y_train,
callbacks=self.callbacks,
eval_metric=self.eval_metric,
eval_set=[(X_valid, y_valid)],
)


#########################################
@@ -156,5 +185,6 @@ class XGBoostModel(GBDTModel):
def __init__(self, *args, **kwargs):
warnings.warn(
"XGBoostModel is deprecated and has been renamed to GBDTModel.",
FutureWarning)
FutureWarning,
)
super(XGBoostModel, self).__init__(*args, **kwargs)
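
When a held-out validation set already exists, fit_with_eval() applies the same callback-driven early stopping without the internal 80/20 split used by fit(). A hedged usage sketch; the dataset sizes and slicing are illustrative:

import numpy as np
import lightgbm
import deepchem as dc

X = np.random.rand(1000, 20)
y = np.random.rand(1000)
train = dc.data.NumpyDataset(X[:800], y[:800])
valid = dc.data.NumpyDataset(X[800:], y[800:])

lgbm_model = lightgbm.LGBMRegressor(n_estimators=100, random_state=0)
model = dc.models.GBDTModel(lgbm_model, early_stopping_rounds=10)
# Early stopping monitors the supplied validation set via the callbacks set up in __init__.
model.fit_with_eval(train, valid)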
58 changes: 58 additions & 0 deletions deepchem/models/tests/test_gbdt_model.py
@@ -293,3 +293,61 @@ def test_reload_with_lightgbm():
# eval model on test
scores = reloaded_model.evaluate(test_dataset, [regression_metric])
assert scores[regression_metric.name] < 55


@unittest.skipIf(not has_xgboost_and_lightgbm,
'xgboost or lightgbm are not installed')
def test_earlystopping_with_xgboost():
np.random.seed(123)

# prepare dataset
N_samples = 50000
n_features = 1000
X = np.random.rand(N_samples, n_features)
y = np.random.rand(N_samples)
dataset = dc.data.NumpyDataset(X, y)

# xgboost test
xgb_model = xgboost.XGBRegressor(n_estimators=20, random_state=123)
model = dc.models.GBDTModel(xgb_model, early_stopping_rounds=3)
# fit trained model
model.fit(dataset)

# If ES rounds are more than total epochs, it will never trigger
if model.early_stopping_rounds < model.model.n_estimators:
# Find the number of boosting rounds in the model
res = list(model.model.evals_result_['validation_0'].values())
rounds_boosted = len(res[0])
# If rounds boosted are less than total estimators, it means ES was triggered
if rounds_boosted < model.model.n_estimators:
assert model.model.best_iteration < model.model.n_estimators - 1


@unittest.skipIf(not has_xgboost_and_lightgbm,
'xgboost or lightgbm are not installed')
def test_earlystopping_with_lightgbm():
np.random.seed(123)

# prepare dataset
N_samples = 50000
n_features = 1000
X = np.random.rand(N_samples, n_features)
y = np.random.rand(N_samples)
dataset = dc.data.NumpyDataset(X, y)

# lightgbm test
lgbm_model = lightgbm.LGBMRegressor(n_estimators=20,
random_state=123,
silent=True)
model = dc.models.GBDTModel(lgbm_model, early_stopping_rounds=3)
# fit trained model
model.fit(dataset)

# If ES rounds are more than total epochs, it will never trigger
if model.early_stopping_rounds < model.model.n_estimators:
# Find the number of boosting rounds in the model
res = list(model.model.evals_result_['valid_0'].values())
rounds_ran = len(res[0])
# If rounds ran are less than estimators, it means ES was triggered
if rounds_ran < model.model.n_estimators:
assert model.model.best_iteration_ < model.model.n_estimators