<a href="https://colab.research.google.com/github/ernanhughes/boosting-examples/blob/main/Optuna_XGBoost_hyperparameter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# imporing all the necessary modules
import pandas as pd
import sklearn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings("ignore")

In [2]:
# loading the boston dataset
url = "https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/sklearn/datasets/data/boston_house_prices.csv"

from sklearn.datasets import load_boston
boston = load_boston()

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [None]:
df = pd.DataFrame(boston.data , columns = boston.feature_names)
df['target'] = boston.target

In [None]:
X = df.iloc[:,df.columns != 'target']
y = df.target


from sklearn.preprocessing import StandardScaler
se = StandardScaler()
X = se.fit_transform(X)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state = 12)


# we are converting the training and testing data into xgboost optimized matrix for efficiency

dtrain = xgb.DMatrix(X_train,y_train)
dtest  = xgb.DMatrix(X_test,y_test)


### Testing a base model

In [None]:

import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
cv_ = KFold(n_splits=10,random_state=0)

xg_reg = xgb.XGBRegressor(
)
scores = cross_val_score(xg_reg, X_train,y_train , scoring = 'neg_root_mean_squared_error',n_jobs = -1,cv = cv_)
print(np.mean(scores), np.std(scores))
print(scores)

#### Creating a scoring funtion with the help of XGBOOST train inbuilt early stopping

here we are not tuning the n_estimators with the help of optuna


we will tune the n_estimators with the help of early_stopping by using the xgboost.train

In [None]:

# defing a function for scoring and calculation for the rmse (default for regression )
# in classification problem auc will be the default
import re
def return_rmse(params):
    model = xgb.train(params , dtrain, num_boost_round = 600, evals = [(dtest, 'eval')],
          early_stopping_rounds=20,verbose_eval = 0)
    result = model.eval(dtest)
    result = np.float(re.search(r'[\d.]+$',result).group(0))
    print(result)
    return(result)

In [None]:
return_rmse(study.best_params)

1. **params are the kwargs

In [None]:
import optuna
from optuna import Trial, visualization

from optuna.samplers import TPESampler

In [None]:

def objective(trial):

    param = {
#                 "n_estimators" : trial.suggest_int('n_estimators', 0, 500),
                'max_depth':trial.suggest_int('max_depth', 3, 5),
                'reg_alpha':trial.suggest_uniform('reg_alpha',0,6),
                'reg_lambda':trial.suggest_uniform('reg_lambda',0,2),
                'min_child_weight':trial.suggest_int('min_child_weight',0,5),
                'gamma':trial.suggest_uniform('gamma', 0, 4),
                'learning_rate':trial.suggest_loguniform('learning_rate',0.05,0.5),
                'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.4,0.9),
                'subsample':trial.suggest_uniform('subsample',0.4,0.9),

                'nthread' : -1
            }
    return(return_rmse(param)) # this will return the rmse score


In [None]:
# calling the optuna study
study1 = optuna.create_study(direction='minimize',sampler=TPESampler())
study1.optimize(objective, n_trials= 1050,show_progress_bar = True)

In [None]:
trial = study1.best_trial
print('Accuracy: {}'.format(trial.value))

In [None]:
### printing the best estimators
study1.best_params

In [None]:

# without using any hyperparameter tuning

params = {}
print(f"without tuning{return_rmse(params)}")
print(f"with tuning{return_rmse(study1.best_params)}")

## Plotting of our search history

In [None]:
optuna.visualization.plot_optimization_history(study1)

In [None]:
optuna.visualization.plot_slice(study1)


# Tuning again with more narrower values to get the sweet spot

In [None]:

def objective(trial):

    param = {
#                 "n_estimators" : trial.suggest_int('n_estimators', 0, 500),
                'max_depth':trial.suggest_int('max_depth',4,4),
                'reg_alpha':trial.suggest_uniform('reg_alpha',0,2),
                'reg_lambda':trial.suggest_uniform('reg_lambda',0.5,1),
                'min_child_weight':trial.suggest_int('min_child_weight',1,1),
                'gamma':trial.suggest_int('gamma',1,1),
                'learning_rate':trial.suggest_loguniform('learning_rate',0.2,0.4),
                'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.4,0.6),
                'subsample':trial.suggest_uniform('subsample',0.4,0.5),

                'nthread' : -1
            }
    return(return_rmse(param)) # this will return the rmse score


In [None]:
# calling the optuna study
study2 = optuna.create_study(direction='minimize',sampler=TPESampler())
study2.optimize(objective, n_trials= 1050,show_progress_bar = True)

In [None]:
optuna.visualization.plot_optimization_history(study2)

In [None]:
study2.best_params

In [None]:
params = {}
print(f"without tuning{return_rmse(params)}")
print(f"with tuning{return_rmse(study2.best_params)}")

## visualising the overfitting and underfitting and finding the best estimators

In [None]:
from sklearn.model_selection import validation_curve
param_range = np.arange(10, 250, 2)

# train_scores, test_scores = validation_curve(xg_reg_base,
#                                   X, y, param_name="n_estimators", param_range=param_range,
#                                   cv=cv_shuffle, scoring="neg_root_mean_squared_error", n_jobs=-1)
# train_mean = np.mean(train_scores, axis=1)
# train_std = np.std(train_scores, axis=1)
train_scores = [ ]
test_scores = [ ]
for i in param_range:
    xg_reg = xgb.XGBRegressor(**study2.best_params,

        n_estimators = i

)
    xg_reg.fit(X_train,y_train)
    train_scores.append(np.sqrt(mean_squared_error(y_train,xg_reg.predict(X_train))))
    test_scores.append(np.sqrt(mean_squared_error(y_test,xg_reg.predict(X_test))))

import matplotlib.pyplot as plt

plt.subplots(1, figsize=(7,7))
plt.plot(param_range, train_scores, label="Training score", color="black")
plt.plot(param_range, test_scores, label="Cross-validation score", color="dimgrey")

# plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color="gray")
# plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color="gainsboro")

plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of Trees")
plt.ylabel("Error")
plt.tight_layout()
plt.legend(loc="best")
plt.show()

In [None]:
##finding the best n_estimators with the early stopping

model = xgb.train(study2.best_params , dtrain, num_boost_round = 600, evals = [(dtest, 'eval')],
          early_stopping_rounds=20,verbose_eval = 1)



### final model

In [None]:
xgb_regressor = xgb.XGBRegressor(**study2.best_params, n_estimators = 71)
xgb_regressor_base = xgb.XGBRegressor()

In [None]:
xgb_regressor = xgb.XGBRegressor(**study2.best_params, n_estimators = 71)
xgb_regressor_base.fit(X_train,y_train)
score1 = xgb_regressor_base.score(X_test,y_test)
xgb_regressor.fit(X_train,y_train)
score2 = xgb_regressor.score(X_test,y_test)
print(f"R2 score withouth tuning:{score1} ,R2 score with tuning:{score2}")