In [1]:
import xgboost
print(xgboost.__version__)
from sklearn.datasets import make_classification

2.0.0


In [2]:
model = xgboost.XGBRFClassifier(  # or regressor
    n_estimators=100,  # number of trees
    subsample=0.9,  # no bootstrapping, but can take subsample without replacement. Values 0.8 or 0.9 are good because it ensures the dataset is large enough to train a good model but different enough to introduce diversity
    colsample_bynode=0.2  # no. features used at each split. Takes a percentage of columns from the dataset
)

In [4]:
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
print(X.shape, y.shape)

(1000, 20) (1000,)


In [6]:
from numpy import (mean, std)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# define model eval procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# eval model and collect scores
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=1)

# report performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Mean Accuracy: 0.890 (0.025)


In [7]:
# Now we can try using it as the final model, so we fit it on all the data and use predict
from numpy import asarray

model = xgboost.XGBRFClassifier(n_estimators=100, subsample=0.9, colsample_bynode=0.2)
model.fit(X, y)

# make a single prediction
row = [0.2929949, -4.21223056, -1.288332, -2.17849815, -0.64527665, 2.58097719, 0.28422388, -7.1827928, -2.80703576, 3.140559, 0.20690368, 3.98044255, 3.20046209, -0.83626539, 2.13684936, -6.945091, -1.83014102, 0.75920444, 0.74603947, -2.06694896]
row = asarray([row])
yhat = model.predict(row)
print('Predicted Class: %d' % yhat[0])

Predicted Class: 1


In [8]:
# Now let's try Regression
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=42)
print(X.shape, y.shape)

(1000, 20) (1000,)


In [13]:
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRFRegressor

model = XGBRFRegressor(n_estimators=100, subsample=0.9, colsample_bynode=0.2)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=1)

# report performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: -121.738 (9.639)
