In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.simplefilter("ignore")

# Load seaborn's "diamonds" example dataset and preview the first rows.
diamonds = sns.load_dataset(name="diamonds")
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# (rows, columns) of the loaded frame
diamonds.shape

(53940, 10)

In [4]:
from sklearn.model_selection import train_test_split

# Regression setup: `price` is the target (kept as a one-column DataFrame),
# every other column is a feature.
y = diamonds[['price']]
X = diamonds.drop(columns='price')

# Names of the non-numeric (text / categorical) feature columns
cats = list(X.select_dtypes(exclude=np.number).columns)

# Cast all of them to the pandas category dtype in a single call
X = X.astype({name: 'category' for name in cats})

X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

## XGBoost Regression

In [5]:
# Split the data
# random_state is pinned so the same train/test partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
import xgboost as xgb

# Wrap each split in a DMatrix, the container used by XGBoost's native API.
# enable_categorical=True lets the pandas category columns be passed as-is.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [7]:
## Training
# Define hyperparameters.
# GPU acceleration: XGBoost 2.0 deprecated tree_method="gpu_hist" in favour of
# tree_method="hist" combined with device="cuda". Older releases only
# understand the former, so pick the spelling matching the installed version.
if int(xgb.__version__.split(".")[0]) >= 2:
   gpu_params = {"tree_method": "hist", "device": "cuda"}
else:
   gpu_params = {"tree_method": "gpu_hist"}

params = {"objective": "reg:squarederror", **gpu_params}

n = 100  # number of boosting rounds
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [8]:
from sklearn.metrics import mean_squared_error

# Prediction
# Score the test DMatrix with the booster trained in the previous cell.
preds = model.predict(dtest_reg)

In [9]:
# Evaluation
# Take the square root of the MSE explicitly: the `squared=False` shortcut of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 555.607


In [10]:
# Using Validation Sets During Training
# `params` is reused from the training cell above instead of being declared a
# second time, so the two runs cannot drift apart in their hyperparameters.
n = 100

# Each entry pairs a DMatrix to evaluate with the name shown in the training log.
# NOTE: the test matrix doubles as the "validation" set here, so anything tuned
# against this log (e.g. early stopping) is tuned against the test data.
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
)

[0]	train-rmse:2874.29379	validation-rmse:2817.38773
[1]	train-rmse:2092.07711	validation-rmse:2054.73630
[2]	train-rmse:1549.52687	validation-rmse:1526.30592
[3]	train-rmse:1184.46798	validation-rmse:1174.90119
[4]	train-rmse:941.09127	validation-rmse:943.28272
[5]	train-rmse:784.58014	validation-rmse:796.09651
[6]	train-rmse:685.75110	validation-rmse:705.22245
[7]	train-rmse:624.67281	validation-rmse:653.32563
[8]	train-rmse:584.19599	validation-rmse:620.30404
[9]	train-rmse:558.77667	validation-rmse:599.24504
[10]	train-rmse:543.85303	validation-rmse:586.99790
[11]	train-rmse:531.92694	validation-rmse:578.68120
[12]	train-rmse:523.08456	validation-rmse:571.73527
[13]	train-rmse:515.67753	validation-rmse:567.19913
[14]	train-rmse:510.77594	validation-rmse:564.66402
[15]	train-rmse:506.68519	validation-rmse:563.21547
[16]	train-rmse:502.96796	validation-rmse:561.80880
[17]	train-r

### Early stopping

In [11]:
# Early stopping: allow up to 10,000 boosting rounds, but halt as soon as the
# validation loss has gone 50 consecutive rounds without improving.
n = 10_000

model = xgb.train(
   params,
   dtrain_reg,
   num_boost_round=n,
   evals=evals,
   early_stopping_rounds=50,  # activate early stopping
   verbose_eval=50,  # print loss every 50 iters
)

[0]	train-rmse:2874.29379	validation-rmse:2817.38773

[50]	train-rmse:430.07110	validation-rmse:553.50718
[90]	train-rmse:382.65353	validation-rmse:555.74725


### Cross validation

In [12]:
# 5-fold cross-validation on the training matrix.
# `params` is reused from the training cells above rather than declared a third
# time, so the CV estimate always describes the same configuration as the models.
n = 1000

results = xgb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold=5,
   early_stopping_rounds=20  # stop once the CV metric stalls for 20 rounds
)
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2874.530912,9.57651,2877.437274,37.09354
1,2089.327469,8.31729,2094.021636,24.828795
2,1550.617973,5.223297,1558.386252,18.540267
3,1183.812759,5.19342,1195.032441,13.47158
4,941.203113,4.539805,958.728828,9.479449


In [13]:
# Lowest mean test-fold RMSE over the boosting rounds reported by xgb.cv
best_rmse = results['test-rmse-mean'].min()
best_rmse

549.311480649509

## XGBoost Classification

In [14]:
# Re-display the raw frame before switching the target from `price` to `cut`
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [21]:
# Distinct cut grades, i.e. the class labels of the classification task
diamonds['cut'].unique()

['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [15]:
from sklearn.preprocessing import OrdinalEncoder

# Classification setup: `cut` is the target (kept as a one-column DataFrame),
# every other column, price included, is a feature.
y = diamonds[['cut']]
X = diamonds.drop(columns="cut")

# Map each cut label to a numeric code
encoder = OrdinalEncoder()
y_encoded = encoder.fit_transform(y)

# Names of the non-numeric (text / categorical) feature columns
cats = list(X.select_dtypes(exclude=np.number).columns)

# Cast all of them to the pandas category dtype in a single call
X = X.astype({name: 'category' for name in cats})

# Stratify on the encoded target so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
   X,
   y_encoded,
   stratify=y_encoded,
   random_state=1,
)

In [20]:
# Wrap the classification splits in DMatrix containers.
# enable_categorical=True lets the pandas category columns be passed as-is.
dtrain_clf = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [22]:
# classification objectives:
# binary:logistic - binary classification
# multi:softprob - multi-class classification

# GPU acceleration: XGBoost 2.0 deprecated tree_method="gpu_hist" in favour of
# tree_method="hist" combined with device="cuda". Older releases only
# understand the former, so pick the spelling matching the installed version.
if int(xgb.__version__.split(".")[0]) >= 2:
   gpu_params = {"tree_method": "hist", "device": "cuda"}
else:
   gpu_params = {"tree_method": "gpu_hist"}

params = {
   "objective": "multi:softprob",
   # One class per distinct cut grade, derived from the data instead of hard-coded
   "num_class": diamonds["cut"].nunique(),
   **gpu_params,
}
n = 1000

# 5-fold CV reporting log-loss, AUC and error rate for every boosting round
results = xgb.cv(
   params, dtrain_clf,
   num_boost_round=n,
   nfold=5,
   metrics=["mlogloss", "auc", "merror"],
)

In [24]:
# Column labels of the CV results frame (mean/std per metric, train and test)
results.columns

Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',
       'train-auc-std', 'train-merror-mean', 'train-merror-std',
       'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',
       'test-auc-std', 'test-merror-mean', 'test-merror-std'],
      dtype='object')

In [25]:
# First boosting rounds of the cross-validation history
results.head()

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,train-auc-mean,train-auc-std,train-merror-mean,train-merror-std,test-mlogloss-mean,test-mlogloss-std,test-auc-mean,test-auc-std,test-merror-mean,test-merror-std
0,1.257409,0.000782,0.892285,0.000563,0.25571,0.000651,1.260793,0.001452,0.887334,0.001921,0.259523,0.002094
1,1.073167,0.001003,0.897218,0.00016,0.253504,0.000926,1.079268,0.002086,0.891691,0.001992,0.256878,0.002851
2,0.954967,0.001302,0.900405,0.000719,0.251619,0.001775,0.964115,0.002876,0.89442,0.001655,0.255395,0.002223
3,0.874072,0.001335,0.903007,0.00039,0.250031,0.001914,0.886048,0.003267,0.896543,0.001724,0.254554,0.002085
4,0.815379,0.001882,0.905992,0.001058,0.248492,0.001517,0.830267,0.003249,0.898867,0.001481,0.25359,0.002531


In [26]:
# Highest mean test-fold AUC reached in any boosting round
results['test-auc-mean'].max()

0.9402233623451636