# Categorical Handling with XGBoost

In [4]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
import multiprocessing
import random
import io




from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder
import timeit

## Dataset

Leamos el dataset a utilizar.

In [5]:
from google.colab import files
uploaded = files.upload()

In [6]:
X = pd.read_csv(io.BytesIO(uploaded['healthcare.csv']), sep = ";")
y = X['exitus']
X = X.drop(['exitus'], axis = 1)

In [7]:
perc_values = [0.7, 0.15, 0.15];

Split **based on time**.

In [8]:
# dimensiones de los conjuntos de train y test
n_train = int(X.shape[0] * perc_values[0])
n_val = int(X.shape[0] * perc_values[1])
n_test = int(X.shape[0] * perc_values[2])

In [9]:
categorical_vars = set(['severity', 'origin', 'tip_adm', 'tip_grd', 'date'])
numerical_vars = set(X.columns) - categorical_vars
categorical_vars = list(categorical_vars)
numerical_vars = list(numerical_vars)
ohe = OneHotEncoder(sparse_output = False)
ohe_fit = ohe.fit(X[categorical_vars])
X_ohe = pd.DataFrame(ohe.fit_transform(X[categorical_vars]))
X_ohe.columns = pd.DataFrame(ohe_fit.get_feature_names_out())
X = pd.concat((X_ohe, X[numerical_vars].reset_index()), axis=1)


# selección del conjunto de train
X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

# selección del conjunto de validación
X_val = X.iloc[(n_train):(n_train+n_val)]
y_val = y.iloc[(n_train):(n_train+n_val)]

# selección del conjunto de test
X_test = X.iloc[(n_train+n_val):]
y_test = y.iloc[(n_train+n_val):]


1) Import model.

In [10]:
from xgboost import XGBClassifier as model_constructor

2) Import metric.

In [11]:
from sklearn.metrics import roc_auc_score as metric

3) Define model.

In [12]:
model = model_constructor(eval_metric = "auc",
                          early_stopping_rounds = 10,
                          random_state = 1)

4) Train model.

In [13]:
start = timeit.default_timer()
model.fit(X_train,
          np.array(y_train),
          eval_set=[(X_val, y_val)],
          verbose=True)
time = timeit.default_timer() - start

[0]	validation_0-auc:0.91698
[1]	validation_0-auc:0.91684
[2]	validation_0-auc:0.91877
[3]	validation_0-auc:0.92390
[4]	validation_0-auc:0.92573
[5]	validation_0-auc:0.92136
[6]	validation_0-auc:0.90169
[7]	validation_0-auc:0.90850
[8]	validation_0-auc:0.91820
[9]	validation_0-auc:0.92077
[10]	validation_0-auc:0.92101
[11]	validation_0-auc:0.92404
[12]	validation_0-auc:0.92377
[13]	validation_0-auc:0.92449


5) Predict.

In [14]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)

6) Evaluate.

In [15]:
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1]);
auc_val = metric(y_val, pred_val_p[:,1]);
auc_test = metric(y_test, pred_test_p[:,1]);
results = pd.DataFrame()
results = results.append(pd.DataFrame(data={'model':['XGBoost OHE'],'auc_train':[auc_train],
                                            'auc_val':[auc_val],
                                            'auc_test':[auc_test],
                                            'time':[time]}, columns=['model',  'auc_train','auc_val', 'auc_test', 'time']),
                         ignore_index=True)
results

  results = results.append(pd.DataFrame(data={'model':['XGBoost OHE'],'auc_train':[auc_train],


Unnamed: 0,model,auc_train,auc_val,auc_test,time
0,XGBoost OHE,0.958681,0.925734,0.915126,0.54376


### Versión 2: OHE + tree_method = 'hist'

XGBoost offers different ways to choose partitions and build trees. A very popular one is **tree_method = 'hist'**, which allows to speed up models when working with large datasets (similar approach to LightGBM).

Predictive performance may change, but there is not a clear winner between using 'hist' or other three_methods.

3) Define model.

In [16]:
model = model_constructor(eval_metric = "auc",
                          early_stopping_rounds = 10,
                          random_state = 1,
                          tree_method = 'hist')

4) Train the model.

In [17]:
start = timeit.default_timer()
model.fit(X_train,
          np.array(y_train),
          eval_set=[(X_val, y_val)],
          verbose=True)
time = timeit.default_timer() - start

[0]	validation_0-auc:0.89980
[1]	validation_0-auc:0.91890
[2]	validation_0-auc:0.91864
[3]	validation_0-auc:0.92732
[4]	validation_0-auc:0.92884
[5]	validation_0-auc:0.92690
[6]	validation_0-auc:0.92454
[7]	validation_0-auc:0.92499
[8]	validation_0-auc:0.92714
[9]	validation_0-auc:0.92957
[10]	validation_0-auc:0.93209
[11]	validation_0-auc:0.93342
[12]	validation_0-auc:0.93412
[13]	validation_0-auc:0.93576
[14]	validation_0-auc:0.93528
[15]	validation_0-auc:0.93528
[16]	validation_0-auc:0.93533
[17]	validation_0-auc:0.93746
[18]	validation_0-auc:0.93785
[19]	validation_0-auc:0.93795
[20]	validation_0-auc:0.93788
[21]	validation_0-auc:0.93769
[22]	validation_0-auc:0.93559
[23]	validation_0-auc:0.93532
[24]	validation_0-auc:0.93572
[25]	validation_0-auc:0.93549
[26]	validation_0-auc:0.93486
[27]	validation_0-auc:0.93484
[28]	validation_0-auc:0.93495


5) Predict.

In [18]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)

6) Evaluate.

In [19]:
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1]);
auc_val = metric(y_val, pred_val_p[:,1]);
auc_test = metric(y_test, pred_test_p[:,1]);
results = results.append(pd.DataFrame(data={'model':['XGBoost OHE + hist'],'auc_train':[auc_train],
                                            'auc_val':[auc_val],
                                            'auc_test':[auc_test],
                                            'time':[time]}, columns=['model',  'auc_train','auc_val', 'auc_test', 'time']),
                         ignore_index=True)
results

  results = results.append(pd.DataFrame(data={'model':['XGBoost OHE + hist'],'auc_train':[auc_train],


Unnamed: 0,model,auc_train,auc_val,auc_test,time
0,XGBoost OHE,0.958681,0.925734,0.915126,0.54376
1,XGBoost OHE + hist,0.971118,0.937955,0.931165,0.28372


### Versión 3: tree_method = 'hist'

This time we will not perform any preprocessing step, so **we will not perform encoding**. For this we must indicate that the variables are categorical.

In [20]:
X = pd.read_csv(io.BytesIO(uploaded['healthcare.csv']), sep = ";")
y = X['exitus']
X = X.drop(['exitus'], axis = 1)

In [21]:
X[['severity', 'origin', 'tip_adm', 'tip_grd', 'date']] = X[['severity', 'origin', 'tip_adm', 'tip_grd', 'date']].astype("category")
X.dtypes

date               category
severity           category
mortality_ratio     float64
age                   int64
num_proc              int64
ambulatory            int64
origin             category
expected_length       int64
tip_grd            category
tip_adm            category
dtype: object

In [22]:
# selección del conjunto de train
X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

# selección del conjunto de validación
X_val = X.iloc[(n_train):(n_train+n_val)]
y_val = y.iloc[(n_train):(n_train+n_val)]

# selección del conjunto de test
X_test = X.iloc[(n_train+n_val):]
y_test = y.iloc[(n_train+n_val):]

3) Define model

To enable categorical variable handling we have to set the *enable_categorical* flag. For it to work it is mandatory to set tree_method = 'hist', or tree_method = 'gpu_hist'.

In [23]:
model = model_constructor(eval_metric = "auc",
                        early_stopping_rounds = 10,
                        tree_method = 'hist',
                        enable_categorical = True,
                        random_state = 1,)

4) Train model.


In [24]:
start = timeit.default_timer()
model.fit(X_train,
          np.array(y_train),
          eval_set=[(X_val, y_val)],
          verbose=True)
time = timeit.default_timer() - start

[0]	validation_0-auc:0.90243
[1]	validation_0-auc:0.92402
[2]	validation_0-auc:0.92488
[3]	validation_0-auc:0.93737
[4]	validation_0-auc:0.93679
[5]	validation_0-auc:0.93774
[6]	validation_0-auc:0.93745
[7]	validation_0-auc:0.93703
[8]	validation_0-auc:0.93734
[9]	validation_0-auc:0.93719
[10]	validation_0-auc:0.93761
[11]	validation_0-auc:0.93803
[12]	validation_0-auc:0.93904
[13]	validation_0-auc:0.93892
[14]	validation_0-auc:0.93927
[15]	validation_0-auc:0.93971
[16]	validation_0-auc:0.93930
[17]	validation_0-auc:0.94100
[18]	validation_0-auc:0.94146
[19]	validation_0-auc:0.94128
[20]	validation_0-auc:0.94125
[21]	validation_0-auc:0.94094
[22]	validation_0-auc:0.94094
[23]	validation_0-auc:0.94118
[24]	validation_0-auc:0.94116
[25]	validation_0-auc:0.94082
[26]	validation_0-auc:0.94084
[27]	validation_0-auc:0.94079


5) Predict.

In [25]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)

6) Evaluate.

In [26]:
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1]);
auc_val = metric(y_val, pred_val_p[:,1]);
auc_test = metric(y_test, pred_test_p[:,1]);
results = results.append(pd.DataFrame(data={'model':['XGBoost internal handling + hist'],'auc_train':[auc_train],
                                            'auc_val':[auc_val],
                                            'auc_test':[auc_test],
                                            'time':[time]}, columns=['model',  'auc_train','auc_val', 'auc_test', 'time']), ignore_index=True)
results

  results = results.append(pd.DataFrame(data={'model':['XGBoost internal handling + hist'],'auc_train':[auc_train],


Unnamed: 0,model,auc_train,auc_val,auc_test,time
0,XGBoost OHE,0.958681,0.925734,0.915126,0.54376
1,XGBoost OHE + hist,0.971118,0.937955,0.931165,0.28372
2,XGBoost internal handling + hist,0.970515,0.942455,0.935135,0.278032


Let's see what is the best approach....

In [27]:
results['model'][results['auc_test'].idxmax()]

'XGBoost internal handling + hist'