# Missing Values with XGBoost

In [None]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
import multiprocessing
import random
import io




from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder
import timeit

## Dataset

In [None]:
from google.colab import files
uploaded = files.upload()

Saving healthcare_missing.csv to healthcare_missing.csv


In [None]:
X = pd.read_csv(io.BytesIO(uploaded['healthcare_missing.csv']), sep = ",")
y = X['exitus']
X = X.drop(['exitus'], axis = 1)

In this case, **we have missing values** in our dataset.

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32706 entries, 0 to 32705
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             32706 non-null  object 
 1   severity         12072 non-null  float64
 2   mortality_ratio  30438 non-null  float64
 3   age              32282 non-null  float64
 4   num_proc         31356 non-null  float64
 5   ambulatory       1959 non-null   float64
 6   origin           12246 non-null  float64
 7   expected_length  30833 non-null  float64
 8   tip_grd          19090 non-null  object 
 9   tip_adm          26617 non-null  float64
dtypes: float64(8), object(2)
memory usage: 2.5+ MB


## Preprocessing

### One-Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
categorical_vars = set(['severity', 'origin', 'tip_adm', 'tip_grd', 'date'])
numerical_vars = set(X.columns) - categorical_vars
categorical_vars = list(categorical_vars)
numerical_vars = list(numerical_vars)
ohe = OneHotEncoder(sparse_output = False)
ohe_fit = ohe.fit(X[categorical_vars])
X_ohe = pd.DataFrame(ohe.fit_transform(X[categorical_vars]))
X = pd.concat((X_ohe, X[numerical_vars].reset_index()), axis=1)
X.columns = X.columns.astype(str)

In [None]:
perc_values = [0.7, 0.15, 0.15];

# dimensiones de los conjuntos de train y test
n_train = int(X.shape[0] * perc_values[0])
n_val = int(X.shape[0] * perc_values[1])
n_test = int(X.shape[0] * perc_values[2])

# selección del conjunto de train
X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

# selección del conjunto de validación
X_val = X.iloc[(n_train):(n_train+n_val)]
y_val = y.iloc[(n_train):(n_train+n_val)]

# selección del conjunto de test
X_test = X.iloc[(n_train+n_val):]
y_test = y.iloc[(n_train+n_val):]

## SVM

1) Import model.

In [None]:
from sklearn.svm import SVC as model_constructor

2) Import metric.

In [None]:
from sklearn.metrics import roc_auc_score as metric

3) Define model.

In [None]:
model = model_constructor(random_state = 1)

4) Train model.

In [None]:
model.fit(X_train,
          np.array(y_train))

ValueError: ignored

As we saw in the lectures, most models **do not accept missing values**. However, **tree-based models** are an **exception**.


## XGBoost

Let's try now with XGBoost.

1) Import model.

In [None]:
from xgboost import XGBClassifier as model_constructor

3) Define model.

In [None]:
model = model_constructor(random_state = 1,
                          eval_metric = "auc",
                          early_stopping_rounds = 10)

4) Train model.

In [None]:
model.fit(X_train,
          np.array(y_train),
          eval_set=[(X_val, y_val)],
          verbose=True)

[0]	validation_0-auc:0.89973
[1]	validation_0-auc:0.91525
[2]	validation_0-auc:0.91774
[3]	validation_0-auc:0.91067
[4]	validation_0-auc:0.90782
[5]	validation_0-auc:0.91180
[6]	validation_0-auc:0.92061
[7]	validation_0-auc:0.91752
[8]	validation_0-auc:0.92060
[9]	validation_0-auc:0.91927
[10]	validation_0-auc:0.92122
[11]	validation_0-auc:0.92396
[12]	validation_0-auc:0.92536
[13]	validation_0-auc:0.92517
[14]	validation_0-auc:0.92569
[15]	validation_0-auc:0.92677
[16]	validation_0-auc:0.92879
[17]	validation_0-auc:0.92844
[18]	validation_0-auc:0.92805
[19]	validation_0-auc:0.92805
[20]	validation_0-auc:0.92731
[21]	validation_0-auc:0.92756
[22]	validation_0-auc:0.92185
[23]	validation_0-auc:0.92296
[24]	validation_0-auc:0.92370
[25]	validation_0-auc:0.92053


It works!

5) Predict.

In [None]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)

6) Evaluate.

In [None]:
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1]);
auc_val = metric(y_val, pred_val_p[:,1]);
auc_test = metric(y_test, pred_test_p[:,1]);
results = pd.DataFrame()
results = results.append(pd.DataFrame(data={'model':['XGBoost (Default)'],'auc_train':[auc_train],'auc_val':[auc_val],'auc_test':[auc_test]}, columns=['model',  'auc_train','auc_val', 'auc_test']), ignore_index=True)
results

  results = results.append(pd.DataFrame(data={'model':['XGBoost (Default)'],'auc_train':[auc_train],'auc_val':[auc_val],'auc_test':[auc_test]}, columns=['model',  'auc_train','auc_val', 'auc_test']), ignore_index=True)


Unnamed: 0,model,auc_train,auc_val,auc_test
0,XGBoost (Default),0.970404,0.928788,0.924611


Let's compare their performance with respect to a basic method such as filling with the mean.

In [1]:
numerical_vars = list(set(numerical_vars) - set(['exitus']))
means = X_train[numerical_vars].apply(lambda x: np.mean(x)).to_dict()
X_train = X_train.fillna(value = means, axis = 0)
X_val = X_val.fillna(value = means, axis = 0)
X_test = X_test.fillna(value = means, axis = 0)

NameError: ignored

3) Define model.

In [None]:
model = model_constructor(random_state = 1,
                          eval_metric = "auc",
                          early_stopping_rounds = 10,)

4) Train model.

In [None]:
model.fit(X_train,
          np.array(y_train),
          eval_set=[(X_val, y_val)],
          verbose=True)

[0]	validation_0-auc:0.90076
[1]	validation_0-auc:0.91611
[2]	validation_0-auc:0.91815
[3]	validation_0-auc:0.91183
[4]	validation_0-auc:0.90665
[5]	validation_0-auc:0.91775
[6]	validation_0-auc:0.91534
[7]	validation_0-auc:0.91279
[8]	validation_0-auc:0.91802
[9]	validation_0-auc:0.91735
[10]	validation_0-auc:0.91910
[11]	validation_0-auc:0.92132
[12]	validation_0-auc:0.92542
[13]	validation_0-auc:0.92540
[14]	validation_0-auc:0.92655
[15]	validation_0-auc:0.92706
[16]	validation_0-auc:0.92868
[17]	validation_0-auc:0.93025
[18]	validation_0-auc:0.93078
[19]	validation_0-auc:0.93074
[20]	validation_0-auc:0.93078
[21]	validation_0-auc:0.93099
[22]	validation_0-auc:0.92544
[23]	validation_0-auc:0.92546
[24]	validation_0-auc:0.92434
[25]	validation_0-auc:0.92454
[26]	validation_0-auc:0.92471
[27]	validation_0-auc:0.92391
[28]	validation_0-auc:0.92318
[29]	validation_0-auc:0.92373
[30]	validation_0-auc:0.92408


5) Predict.

In [None]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)

6) Evaluate.

In [None]:
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1]);
auc_val = metric(y_val, pred_val_p[:,1]);
auc_test = metric(y_test, pred_test_p[:,1]);
results = results.append(pd.DataFrame(data={'model':['XGBoost fill missing'],'auc_train':[auc_train],'auc_val':[auc_val],'auc_test':[auc_test]}, columns=['model',  'auc_train','auc_val', 'auc_test']), ignore_index=True)
results

  results = results.append(pd.DataFrame(data={'model':['XGBoost fill missing'],'auc_train':[auc_train],'auc_val':[auc_val],'auc_test':[auc_test]}, columns=['model',  'auc_train','auc_val', 'auc_test']), ignore_index=True)


Unnamed: 0,model,auc_train,auc_val,auc_test
0,XGBoost (Default),0.970404,0.928788,0.924611
1,XGBoost fill missing,0.970084,0.930988,0.925687
