In [89]:
import pandas as pd

In [90]:
dat = pd.read_csv('../data/dataset_mock_final.csv', sep=';')

In [91]:
dat.head()

Unnamed: 0,date,severity,mortality_ratio,age,num_proc,ambulatory,origin,expected_length,tip_grd,tip_adm,exitus,dataset
0,2016-07,,0.001193,15603.0,4.0,,,7.0,M,1.0,0,train
1,2016-05,1.0,0.0,14285.0,3.0,,1.0,,M,1.0,0,train
2,2016-01,,0.0,6046.0,2.0,,,2.0,,1.0,0,train
3,2016-01,1.0,0.00406,27340.0,4.0,,2.0,9.0,Q,,0,train
4,2016-05,2.0,0.028365,28685.0,10.0,0.0,,9.0,M,1.0,0,train


In [92]:
dat.drop('date', axis = 1, inplace = True)

In [93]:
cat_var = ['severity', 'ambulatory', 'origin', 'tip_grd', 'tip_adm']
non_cat_var = list(set(dat.columns) - set(cat_var))
num_var = list(set(dat.columns) - set(cat_var) - {'dataset', 'exitus'})

In [94]:
dat.isna().any()

severity            True
mortality_ratio     True
age                 True
num_proc            True
ambulatory          True
origin              True
expected_length     True
tip_grd             True
tip_adm             True
exitus             False
dataset            False
dtype: bool

In [95]:
from sklearn.ensemble import RandomForestRegressor
from fancyimpute import IterativeImputer as MICE

# 3) Define "model"
model = MICE(estimator=RandomForestRegressor())

# 4) Train "model"
model.fit(dat[num_var][dat['dataset'] == 'train'])

# 5) "Predict"
dat[num_var] = model.transform(dat[num_var])
dat.isna().any()



severity            True
mortality_ratio    False
age                False
num_proc           False
ambulatory          True
origin              True
expected_length    False
tip_grd             True
tip_adm             True
exitus             False
dataset            False
dtype: bool

In [96]:
dat[cat_var] = dat[cat_var].astype('str')

In [97]:
dat.loc[dat['dataset'] == 'train', cat_var] = dat.loc[dat['dataset'] == 'train', cat_var].fillna('UNKNOWN')
dat[cat_var][dat['dataset'] == 'train'].isna().sum()

severity      0
ambulatory    0
origin        0
tip_grd       0
tip_adm       0
dtype: int64

In [98]:
dat.isna().any()

severity           False
mortality_ratio    False
age                False
num_proc           False
ambulatory         False
origin             False
expected_length    False
tip_grd            False
tip_adm            False
exitus             False
dataset            False
dtype: bool

In [99]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output = False, drop='first')

# 4) Training model
ohe.fit(dat[cat_var][dat['dataset'] == 'train'])

# 5) Predicting
dat_ohe = pd.DataFrame(ohe.fit_transform(dat[cat_var]))

# Optional
dat_ohe.columns = ohe.get_feature_names_out()
dat = pd.concat((dat[non_cat_var], dat_ohe), axis=1)

In [100]:
100*dat.groupby(['exitus'])['exitus'].agg(['count'])/dat.shape[0]

Unnamed: 0_level_0,count
exitus,Unnamed: 1_level_1
0,96.235664
1,3.764336


In [101]:
def compute_sampling_strategy(frac_minority, minority_count, majority_count):
    synthetic_samples = (frac_minority * majority_count - (1 - frac_minority) * minority_count) / (1 - frac_minority)
    strategy = (minority_count + synthetic_samples) / majority_count
    return strategy

# Assume you have counts for your classes
minority_count = sum(dat['exitus'] == 1)
majority_count = sum(dat['exitus'] == 0)

# For a 10-90 split:
fraction = 0.1
sampling_value = compute_sampling_strategy(fraction, minority_count, majority_count)
print(f"For a {fraction*100}% minority class after oversampling, set sampling_strategy to {sampling_value:.2f} in SMOTE.")

For a 10.0% minority class after oversampling, set sampling_strategy to 0.11 in SMOTE.


In [102]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy =sampling_value,
           random_state = 0,
           k_neighbors = 5)

X_res, y_res = sm.fit_resample(dat.drop(['exitus', 'dataset'], axis = 1), dat['exitus'])

X_res['exitus'] = y_res

X_res['dataset'] = 'train'

dat_new = pd.concat([X_res, dat[dat['dataset'] == 'val'], dat[dat['dataset'] == 'test']])

# Checking the class distribution after SMOTE
100*X_res.exitus.value_counts()/X_res.shape[0]

exitus
0    90.000865
1     9.999135
Name: count, dtype: float64

# Model Random Forest

In [103]:
from sklearn.metrics import roc_auc_score as metric

In [104]:
from sklearn.ensemble import RandomForestClassifier as model_constructor

In [218]:
# Random Forest
n_estimators_values = [100, 120, 140]
max_features_values = [4, 5, 6]
max_samples_values = [100, 1000, dat[dat['dataset'] == 'train'].shape[0]]

params_grid = {'max_features': max_features_values,
              'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}


In [219]:
num_iter = 1
grid_results = pd.DataFrame(columns = ('max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'))

for max_features in params_grid['max_features']:
    for n_estimators in params_grid['n_estimators']:
        for max_samples in params_grid['max_samples']:

                        # Print trace
                        print('Iteracion = ' + str(num_iter))

                        # [3] Define model
                        model = model_constructor(max_features = max_features,
                                                  n_estimators = n_estimators,
                                                  max_samples = max_samples,
                                                  random_state = 0)

                        # [4] Train model
                        model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1),
                                  dat[dat['dataset'] == 'train'].exitus.values)


                        # [5] Predict
                        pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
                        pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!

                        # [6] Compute metric
                        metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
                        metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])

                        # print error
                        print('Metric train = %.2f - Metric validation = %.2f.'
                              % (metric_train, metric_val))

                        # Save iteration results
                        grid_results.loc[num_iter]=[ max_features,
                                                    n_estimators,
                                                    max_samples,
                                                 metric_train,
                                                 metric_val]
                        num_iter += 1



Iteracion = 1
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 2
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 3


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is

Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 4
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 5
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 6


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is

Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 7
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 8
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 9


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is

Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 10
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 11
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 12


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_f

Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 13
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 14
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 15


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_f

Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 16
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 17
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 18


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_f

Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 19
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 20
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 21
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 22


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_f

Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 23
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 24
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 25


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_f

Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 26
Metric train = 1.00 - Metric validation = 0.89.
Iteracion = 27
Metric train = 1.00 - Metric validation = 0.89.


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
Parameters: { "max_features", "max_samples" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is

In [107]:
grid_results = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, False])
best_model = grid_results.iloc[0]
best_model

max_features      5.000000
n_estimators    140.000000
max_samples     100.000000
metric_train      0.968764
metric_val        0.921079
Name: 16, dtype: float64

In [108]:
model =  model_constructor(max_features = int(best_model['max_features']),
                                                  n_estimators = int(best_model['n_estimators']),
                                                  max_samples = int(best_model['max_samples']),
                                                  random_state = 0)

In [109]:
# [4] Train model
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)


# [5] Predict
pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test[:,1])

In [217]:
# print error
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % (metric_train, metric_val, metric_test))

Metric train = 0.9688 - Metric val = 0.9211 - Metric test = 0.9194


# XGBoost

In [127]:
from xgboost import XGBClassifier as model_constructor
from sklearn.metrics import roc_auc_score as metric

In [212]:
model = model_constructor(eval_metric="auc",
                          early_stopping_rounds=30,
                          max_depth=8,
                          random_state=1,
                          learning_rate=0.05,
                          colsample_bytree = 0.5,
                          alpha=5,
                          reg_lambda=20,
                          n_estimators=2000)

In [213]:
import numpy as np
import timeit

start = timeit.default_timer()
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1),
          np.array(dat[dat['dataset'] == 'train'].exitus.values),
          eval_set=[(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'val'].exitus.values)],
          verbose=True)
time = timeit.default_timer() - start

[0]	validation_0-auc:0.84925
[1]	validation_0-auc:0.85053
[2]	validation_0-auc:0.85001
[3]	validation_0-auc:0.85092
[4]	validation_0-auc:0.85021
[5]	validation_0-auc:0.89049
[6]	validation_0-auc:0.89049
[7]	validation_0-auc:0.89352
[8]	validation_0-auc:0.89556
[9]	validation_0-auc:0.89501
[10]	validation_0-auc:0.89320
[11]	validation_0-auc:0.89401
[12]	validation_0-auc:0.89220
[13]	validation_0-auc:0.89224
[14]	validation_0-auc:0.89413
[15]	validation_0-auc:0.90190
[16]	validation_0-auc:0.90185
[17]	validation_0-auc:0.90133
[18]	validation_0-auc:0.90118
[19]	validation_0-auc:0.90190
[20]	validation_0-auc:0.90192
[21]	validation_0-auc:0.90078
[22]	validation_0-auc:0.90103
[23]	validation_0-auc:0.90093
[24]	validation_0-auc:0.90151
[25]	validation_0-auc:0.90064
[26]	validation_0-auc:0.90174
[27]	validation_0-auc:0.90675
[28]	validation_0-auc:0.90673
[29]	validation_0-auc:0.90731
[30]	validation_0-auc:0.90704
[31]	validation_0-auc:0.90732
[32]	validation_0-auc:0.90830
[33]	validation_0-au

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


[98]	validation_0-auc:0.91784
[99]	validation_0-auc:0.91808
[100]	validation_0-auc:0.91833
[101]	validation_0-auc:0.91834
[102]	validation_0-auc:0.91821
[103]	validation_0-auc:0.91813
[104]	validation_0-auc:0.91812
[105]	validation_0-auc:0.91827
[106]	validation_0-auc:0.91831
[107]	validation_0-auc:0.91831
[108]	validation_0-auc:0.91837
[109]	validation_0-auc:0.91829
[110]	validation_0-auc:0.91842
[111]	validation_0-auc:0.91837
[112]	validation_0-auc:0.91834
[113]	validation_0-auc:0.91857
[114]	validation_0-auc:0.91854
[115]	validation_0-auc:0.91846
[116]	validation_0-auc:0.91881
[117]	validation_0-auc:0.91878
[118]	validation_0-auc:0.91873
[119]	validation_0-auc:0.91866
[120]	validation_0-auc:0.91857
[121]	validation_0-auc:0.91890
[122]	validation_0-auc:0.91894
[123]	validation_0-auc:0.91907
[124]	validation_0-auc:0.91901
[125]	validation_0-auc:0.91913
[126]	validation_0-auc:0.91912
[127]	validation_0-auc:0.91917
[128]	validation_0-auc:0.91912
[129]	validation_0-auc:0.91933
[130]	vali

In [214]:
model.best_iteration

671

In [215]:
pred_train_p = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1))
pred_val_p = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1))
pred_test_p = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1))

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [216]:
# Calcular métricas de evaluación
auc_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train_p[:,1])
auc_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val_p[:,1])
auc_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test_p[:,1])
results = pd.DataFrame()

new_data = pd.DataFrame(data={'model': ['XGBoost (Default)'], 'mse_train': [auc_train], 'mse_val': [auc_val], 'mse_test': [auc_test]}, columns=['model', 'mse_train', 'mse_val', 'mse_test'])

results = pd.concat([results, new_data], ignore_index=True)

print(results)

               model  mse_train   mse_val  mse_test
0  XGBoost (Default)   0.960466  0.924534  0.921526
