In [1]:
import pandas as pd

In [2]:
dat = pd.read_csv('../data/dataset_mock_final.csv', sep=';')

In [3]:
dat.head()

Unnamed: 0,date,severity,mortality_ratio,age,num_proc,ambulatory,origin,expected_length,tip_grd,tip_adm,exitus,dataset
0,2016-07,,0.001193,15603.0,4.0,,,7.0,M,1.0,0,train
1,2016-05,1.0,0.0,14285.0,3.0,,1.0,,M,1.0,0,train
2,2016-01,,0.0,6046.0,2.0,,,2.0,,1.0,0,train
3,2016-01,1.0,0.00406,27340.0,4.0,,2.0,9.0,Q,,0,train
4,2016-05,2.0,0.028365,28685.0,10.0,0.0,,9.0,M,1.0,0,train


In [4]:
dat.drop('date', axis = 1, inplace = True)

In [5]:
cat_var = ['severity', 'ambulatory', 'origin', 'tip_grd', 'tip_adm']
non_cat_var = list(set(dat.columns) - set(cat_var))
num_var = list(set(dat.columns) - set(cat_var) - {'dataset', 'exitus'})

In [6]:
dat.isna().any()

severity            True
mortality_ratio     True
age                 True
num_proc            True
ambulatory          True
origin              True
expected_length     True
tip_grd             True
tip_adm             True
exitus             False
dataset            False
dtype: bool

In [7]:
from sklearn.ensemble import RandomForestRegressor
from fancyimpute import IterativeImputer as MICE

# 3) Define "model"
model = MICE(estimator=RandomForestRegressor())

# 4) Train "model"
model.fit(dat[num_var][dat['dataset'] == 'train'])

# 5) "Predict"
dat[num_var] = model.transform(dat[num_var])
dat.isna().any()



severity            True
mortality_ratio    False
age                False
num_proc           False
ambulatory          True
origin              True
expected_length    False
tip_grd             True
tip_adm             True
exitus             False
dataset            False
dtype: bool

In [8]:
dat[cat_var] = dat[cat_var].astype('str')

In [9]:
dat.loc[dat['dataset'] == 'train', cat_var] = dat.loc[dat['dataset'] == 'train', cat_var].fillna('UNKNOWN')
dat[cat_var][dat['dataset'] == 'train'].isna().sum()

severity      0
ambulatory    0
origin        0
tip_grd       0
tip_adm       0
dtype: int64

In [10]:
dat.isna().any()

severity           False
mortality_ratio    False
age                False
num_proc           False
ambulatory         False
origin             False
expected_length    False
tip_grd            False
tip_adm            False
exitus             False
dataset            False
dtype: bool

In [11]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output = False, drop='first')

# 4) Training model
ohe.fit(dat[cat_var][dat['dataset'] == 'train'])

# 5) Predicting
dat_ohe = pd.DataFrame(ohe.fit_transform(dat[cat_var]))

# Optional
dat_ohe.columns = ohe.get_feature_names_out()
dat = pd.concat((dat[non_cat_var], dat_ohe), axis=1)

In [12]:
100*dat.groupby(['exitus'])['exitus'].agg(['count'])/dat.shape[0]

Unnamed: 0_level_0,count
exitus,Unnamed: 1_level_1
0,96.235664
1,3.764336


In [13]:
def compute_sampling_strategy(frac_minority, minority_count, majority_count):
    synthetic_samples = (frac_minority * majority_count - (1 - frac_minority) * minority_count) / (1 - frac_minority)
    strategy = (minority_count + synthetic_samples) / majority_count
    return strategy

# Assume you have counts for your classes
minority_count = sum(dat['exitus'] == 1)
majority_count = sum(dat['exitus'] == 0)

# For a 10-90 split:
fraction = 0.1
sampling_value = compute_sampling_strategy(fraction, minority_count, majority_count)
print(f"For a {fraction*100}% minority class after oversampling, set sampling_strategy to {sampling_value:.2f} in SMOTE.")

For a 10.0% minority class after oversampling, set sampling_strategy to 0.11 in SMOTE.


In [14]:
%%capture
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy =sampling_value,
           random_state = 0,
           k_neighbors = 5)

X_res, y_res = sm.fit_resample(dat.drop(['exitus', 'dataset'], axis = 1), dat['exitus'])

X_res['exitus'] = y_res

X_res['dataset'] = 'train'

dat_new = pd.concat([X_res, dat[dat['dataset'] == 'val'], dat[dat['dataset'] == 'test']])

# Checking the class distribution after SMOTE


In [15]:
100*X_res.exitus.value_counts()/X_res.shape[0]

exitus
0    90.000865
1     9.999135
Name: count, dtype: float64

# Model Random Forest

In [16]:
from sklearn.metrics import roc_auc_score as metric

In [17]:
from sklearn.ensemble import RandomForestClassifier as model_constructor

In [18]:
# Random Forest
n_estimators_values = [120, 140, 150]
max_features_values = [3, 4, 5, 6]
max_samples_values = [60, 80, 100, 120]

params_grid = {'max_features': max_features_values,
              'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}


In [19]:
num_iter = 1
grid_results = pd.DataFrame(columns = ('max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'))

for max_features in params_grid['max_features']:
    for n_estimators in params_grid['n_estimators']:
        for max_samples in params_grid['max_samples']:

                        # Print trace
                        print('Iteracion = ' + str(num_iter))

                        # [3] Define model
                        model = model_constructor(max_features = max_features,
                                                  n_estimators = n_estimators,
                                                  max_samples = max_samples,
                                                  random_state = 0)

                        # [4] Train model
                        model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1),
                                  dat[dat['dataset'] == 'train'].exitus.values)


                        # [5] Predict
                        pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
                        pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!

                        # [6] Compute metric
                        metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
                        metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])

                        # print error
                        print('Metric train = %.2f - Metric validation = %.2f.'
                              % (metric_train, metric_val))

                        # Save iteration results
                        grid_results.loc[num_iter]=[ max_features,
                                                    n_estimators,
                                                    max_samples,
                                                 metric_train,
                                                 metric_val]
                        num_iter += 1



Iteracion = 1
Metric train = 0.96 - Metric validation = 0.92.
Iteracion = 2
Metric train = 0.96 - Metric validation = 0.92.
Iteracion = 3
Metric train = 0.97 - Metric validation = 0.92.
Iteracion = 4
Metric train = 0.98 - Metric validation = 0.92.
Iteracion = 5
Metric train = 0.96 - Metric validation = 0.92.
Iteracion = 6
Metric train = 0.96 - Metric validation = 0.92.
Iteracion = 7
Metric train = 0.97 - Metric validation = 0.92.
Iteracion = 8
Metric train = 0.98 - Metric validation = 0.92.
Iteracion = 9
Metric train = 0.96 - Metric validation = 0.92.
Iteracion = 10
Metric train = 0.96 - Metric validation = 0.92.
Iteracion = 11
Metric train = 0.97 - Metric validation = 0.92.
Iteracion = 12
Metric train = 0.98 - Metric validation = 0.92.
Iteracion = 13
Metric train = 0.96 - Metric validation = 0.92.
Iteracion = 14
Metric train = 0.96 - Metric validation = 0.92.
Iteracion = 15
Metric train = 0.97 - Metric validation = 0.92.
Iteracion = 16
Metric train = 0.98 - Metric validation = 0.92.
I

In [20]:
grid_results = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, False])
best_model = grid_results.iloc[0]
best_model

max_features      5.000000
n_estimators    150.000000
max_samples      60.000000
metric_train      0.958636
metric_val        0.929833
Name: 33, dtype: float64

In [21]:
model =  model_constructor(max_features = int(best_model['max_features']),
                                                  n_estimators = int(best_model['n_estimators']),
                                                  max_samples = int(best_model['max_samples']),
                                                  random_state = 0)

In [22]:
# [4] Train model
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)


# [5] Predict
pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test[:,1])

In [23]:
# print error
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % (metric_train, metric_val, metric_test))

Metric train = 0.9586 - Metric val = 0.9298 - Metric test = 0.9284


# XGBoost

In [24]:
from xgboost import XGBClassifier as model_constructor
from sklearn.metrics import roc_auc_score as metric

In [25]:
# Xgboost
n_estimators_values = [1000]
learning_rate_values = [1]
gamma_values = [1, 10]
max_depth_values = [5, 6, 8]
min_child_weight_values = [20, 30, 40]
subsample_values = [0.1, 1]
colsample_bytree_values = [0.1, 1]
num_parallel_tree_values = [10]

params_grid = {'n_estimators': n_estimators_values,
                  'learning_rate': learning_rate_values,
                 'gamma': gamma_values,
                 'max_depth': max_depth_values,
                 'min_child_weight': min_child_weight_values,
                 'subsample': subsample_values,
                 'colsample_bytree': colsample_bytree_values,
                 'num_parallel_tree': num_parallel_tree_values}

In [26]:
%%capture
num_iter = 1
grid_results = pd.DataFrame(columns = ('n_estimators',
                                       'learning_rate',
                                       'gamma',
                                       'max_depth',
                                       'min_child_weight',
                                       'subsample',
                                       'colsample_bytree',
                                       'num_parallel_tree',
                                       'best_iteration',
                                       'metric_train',
                                       'metric_val'))

for n_estimators in params_grid['n_estimators']:
    for learning_rate in params_grid['learning_rate']:
        for gamma in params_grid['gamma']:
            for max_depth in params_grid['max_depth']:
                for min_child_weight in params_grid['min_child_weight']:
                    for subsample in params_grid['subsample']:
                        for colsample_bytree in params_grid['colsample_bytree']:
                            for num_parallel_tree in params_grid['num_parallel_tree']:




                                                # Print trace
                                                print('Iteration = ' + str(num_iter))

                                                # [3] Define model
                                                model = model_constructor(n_estimators = n_estimators,
                                                                      learning_rate = learning_rate,
                                                                      gamma = gamma,
                                                                      max_depth = max_depth,
                                                                      min_child_weight = min_child_weight ,
                                                                      subsample = subsample,
                                                                      colsample_bytree = colsample_bytree,
                                                                      num_parallel_tree = num_parallel_tree,
                                                                      early_stopping_rounds = 10,
                                                                      eval_metric = "auc",
                                                                      random_state = 0) # nthread!!!

                                                # [4] Train model
                                                model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values,
                                                          eval_set=[(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'val'].exitus.values)],
                                                          verbose = False)
                                                best_iteration = model.best_iteration

                                                # [5] Predict
                                                pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!
                                                pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!

                                                # [6] Compute metric
                                                metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:, 1])
                                                metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:, 1])

                                                # print error
                                                print('AUC train = %.2f - AUC validation = %.2f.'
                                                      % (metric_train, metric_val))

                                                # Save iteration results
                                                grid_results.loc[num_iter]=[n_estimators,
                                                                            learning_rate,
                                                                            gamma,
                                                                            max_depth,
                                                                            min_child_weight,
                                                                            subsample,
                                                                            colsample_bytree,
                                                                            num_parallel_tree,
                                                                            best_iteration,
                                                                            metric_train,
                                                                            metric_val]
                                                num_iter += 1

In [27]:
grid_results_xgb = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, False])
best_model_xgb = grid_results_xgb.iloc[0]
best_model_xgb

n_estimators         1000.000000
learning_rate           1.000000
gamma                  10.000000
max_depth               5.000000
min_child_weight       20.000000
subsample               1.000000
colsample_bytree        0.100000
num_parallel_tree      10.000000
best_iteration          1.000000
metric_train            0.918223
metric_val              0.907949
Name: 39, dtype: float64

In [28]:
# [3] define model
model = model_constructor(n_estimators = int(best_model_xgb.best_iteration),
                          learning_rate = best_model_xgb.learning_rate,
                          gamma = best_model_xgb.gamma,
                          max_depth = int(best_model_xgb.max_depth),
                          min_child_weight = best_model_xgb.min_child_weight,
                          subsample = best_model_xgb.subsample,
                          colsample_bytree = best_model_xgb.colsample_bytree,
                          num_parallel_tree = int(best_model_xgb.num_parallel_tree),
                          random_state = 0)

In [29]:
%%capture
# [4] Train model
model.fit(dat[dat['dataset'] != 'test'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] != 'test'].exitus.values)

In [30]:
%%capture
pred_train_p = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1))
pred_val_p = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1))
pred_test_p = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1))

In [31]:
# Calcular métricas de evaluación
auc_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train_p[:,1])
auc_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val_p[:,1])
auc_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test_p[:,1])

# print error
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % (auc_train, auc_val, auc_test))

Metric train = 0.9164 - Metric val = 0.9324 - Metric test = 0.9211


In [32]:
print('Random Forest Results')
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % (metric_train, metric_val, metric_test))

Random Forest Results
Metric train = 0.8382 - Metric val = 0.8291 - Metric test = 0.9284


# SVM Classification

In [33]:
from sklearn.svm import SVC as model_constructor

In [34]:
import numpy as np

# Get Cherksassky parameters --> This is optional!!!
d = dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1).shape[1]
m = np.mean(dat[dat['dataset']  == 'train'].exitus.values)
s = np.std(dat[dat['dataset']  == 'train'].exitus.values)

C_cherk = np.max([np.abs(m + 3*s),np.abs(m - 3*s)])
gamma_cherk = np.power(0.2, 1/d)

In [40]:
# SVM
C_values = [0.4, 0.5, 0.6]
gamma_values = [0.9, 1]

params_grid = {'C': C_values,
               'gamma': gamma_values}

In [41]:
num_iter = 1
grid_results = pd.DataFrame(columns = ('C',
                                       'gamma',
                                       'metric_train',
                                       'metric_val'))

for C in params_grid['C']:
    for gamma in params_grid['gamma']:

                    # Print trace
                    print('Iteration = ' + str(num_iter))

                    # [3] Define model
                    model = model_constructor(C = C,
                                              gamma = gamma,
                                              probability = True,
                                              random_state = 0) # Probability = True!!!

                    # [4] Train model
                    model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset']  == 'train'].exitus.values)

                    # [5] Predict
                    pred_train = model.predict_proba(dat[dat['dataset']  == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!
                    pred_val = model.predict_proba(dat[dat['dataset']  == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!

                    # [6] Compute metric
                    metric_train = metric(dat[dat['dataset']  == 'train'].exitus.values, pred_train[:,1])
                    metric_val = metric(dat[dat['dataset']  == 'val'].exitus.values, pred_val[:,1])

                    # print error
                    print('AUC train = %.2f - AUC validation = %.2f.'
                          % (metric_train, metric_val))

                    # Save iteration results
                    grid_results.loc[num_iter]=[C,
                                                gamma,
                                                metric_train,
                                                metric_val]
                    num_iter += 1

Iteration = 1
AUC train = 0.00 - AUC validation = 0.43.
Iteration = 2
AUC train = 0.00 - AUC validation = 0.43.
Iteration = 3
AUC train = 0.00 - AUC validation = 0.43.
Iteration = 4
AUC train = 0.00 - AUC validation = 0.43.
Iteration = 5
AUC train = 0.00 - AUC validation = 0.43.
Iteration = 6
AUC train = 0.00 - AUC validation = 0.43.


In [37]:
grid_results_svm = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, False])
best_model_svm = grid_results_svm.iloc[0]
best_model_svm

C               0.594870
gamma           0.929456
metric_train    0.000000
metric_val      0.429601
Name: 4, dtype: float64

In [38]:
# [3] Define model
model = model_constructor(C = best_model_svm.C,
                           gamma = best_model_svm.gamma,
                           probability = True,
                           random_state = 0) # Probability = True!!!

# [4] Train model
model.fit(dat[dat['dataset'] != 'test'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] != 'test'].exitus.values)


# [5] Predict
pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test[:,1])

# print error
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 1.00 - Metric val = 1.00 - Metric test = 0.67
