In [2]:
# Load libraries
import numpy as np
import pandas as pd
from timeit import default_timer

from sklearn.model_selection import train_test_split

In [65]:
dat = pd.read_csv('../data/dataset_mock_midterm.csv', sep = ",")

In [3]:
dat

Unnamed: 0,date,severity,mortality_ratio,age,num_proc,ambulatory,origin,expected_length,tip_grd,tip_adm,exitus,dataset
0,2016-01,,0.408730,12596.0,21.0,0.0,,151.0,Q,1.0,0,train
1,2016-01,,0.306931,20973.0,22.0,,,99.0,Q,1.0,0,train
2,2016-01,4.0,0.278481,19611.0,19.0,,,87.0,,1.0,0,train
3,2016-01,3.0,0.150289,13583.0,22.0,,,100.0,Q,,0,train
4,2016-01,1.0,0.016573,18042.0,2.0,,,44.0,Q,1.0,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...
32701,2016-12,2.0,0.028365,23619.0,2.0,,,2.0,,1.0,0,test
32702,2016-12,1.0,0.000606,3935.0,1.0,,1.0,2.0,M,1.0,0,test
32703,2016-12,,0.040452,30163.0,4.0,,,2.0,M,,0,test
32704,2016-12,,0.000000,29012.0,4.0,,,0.0,,1.0,0,test


In [66]:
dat.drop('date', axis = 1, inplace = True)

In [67]:
cat_var = ['severity', 'ambulatory', 'origin', 'tip_grd', 'tip_adm']
non_cat_var = list(set(dat.columns) - set(cat_var))
num_var = list(set(dat.columns) - set(cat_var) - {'dataset', 'exitus'})

In [6]:
dat.isna().any()

severity            True
mortality_ratio     True
age                 True
num_proc            True
ambulatory          True
origin              True
expected_length     True
tip_grd             True
tip_adm             True
exitus             False
dataset            False
dtype: bool

In [68]:
from fancyimpute import IterativeImputer as MICE
# 3) Define "model"
model = MICE()

# 4) Train "model"
model.fit(dat[num_var][dat['dataset'] == 'train'])

# 5) "Predict"
dat[num_var] = model.transform(dat[num_var])
dat.isna().any()

severity            True
mortality_ratio    False
age                False
num_proc           False
ambulatory          True
origin              True
expected_length    False
tip_grd             True
tip_adm             True
exitus             False
dataset            False
dtype: bool

In [69]:
dat[cat_var] = dat[cat_var].astype('str')

In [70]:
dat.loc[dat['dataset'] == 'train', cat_var] = dat.loc[dat['dataset'] == 'train', cat_var].fillna('UNKNOWN')
dat[cat_var][dat['dataset'] == 'train'].isna().sum()

severity      0
ambulatory    0
origin        0
tip_grd       0
tip_adm       0
dtype: int64

In [71]:
dat.isna().any()

severity           False
mortality_ratio    False
age                False
num_proc           False
ambulatory         False
origin             False
expected_length    False
tip_grd            False
tip_adm            False
exitus             False
dataset            False
dtype: bool

In [72]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output = False, drop='first')

# 4) Training model
ohe.fit(dat[cat_var][dat['dataset'] == 'train'])

# 5) Predicting
dat_ohe = pd.DataFrame(ohe.fit_transform(dat[cat_var]))

# Optional
dat_ohe.columns = ohe.get_feature_names_out()
dat = pd.concat((dat[non_cat_var], dat_ohe), axis=1)

# Model 1 Decision Tree

In [73]:
from sklearn.tree import DecisionTreeClassifier as model_constructor_1
from sklearn.metrics import roc_auc_score as metric

In [74]:
criterion_values = ['gini', 'entropy']
max_depth_values = [5, 6, 7]
min_samples_split_values = [10, 20, 30]
min_samples_leaf_values = [29, 30, 31]
max_features_values = [None, 1, 2]

params_grid = {  'criterion': criterion_values,
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values}

In [75]:
n = len(params_grid['max_depth'])*len(params_grid['min_samples_split'])*len(params_grid['min_samples_leaf'])*len(params_grid['max_features'])*len(params_grid['criterion'])
print(str(n)+ ' iterations of Decision Tree')

162 iterations of Decision Tree


In [76]:
num_iter = 1
grid_results = pd.DataFrame(columns = ('criterion',
                                       'max_depth',
                                       'min_samples_split',
                                       'min_samples_leaf',
                                       'max_features',
                                       'auc_train',
                                       'auc_val',
                                       'time'))

for criterion in params_grid['criterion']:
    for max_depth in params_grid['max_depth']:
        for min_samples_split in params_grid['min_samples_split']:
            for min_samples_leaf in params_grid['min_samples_leaf']:
                for max_features in params_grid['max_features']:


                    # Start time
                    start_time = default_timer()

                    # Print trace
                    print('Iteracion = ' + str(num_iter))

                    # [3] Define model
                    model = model_constructor_1(criterion = criterion,
                                              max_depth = max_depth,
                                              min_samples_split = min_samples_split,
                                              min_samples_leaf = min_samples_leaf,
                                              max_features = max_features,
                                              random_state = 0)

                    # [4] Train model
                    model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1),
                                  dat[dat['dataset'] == 'train'].exitus.values)

                    # [5] Predict
                    pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!
                    pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!

                    # [6] Evaluate
                    metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
                    metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])

                    # Computational time
                    time = default_timer() - start_time

                    # print error
                    print('AUC train = %.2f - AUC validation = %.2f. Time spend = %.2f.'
                          % (metric_train, metric_val, time))

                    # Save iteration results
                    grid_results.loc[num_iter]=[criterion,
                                                max_depth,
                                                min_samples_split,
                                                min_samples_leaf,
                                                max_features,
                                             metric_train,
                                             metric_val,
                                            time]
                    num_iter += 1

print('Grid Search Total Computational Time: ', np.sum(grid_results.time.values))

Iteracion = 1
AUC train = 0.94 - AUC validation = 0.93. Time spend = 0.10.
Iteracion = 2
AUC train = 0.76 - AUC validation = 0.73. Time spend = 0.04.
Iteracion = 3
AUC train = 0.83 - AUC validation = 0.81. Time spend = 0.04.
Iteracion = 4
AUC train = 0.94 - AUC validation = 0.93. Time spend = 0.08.
Iteracion = 5
AUC train = 0.76 - AUC validation = 0.73. Time spend = 0.04.
Iteracion = 6
AUC train = 0.83 - AUC validation = 0.81. Time spend = 0.04.
Iteracion = 7
AUC train = 0.94 - AUC validation = 0.93. Time spend = 0.08.
Iteracion = 8
AUC train = 0.76 - AUC validation = 0.73. Time spend = 0.04.
Iteracion = 9
AUC train = 0.83 - AUC validation = 0.81. Time spend = 0.04.
Iteracion = 10
AUC train = 0.94 - AUC validation = 0.93. Time spend = 0.10.
Iteracion = 11
AUC train = 0.76 - AUC validation = 0.73. Time spend = 0.04.
Iteracion = 12
AUC train = 0.83 - AUC validation = 0.81. Time spend = 0.04.
Iteracion = 13
AUC train = 0.94 - AUC validation = 0.93. Time spend = 0.08.
Iteracion = 14
AUC tr

In [77]:
grid_results = grid_results.sort_values(by = ['auc_val', 'auc_train', 'time'], ascending = [False, False, True])
grid_results

Unnamed: 0,criterion,max_depth,min_samples_split,min_samples_leaf,max_features,auc_train,auc_val,time
49,gini,6,30,30,,0.946581,0.935792,0.083802
40,gini,6,20,30,,0.946581,0.935792,0.084012
31,gini,6,10,30,,0.946581,0.935792,0.085513
43,gini,6,20,31,,0.946536,0.935474,0.083038
52,gini,6,30,31,,0.946536,0.935474,0.083105
...,...,...,...,...,...,...,...,...
32,gini,6,10,30,1,0.705386,0.681555,0.036915
38,gini,6,20,29,1,0.705386,0.681555,0.037149
35,gini,6,10,31,1,0.705386,0.681555,0.037361
50,gini,6,30,30,1,0.705386,0.681555,0.038527


In [78]:
best_model = grid_results.iloc[0]
best_model

criterion                gini
max_depth                   6
min_samples_split          30
min_samples_leaf           30
max_features             None
auc_train            0.946581
auc_val              0.935792
time                 0.083802
Name: 49, dtype: object

In [79]:
model  = model_constructor_1(criterion = best_model['criterion'],
                                              max_depth = best_model['max_depth'],
                                              min_samples_split = best_model['min_samples_split'],
                                              min_samples_leaf = best_model['min_samples_leaf'],
                                              max_features = best_model['max_features'],
                                              random_state = 0)

In [80]:
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)

In [81]:
# [5] Predict
pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test[:,1])

In [82]:
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 0.95 - Metric val = 0.94 - Metric test = 0.93


## Applying Oversampling and Subsampling

In [83]:
100*dat.groupby(['exitus'])['exitus'].agg(['count'])/dat.shape[0]

Unnamed: 0_level_0,count
exitus,Unnamed: 1_level_1
0,96.159726
1,3.840274


In [109]:
import numpy as np

train_data = dat[dat['dataset'] == 'train']
test_data = dat[dat['dataset'] == 'test']
val_data = dat[dat['dataset'] == 'val']

# Determine the minority class in the training data
minority_class_len = min(train_data['exitus'].value_counts().tolist())

# Subsample the majority class in the training data
majority_class_indices = train_data[train_data['exitus'] == 0].index
random_major_indices = np.random.choice(majority_class_indices, minority_class_len, replace=False)
random_major_indices = np.array(random_major_indices)

# Get the indices of the minority class in the training data
minority_class_indices = train_data[train_data['exitus'] == 1].index

# Concatenate the indices to create a balanced training dataset
under_sample_indices = np.concatenate([minority_class_indices, random_major_indices])
under_sample_train_data = train_data.loc[under_sample_indices]

# Now, let's concatenate train, validation, and test to create data_new
data_new = pd.concat([under_sample_train_data, val_data, test_data], axis=0)

In [85]:
data_new

Unnamed: 0,exitus,expected_length,mortality_ratio,dataset,age,num_proc,severity_2.0,severity_3.0,severity_4.0,severity_nan,...,origin_4.0,origin_6.0,origin_8.0,origin_9.0,origin_nan,tip_grd_Q,tip_grd_nan,tip_adm_2.0,tip_adm_3.0,tip_adm_nan
7,1,40.0,0.074278,train,21685.0,7.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
10,1,31.0,0.484536,train,31612.0,9.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13,1,14.0,0.231884,train,24755.0,14.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
40,1,31.0,0.250000,train,17226.0,19.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
42,1,10.0,0.090515,train,28992.0,9.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32701,0,2.0,0.028365,test,23619.0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
32702,0,2.0,0.000606,test,3935.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32703,0,2.0,0.040452,test,30163.0,4.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
32704,0,0.0,0.000000,test,29012.0,4.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [110]:
100*data_new.groupby(['exitus'])['exitus'].agg(['count'])/data_new.shape[0]

Unnamed: 0_level_0,count
exitus,Unnamed: 1_level_1
0,89.155586
1,10.844414


In [116]:
# Oversampling
from imblearn.over_sampling import SMOTE
# Initializing SMOTE object
sm = SMOTE(sampling_strategy ={1: 80},
           random_state = 0,
           k_neighbors = 5)

# Performing oversampling
X_res, y_res = sm.fit_resample(data_new.drop(['exitus', 'dataset'], axis = 1), data_new['exitus'])

X_res['exitus'] = y_res

X_res['dataset'] = 'train'


# Create new dataset after SMOTE.

dat_smote = pd.concat([X_res, data_new[data_new['dataset'] == 'val'], data_new[data_new['dataset'] == 'test']])

# Checking the class distribution after SMOTE
100 * dat_smote.groupby(['exitus'])['exitus'].agg(['count']) / dat_smote.shape[0]

ValueError: With over-sampling methods, the number of samples in a class should be greater or equal to the original number of samples. Originally, there is 1256 samples and 80 samples are asked.

In [45]:
criterion_values = ['gini', 'entropy']
max_depth_values = [5, 6, 7]
min_samples_split_values = [10, 20, 30]
min_samples_leaf_values = [29, 30, 31]
max_features_values = [None, 1, 2]

params_grid = {  'criterion': criterion_values,
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values}

In [46]:
num_iter = 1
grid_results = pd.DataFrame(columns = ('criterion',
                                       'max_depth',
                                       'min_samples_split',
                                       'min_samples_leaf',
                                       'max_features',
                                       'auc_train',
                                       'auc_val',
                                       'time'))

for criterion in params_grid['criterion']:
    for max_depth in params_grid['max_depth']:
        for min_samples_split in params_grid['min_samples_split']:
            for min_samples_leaf in params_grid['min_samples_leaf']:
                for max_features in params_grid['max_features']:


                    # Start time
                    start_time = default_timer()

                    # Print trace
                    print('Iteracion = ' + str(num_iter))

                    # [3] Define model
                    model = model_constructor_1(criterion = criterion,
                                              max_depth = max_depth,
                                              min_samples_split = min_samples_split,
                                              min_samples_leaf = min_samples_leaf,
                                              max_features = max_features,
                                              random_state = 0)

                    # [4] Train model
                    model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1),
                                  dat[dat['dataset'] == 'train'].exitus.values)

                    # [5] Predict
                    pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!
                    pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!

                    # [6] Evaluate
                    metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
                    metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])

                    # Computational time
                    time = default_timer() - start_time

                    # print error
                    print('AUC train = %.2f - AUC validation = %.2f. Time spend = %.2f.'
                          % (metric_train, metric_val, time))

                    # Save iteration results
                    grid_results.loc[num_iter]=[criterion,
                                                max_depth,
                                                min_samples_split,
                                                min_samples_leaf,
                                                max_features,
                                             metric_train,
                                             metric_val,
                                            time]
                    num_iter += 1

print('Grid Search Total Computational Time: ', np.sum(grid_results.time.values))

Iteracion = 1


KeyError: 'dataset'

In [38]:
grid_results = grid_results.sort_values(by = ['auc_val', 'auc_train', 'time'], ascending = [False, False, True])
grid_results

Unnamed: 0,criterion,max_depth,min_samples_split,min_samples_leaf,max_features,auc_train,auc_val,time
40,gini,6,20,30,,0.946581,0.935792,0.082877
31,gini,6,10,30,,0.946581,0.935792,0.084940
49,gini,6,30,30,,0.946581,0.935792,0.086850
34,gini,6,10,31,,0.946536,0.935474,0.083728
43,gini,6,20,31,,0.946536,0.935474,0.085062
...,...,...,...,...,...,...,...,...
41,gini,6,20,30,1,0.705386,0.681555,0.034423
50,gini,6,30,30,1,0.705386,0.681555,0.034950
47,gini,6,30,29,1,0.705386,0.681555,0.035046
53,gini,6,30,31,1,0.705386,0.681555,0.035978


In [39]:
best_model = grid_results.iloc[0]
best_model

criterion                gini
max_depth                   6
min_samples_split          20
min_samples_leaf           30
max_features             None
auc_train            0.946581
auc_val              0.935792
time                 0.082877
Name: 40, dtype: object

In [40]:
model  = model_constructor_1(criterion = best_model['criterion'],
                                              max_depth = best_model['max_depth'],
                                              min_samples_split = best_model['min_samples_split'],
                                              min_samples_leaf = best_model['min_samples_leaf'],
                                              max_features = best_model['max_features'],
                                              random_state = 0)

In [41]:
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)

In [42]:
# [5] Predict
pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test[:,1])

In [43]:
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 0.95 - Metric val = 0.94 - Metric test = 0.93


# Model 2 Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier as model_constructor_2

In [23]:
# Random Forest
n_estimators_values = [10, 100, 1000]
max_features_values = [2, 5, 10]
max_samples_values = [100, 1000, dat[dat['dataset'] == 'train'].shape[0]]

params_grid = {'max_features': max_features_values,
              'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}

In [24]:
num_iter = 1
grid_results = pd.DataFrame(columns = ('max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'))

for max_features in params_grid['max_features']:
    for n_estimators in params_grid['n_estimators']:
        for max_samples in params_grid['max_samples']:

                        # Print trace
                        print('Iteracion = ' + str(num_iter))

                        # [3] Define model
                        model = model_constructor_2(max_features = max_features,
                                                  n_estimators = n_estimators,
                                                  max_samples = max_samples,
                                                  random_state = 0)

                        # [4] Train model
                        model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1),
                                  dat[dat['dataset'] == 'train'].exitus.values)


                        # [5] Predict
                        pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
                        pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!

                        # [6] Compute metric
                        metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
                        metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])

                        # print error
                        print('Metric train = %.2f - Metric validation = %.2f.'
                              % (metric_train, metric_val))

                        # Save iteration results
                        grid_results.loc[num_iter]=[ max_features,
                                                    n_estimators,
                                                    max_samples,
                                                 metric_train,
                                                 metric_val]
                        num_iter += 1



Iteracion = 1
Metric train = 0.84 - Metric validation = 0.82.
Iteracion = 2
Metric train = 0.90 - Metric validation = 0.85.
Iteracion = 3
Metric train = 1.00 - Metric validation = 0.84.
Iteracion = 4
Metric train = 0.93 - Metric validation = 0.93.
Iteracion = 5
Metric train = 0.96 - Metric validation = 0.93.
Iteracion = 6
Metric train = 1.00 - Metric validation = 0.92.
Iteracion = 7
Metric train = 0.94 - Metric validation = 0.93.
Iteracion = 8
Metric train = 0.97 - Metric validation = 0.94.
Iteracion = 9
Metric train = 1.00 - Metric validation = 0.93.
Iteracion = 10
Metric train = 0.86 - Metric validation = 0.85.
Iteracion = 11
Metric train = 0.90 - Metric validation = 0.86.
Iteracion = 12
Metric train = 1.00 - Metric validation = 0.86.
Iteracion = 13
Metric train = 0.93 - Metric validation = 0.92.
Iteracion = 14
Metric train = 0.96 - Metric validation = 0.94.
Iteracion = 15
Metric train = 1.00 - Metric validation = 0.92.
Iteracion = 16
Metric train = 0.94 - Metric validation = 0.93.
I

In [25]:
grid_results = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, False])
best_model = grid_results.iloc[0]
best_model

max_features      10.000000
n_estimators    1000.000000
max_samples     1000.000000
metric_train       0.961979
metric_val         0.937384
Name: 26, dtype: float64

In [26]:
# Extract variable importance (tree-based)
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)
var_imp = pd.DataFrame({'var': dat.drop(['exitus', 'dataset'], axis = 1).columns, 'imp': model.feature_importances_})
var_imp.sort_values(['imp'], ascending = False, inplace = True)
var_imp

Unnamed: 0,var,imp
1,mortality_ratio,0.3213752
0,age,0.2589571
3,expected_length,0.1476638
2,num_proc,0.1153729
18,tip_grd_nan,0.02751719
16,origin_nan,0.02720553
21,tip_adm_nan,0.0216028
6,severity_4.0,0.01794256
7,severity_nan,0.0148982
5,severity_3.0,0.01338546


In [27]:
dat = dat[['mortality_ratio', 'age', 'expected_length', 'num_proc', 'exitus', 'dataset']] # This is just a fake example of how to select the most important variables.

In [28]:
# [4] Train model
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)


# [5] Predict
pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test[:,1])

In [29]:
# print error
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 1.00 - Metric val = 0.92 - Metric test = 0.93


# Model 3 XGBoost

In [30]:
from xgboost import XGBClassifier as model_constructor_3

model = model_constructor_3(early_stopping_rounds=10,
                            n_estimators=1000,
                            eval_metric="auc",
                            random_state=1)

In [31]:
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1),
          np.array(dat[dat['dataset'] == 'train'].exitus.values),
          eval_set=[(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'val'].exitus.values)], 
          verbose=True,
          )

[0]	validation_0-auc:0.92578
[1]	validation_0-auc:0.92765
[2]	validation_0-auc:0.92970
[3]	validation_0-auc:0.93286
[4]	validation_0-auc:0.93248
[5]	validation_0-auc:0.93186
[6]	validation_0-auc:0.93142
[7]	validation_0-auc:0.93240
[8]	validation_0-auc:0.93261
[9]	validation_0-auc:0.93175
[10]	validation_0-auc:0.93247
[11]	validation_0-auc:0.93232
[12]	validation_0-auc:0.93241


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [32]:
pred_train_p = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1))
pred_val_p = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1))
pred_test_p = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1))
# Calcular métricas de evaluación
auc_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train_p[:,1])
auc_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val_p[:,1])
auc_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test_p[:,1])
results = pd.DataFrame()

new_data = pd.DataFrame(data={'model': ['XGBoost (Default)'], 'auc_train': [auc_train], 'auc_val': [auc_val], 'auc_test': [auc_test]}, columns=['model', 'auc_train', 'auc_val', 'auc_test'])

results = pd.concat([results, new_data], ignore_index=True)

results

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0,model,auc_train,auc_val,auc_test
0,XGBoost (Default),0.954507,0.932856,0.934233


# Model 4 SVM

In [33]:
from sklearn.svm import SVC as model_constructor_4

In [34]:
d = dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1).shape[1]
m = np.mean(dat[dat['dataset'] == 'train'].exitus.values)
s = np.std(dat[dat['dataset'] == 'train'].exitus.values)

In [35]:
C_cherk = np.max([np.abs(m + 3*s),np.abs(m - 3*s)])
gamma_cherk = np.power(0.2, 1/d)

In [36]:
# SVM
C_values = [0.1, 1, 10]
gamma_values = [0.01, 1, 100]

params_grid = {'C': C_values,
               'gamma': gamma_values}

In [37]:
num_iter = 1
grid_results = pd.DataFrame(columns = ('C',
                                       'gamma',
                                       'auc_train',
                                       'auc_val',
                                       'time'))

for C in params_grid['C']:
    for gamma in params_grid['gamma']:

                    # Start time
                    start_time = default_timer()

                    # Print trace
                    print('Iteracion = ' + str(num_iter))

                    # [3] Define model
                    model = model_constructor_4(C = C,
                                              gamma = gamma,
                                              probability = True,
                                              random_state = 0) # Probability = True!!!

                    # [4] Train model
                    model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)

                    # [5] Predict
                    pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!
                    pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict_proba!

                    # [6] Compute metric
                    metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,0])
                    metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,0])

                    # Computational time
                    time = default_timer() - start_time

                    # print error
                    print('AUC train = %.2f - AUC validation = %.2f. Time spend = %.2f.'
                          % (metric_train, metric_val, time))

                    # Save iteration results
                    grid_results.loc[num_iter]=[C,
                                                gamma,
                                                metric_train,
                                                metric_val,
                                                time]
                    num_iter += 1

print('Grid Search Total Computational Time: ', np.sum(grid_results.time.values))

Iteracion = 1
AUC train = 0.03 - AUC validation = 0.45. Time spend = 171.62.
Iteracion = 2
AUC train = 0.00 - AUC validation = 0.33. Time spend = 532.83.
Iteracion = 3
AUC train = 0.00 - AUC validation = 0.49. Time spend = 594.49.
Iteracion = 4
AUC train = 0.03 - AUC validation = 0.45. Time spend = 300.82.
Iteracion = 5
AUC train = 0.00 - AUC validation = 0.32. Time spend = 714.31.
Iteracion = 6
AUC train = 0.00 - AUC validation = 0.49. Time spend = 784.88.
Iteracion = 7
AUC train = 0.98 - AUC validation = 0.51. Time spend = 283.18.
Iteracion = 8
AUC train = 0.00 - AUC validation = 0.32. Time spend = 747.88.
Iteracion = 9
AUC train = 0.00 - AUC validation = 0.49. Time spend = 757.84.
Grid Search Total Computational Time:  4887.841272299993


In [38]:
grid_results = grid_results.sort_values(by = ['auc_val', 'auc_train', 'time'], ascending = [False, False, True])
grid_results

Unnamed: 0,C,gamma,auc_train,auc_val,time
7,10.0,0.01,0.980731,0.508097,283.183016
9,10.0,100.0,0.0,0.488844,757.837903
6,1.0,100.0,0.0,0.48881,784.881326
3,0.1,100.0,0.0,0.488769,594.486062
4,1.0,0.01,0.031175,0.453393,300.823236
1,0.1,0.01,0.034346,0.452615,171.616194
2,0.1,1.0,0.0,0.326914,532.830954
5,1.0,1.0,0.0,0.324975,714.305631
8,10.0,1.0,0.0,0.324802,747.87695


In [39]:
best_model = grid_results.iloc[0]
best_model

C             10.000000
gamma          0.010000
auc_train      0.980731
auc_val        0.508097
time         283.183016
Name: 7, dtype: float64

In [40]:
print('Old train data size = ' + str(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1).shape))
print('Old train target size = ' + str(dat[dat['dataset'] == 'train'].exitus.values.shape))

# Combine train and validación
X_train = np.concatenate((dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)), axis = 0)
y_train = np.concatenate((dat[dat['dataset'] == 'train'].exitus.values, dat[dat['dataset'] == 'val'].exitus.values), axis = 0)

print('New train data size = ' + str(X_train.shape))
print('New train target size = ' + str(y_train.shape))

Old train data size = (22894, 4)
Old train target size = (22894,)
New train data size = (27799, 4)
New train target size = (27799,)


In [42]:
# [3] Define model
model = model_constructor_4(C = best_model.C,
                          gamma = best_model.gamma,
                          probability = True,
                          random_state = 0) # probability = True!!!

# [4] Train model
model.fit(X_train, y_train)

# [5] Predict
pred_train = model.predict_proba(X_train)
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1))

# [6] Compute metric
metric_train = metric(y_train, pred_train, multi_class = 'ovo')
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test,multi_class = 'ovo')



ValueError: y should be a 1d array, got an array of shape (27799, 2) instead.

In [None]:
# print error
print('AUC train = %.2f - AUC test = %.2f'
      % (metric_train, metric_test))