In [132]:
# Load libraries
import numpy as np
import pandas as pd
import io

#  Mock Midterm Exercise

In this exercise you will have to implement code in the sections inside *Fill with Your Code* (*Load Data*, *Data Preprocessing* and *Create Model*) create a model to predict the column *exitus* in the dataset given. The dataset is already split into train, validation, and test subsets. To see to which subset belong each observation, you need to check the *dataset* column.

The code that is already written in this notebook **CANNOT BE CHANGED**. You can only add code in the *Fill with Your Code* section.

You must achieve in the last cell of this notebook an **AUC over test of at least 0.93**.

# Fill With Your Code

### [1] Load Data

In [133]:
dat = pd.read_csv('data/dat.csv', sep = ",")

In [134]:
dat

Unnamed: 0,date,severity,mortality_ratio,age,num_proc,ambulatory,origin,expected_length,tip_grd,tip_adm,exitus,dataset
0,2016-01,,0.408730,12596.0,21.0,0.0,,151.0,Q,1.0,0,train
1,2016-01,,0.306931,20973.0,22.0,,,99.0,Q,1.0,0,train
2,2016-01,4.0,0.278481,19611.0,19.0,,,87.0,,1.0,0,train
3,2016-01,3.0,0.150289,13583.0,22.0,,,100.0,Q,,0,train
4,2016-01,1.0,0.016573,18042.0,2.0,,,44.0,Q,1.0,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...
32701,2016-12,2.0,0.028365,23619.0,2.0,,,2.0,,1.0,0,test
32702,2016-12,1.0,0.000606,3935.0,1.0,,1.0,2.0,M,1.0,0,test
32703,2016-12,,0.040452,30163.0,4.0,,,2.0,M,,0,test
32704,2016-12,,0.000000,29012.0,4.0,,,0.0,,1.0,0,test


### [2]  Data Preprocessing

In [135]:
dat['severity'].value_counts()

severity
1.0    7721
2.0    1912
3.0    1767
4.0     672
Name: count, dtype: int64

In [136]:
dat['origin'].value_counts()

origin
1.0    9095
2.0    2268
4.0     230
8.0     208
3.0     185
6.0     165
9.0      95
Name: count, dtype: int64

In [137]:
categorical_variables = ['severity', 'ambulatory', 'origin', 'tip_grd', 'tip_adm']
non_categorical_variables = list(set(dat.columns) - set(categorical_variables))
numerical_variables = list(set(dat.columns) - set(categorical_variables) - {'dataset', 'date', 'exitus'})
print(categorical_variables)
print(numerical_variables)

['severity', 'ambulatory', 'origin', 'tip_grd', 'tip_adm']
['mortality_ratio', 'age', 'num_proc', 'expected_length']


In [138]:
# Missing values
dat.apply(lambda x: 100*np.sum(x.isna())/len(x))

# The percentage is high, so I am not going to remove the rows

date                0.000000
severity           63.089341
mortality_ratio     6.934507
age                 1.296398
num_proc            4.127683
ambulatory         94.010273
origin             62.557329
expected_length     5.726778
tip_grd            41.631505
tip_adm            18.617379
exitus              0.000000
dataset             0.000000
dtype: float64

In [139]:
dat[categorical_variables] = dat[categorical_variables].fillna('UNKNOWN')

In [140]:
dat.apply(lambda x: 100*np.sum(x.isna())/len(x))

date               0.000000
severity           0.000000
mortality_ratio    6.934507
age                1.296398
num_proc           4.127683
ambulatory         0.000000
origin             0.000000
expected_length    5.726778
tip_grd            0.000000
tip_adm            0.000000
exitus             0.000000
dataset            0.000000
dtype: float64

In [141]:
dat[numerical_variables] = dat[numerical_variables].apply(lambda x: x.fillna(x.median()), axis=0)

In [142]:
dat.apply(lambda x: 100*np.sum(x.isna())/len(x))

date               0.0
severity           0.0
mortality_ratio    0.0
age                0.0
num_proc           0.0
ambulatory         0.0
origin             0.0
expected_length    0.0
tip_grd            0.0
tip_adm            0.0
exitus             0.0
dataset            0.0
dtype: float64

In [143]:
dat[categorical_variables] = dat[categorical_variables].astype(str)

In [144]:
from sklearn.preprocessing import OneHotEncoder
# 3 defining the model
ohe = OneHotEncoder(sparse_output = False, drop = 'first')

# 4) Training model
ohe.fit(dat[categorical_variables])

# 5) Predicting
dat_ohe = pd.DataFrame(ohe.fit_transform(dat[categorical_variables]))

# Optional
dat_ohe.columns = ohe.get_feature_names_out()

# Combine numerical and categorical
dat = pd.concat((dat[non_categorical_variables], dat_ohe), axis=1)

In [145]:
dat

Unnamed: 0,age,dataset,exitus,date,expected_length,num_proc,mortality_ratio,severity_2.0,severity_3.0,severity_4.0,...,origin_4.0,origin_6.0,origin_8.0,origin_9.0,origin_UNKNOWN,tip_grd_Q,tip_grd_UNKNOWN,tip_adm_2.0,tip_adm_3.0,tip_adm_UNKNOWN
0,12596.0,train,0,2016-01,151.0,21.0,0.408730,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,20973.0,train,0,2016-01,99.0,22.0,0.306931,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,19611.0,train,0,2016-01,87.0,19.0,0.278481,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,13583.0,train,0,2016-01,100.0,22.0,0.150289,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,18042.0,train,0,2016-01,44.0,2.0,0.016573,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32701,23619.0,test,0,2016-12,2.0,2.0,0.028365,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
32702,3935.0,test,0,2016-12,2.0,1.0,0.000606,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32703,30163.0,test,0,2016-12,2.0,4.0,0.040452,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
32704,29012.0,test,0,2016-12,0.0,4.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [146]:
100*dat.groupby(['exitus'])['exitus'].agg(['count'])/dat.shape[0]

Unnamed: 0_level_0,count
exitus,Unnamed: 1_level_1
0,96.159726
1,3.840274


In [147]:
# Oversampling
from imblearn.over_sampling import SMOTE
# Initializing SMOTE object
sm = SMOTE(sampling_strategy = 1.0,
           random_state = 0,
           k_neighbors = 5)

# Performing oversampling
X_res, y_res = sm.fit_resample(dat.drop(['exitus', 'date', 'dataset'], axis = 1), dat['exitus'])

# Concatenating the oversampled features and labels into a new DataFrame
dat = pd.concat([X_res, y_res, dat['dataset'], dat['date']], axis = 1)

# Checking the class distribution after SMOTE
100 * dat.groupby(['exitus'])['exitus'].agg(['count']) / dat.shape[0]

# only with train

Unnamed: 0_level_0,count
exitus,Unnamed: 1_level_1
0,50.0
1,50.0


In [148]:
dat

Unnamed: 0,age,expected_length,num_proc,mortality_ratio,severity_2.0,severity_3.0,severity_4.0,severity_UNKNOWN,ambulatory_1.0,ambulatory_UNKNOWN,...,origin_9.0,origin_UNKNOWN,tip_grd_Q,tip_grd_UNKNOWN,tip_adm_2.0,tip_adm_3.0,tip_adm_UNKNOWN,exitus,dataset,date
0,12596.000000,151.000000,21.000000,0.408730,0.000000,0.000000,0.000000,1.000000,0.0,0.0,...,0.0,1.000000,1.000000,0.000000,0.0,0.0,0.000000,0,train,2016-01
1,20973.000000,99.000000,22.000000,0.306931,0.000000,0.000000,0.000000,1.000000,0.0,1.0,...,0.0,1.000000,1.000000,0.000000,0.0,0.0,0.000000,0,train,2016-01
2,19611.000000,87.000000,19.000000,0.278481,0.000000,0.000000,1.000000,0.000000,0.0,1.0,...,0.0,1.000000,0.000000,1.000000,0.0,0.0,0.000000,0,train,2016-01
3,13583.000000,100.000000,22.000000,0.150289,0.000000,1.000000,0.000000,0.000000,0.0,1.0,...,0.0,1.000000,1.000000,0.000000,0.0,0.0,1.000000,0,train,2016-01
4,18042.000000,44.000000,2.000000,0.016573,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.0,1.000000,1.000000,0.000000,0.0,0.0,0.000000,0,train,2016-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62895,22177.756113,13.554858,8.621943,0.091532,0.770324,0.229676,0.000000,0.000000,0.0,1.0,...,0.0,0.229676,0.000000,0.229676,0.0,0.0,0.000000,1,,
62896,25076.828341,5.935628,5.978543,0.532906,0.000000,0.000000,0.978543,0.021457,0.0,1.0,...,0.0,0.021457,0.000000,0.000000,0.0,0.0,0.978543,1,,
62897,31820.349352,13.280959,7.116451,0.318403,0.000000,0.116451,0.883549,0.000000,0.0,1.0,...,0.0,1.000000,0.883549,0.116451,0.0,0.0,0.116451,1,,
62898,33433.609957,7.804978,4.451245,0.041255,0.000000,0.451245,0.000000,0.548755,0.0,1.0,...,0.0,1.000000,0.000000,0.000000,0.0,0.0,0.548755,1,,


In [149]:
y = dat['exitus']
X = dat.drop(['exitus'], axis=1)

In [150]:
dat['dataset'].value_counts()

dataset
train    22894
test      4907
val       4905
Name: count, dtype: int64

In [151]:
X_val = dat[dat['dataset'] == 'val'].drop(['dataset', 'date'], axis=1)
y_val = dat[dat['dataset'] == 'val']['exitus']
X_train = dat[dat['dataset'] == 'train'].drop(['dataset', 'date'], axis=1)
y_train = dat[dat['dataset'] == 'train']['exitus']
X_test = dat[dat['dataset'] == 'test'].drop(['dataset', 'date'], axis=1)
y_test = dat[dat['dataset'] == 'test']['exitus']

### [3] Create Model

In [152]:
from sklearn.ensemble import RandomForestClassifier as model_constructor
from sklearn.metrics import roc_auc_score as metric

# Random Forest
n_estimators_values = [10, 100, 1000]
max_features_values = [2, 5, 10]
max_samples_values = [100, 1000, X_train.shape[0]]

params_grid = {'max_features': max_features_values,
              'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}

num_iter = 1
grid_results = pd.DataFrame(columns = ('max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'))

for max_features in params_grid['max_features']:
    for n_estimators in params_grid['n_estimators']:
        for max_samples in params_grid['max_samples']:

                        # Print trace
                        print('Iteracion = ' + str(num_iter))

                        # [3] Define model
                        model = model_constructor(max_features = max_features,
                                                  n_estimators = n_estimators,
                                                  max_samples = max_samples,
                                                  random_state = 0)

                        # [4] Train model
                        model.fit(X_train, y_train)


                        # [5] Predict
                        pred_train = model.predict(X_train) # predict!
                        pred_val = model.predict(X_val) # predict!

                        # [6] Compute metric
                        metric_train = metric(y_train, pred_train)
                        metric_val = metric(y_val, pred_val)

                        # print error
                        print('Metric train = %.2f - Metric validation = %.2f.'
                              % (metric_train, metric_val))

                        # Save iteration results
                        grid_results.loc[num_iter]=[ max_features,
                                                    n_estimators,
                                                    max_samples,
                                                 metric_train,
                                                 metric_val]
                        num_iter += 1


Iteracion = 1
Metric train = 0.63 - Metric validation = 0.65.
Iteracion = 2
Metric train = 0.96 - Metric validation = 0.95.
Iteracion = 3
Metric train = 1.00 - Metric validation = 0.99.
Iteracion = 4
Metric train = 0.67 - Metric validation = 0.64.
Iteracion = 5
Metric train = 1.00 - Metric validation = 1.00.
Iteracion = 6
Metric train = 1.00 - Metric validation = 1.00.
Iteracion = 7
Metric train = 0.61 - Metric validation = 0.58.
Iteracion = 8
Metric train = 1.00 - Metric validation = 0.99.
Iteracion = 9
Metric train = 1.00 - Metric validation = 1.00.
Iteracion = 10
Metric train = 0.90 - Metric validation = 0.88.
Iteracion = 11
Metric train = 1.00 - Metric validation = 0.99.
Iteracion = 12
Metric train = 1.00 - Metric validation = 1.00.
Iteracion = 13
Metric train = 0.98 - Metric validation = 0.99.
Iteracion = 14
Metric train = 1.00 - Metric validation = 1.00.
Iteracion = 15
Metric train = 1.00 - Metric validation = 1.00.
Iteracion = 16
Metric train = 0.97 - Metric validation = 0.96.
I

In [153]:
grid_results = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, True])
best_model = grid_results.iloc[0]
best_model

max_features       2.00000
n_estimators     100.00000
max_samples     1000.00000
metric_train       0.99774
metric_val         1.00000
Name: 5, dtype: float64

In [154]:
print('Old train data size = ' + str(X_train.shape))
print('Old train target size = ' + str(y_train.shape))

# Combine train and validación
X_train = np.concatenate((X_train, X_val), axis = 0)
y_train = np.concatenate((y_train, y_val), axis = 0)

print('New train data size = ' + str(X_train.shape))
print('New train target size = ' + str(y_train.shape))

Old train data size = (22894, 23)
Old train target size = (22894,)
New train data size = (27799, 23)
New train target size = (27799,)


In [155]:
model =  model_constructor(criterion = 'gini',
                          max_depth = None,
                          min_samples_split = 2,
                          min_samples_leaf = 1,
                          max_features = int(best_model.max_features),
                          n_estimators =  int(best_model.n_estimators),
                          max_samples = int(best_model.max_samples),
                          random_state = 0) # Use same random_state as in training!!!

In [156]:
dat.drop('date', axis=1, inplace=True)

# Evaluate Model

In [157]:
# [4] Train model
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)


# [5] Predict
pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test[:,1])

In [158]:
# print error
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 0.96 - Metric val = 0.93 - Metric test = 0.93
