In [32]:
# Load libraries
import numpy as np
import pandas as pd
import io

#  Mock Midterm Exercise

In this exercise you will have to implement code in the sections inside *Fill with Your Code* (*Load Data*, *Data Preprocessing* and *Create Model*) in order to create a model that predicts the column *exitus* in the given dataset. The dataset is already split into train, validation, and test subsets. To see which subset each observation belongs to, check the *dataset* column.

The code that is already written in this notebook **CANNOT BE CHANGED**. You can only add code in the *Fill with Your Code* section.

The last cell of this notebook must report an **AUC on the test set of at least 0.93**.

# Fill With Your Code

### [1] Load Data

In [33]:
# Read the raw healthcare dataset (comma-separated, which is the pandas default).
dat = pd.read_csv('../data/healthcare_missing.csv')

In [34]:
dat

Unnamed: 0,date,severity,mortality_ratio,age,num_proc,ambulatory,origin,expected_length,tip_grd,tip_adm,exitus
0,2016-01,,0.408730,12596.0,21.0,0.0,,151.0,Q,1.0,0
1,2016-01,,0.306931,20973.0,22.0,,,99.0,Q,1.0,0
2,2016-01,4.0,0.278481,19611.0,19.0,,,87.0,,1.0,0
3,2016-01,3.0,0.150289,13583.0,22.0,,,100.0,Q,,0
4,2016-01,1.0,0.016573,18042.0,2.0,,,44.0,Q,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
32701,2016-12,2.0,0.028365,23619.0,2.0,,,2.0,,1.0,0
32702,2016-12,1.0,0.000606,3935.0,1.0,,1.0,2.0,M,1.0,0
32703,2016-12,,0.040452,30163.0,4.0,,,2.0,M,,0
32704,2016-12,,0.000000,29012.0,4.0,,,0.0,,1.0,0


In [38]:
# Feature matrix: every column except the label and the raw `date` string.
X = dat.drop(columns = ['exitus', 'date'])
# Target vector: the `exitus` label.
y = dat['exitus']
X

Unnamed: 0,severity,mortality_ratio,age,num_proc,ambulatory,origin,expected_length,tip_grd,tip_adm
0,,0.408730,12596.0,21.0,0.0,,151.0,Q,1.0
1,,0.306931,20973.0,22.0,,,99.0,Q,1.0
2,4.0,0.278481,19611.0,19.0,,,87.0,,1.0
3,3.0,0.150289,13583.0,22.0,,,100.0,Q,
4,1.0,0.016573,18042.0,2.0,,,44.0,Q,1.0
...,...,...,...,...,...,...,...,...,...
32701,2.0,0.028365,23619.0,2.0,,,2.0,,1.0
32702,1.0,0.000606,3935.0,1.0,,1.0,2.0,M,1.0
32703,,0.040452,30163.0,4.0,,,2.0,M,
32704,,0.000000,29012.0,4.0,,,0.0,,1.0


### [2]  Data Preprocessing

In [37]:
from scipy import stats
from imblearn.over_sampling import SMOTE

def oversampling(X, y, p, type_value = 'SMOTE', k = 5, random_state = 0, n_jobs = 1):
    """Oversample the less frequent class(es) of a classification dataset.

    Parameters
    ----------
    X : DataFrame or array-like
        Feature matrix, one row per observation.
    y : Series or array-like
        Labels, one per row of ``X`` (matched to ``X`` by position).
    p : float
        'SMOTE': forwarded to ``SMOTE`` as ``sampling_strategy``.
        'duplicates': fraction of the non-majority rows that is drawn (with
        replacement) and appended to the data.
    type_value : {'SMOTE', 'duplicates'}
        Oversampling strategy.
    k : int
        Forwarded to ``SMOTE`` as ``k_neighbors``; ignored by 'duplicates'.
    random_state : int or None
        Seed used by both strategies.
    n_jobs : int
        Unused; kept only so existing calls keep working.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``. For 'duplicates' these are a DataFrame
        holding only the feature columns and a Series holding the labels; the
        original rows come first, followed by the duplicated rows.

    Raises
    ------
    ValueError
        If ``type_value`` is not a supported strategy, or if ``X`` and ``y``
        have different lengths in the 'duplicates' strategy.
    """
    if type_value == 'SMOTE':
        sm = SMOTE(sampling_strategy = p,
                   random_state = random_state,
                   k_neighbors = k)
        X, y = sm.fit_resample(X, y)
        return X, y

    if type_value == 'duplicates':
        features = pd.DataFrame(X)
        labels = pd.Series(y)
        if len(features) != len(labels):
            raise ValueError('X and y must have the same number of rows '
                             '(got %d and %d)' % (len(features), len(labels)))

        # Pair labels with feature rows by position, so a NumPy `y` or an `X`
        # with a non-default index cannot be misaligned.
        target = pd.Series(labels.to_numpy(), index = features.index, name = labels.name)
        if target.empty:
            return features, target

        # Most frequent label; on ties the smallest value is used.
        majority_class = target.mode().iloc[0]
        is_minority = (target != majority_class).to_numpy()
        minority_positions = pd.Series(range(len(target)))[is_minority]
        if minority_positions.empty:
            # Only one class present: nothing to duplicate.
            return features, target

        # Draw row positions once and reuse them for features and labels so
        # the two outputs always stay in step.
        picked = minority_positions.sample(frac = p,
                                           replace = True,
                                           random_state = random_state).to_numpy()

        # Features and labels are returned separately: the label column must
        # not end up inside X, otherwise the target leaks into the features.
        X_resampled = pd.concat([features, features.iloc[picked]])
        y_resampled = pd.concat([target, target.iloc[picked]])
        return X_resampled, y_resampled

    raise ValueError("Unknown type_value %r; expected 'SMOTE' or 'duplicates'" % (type_value,))

In [None]:
# NOTE(review): `X` is derived from the raw `dat`, which still shows NaN values and the
# string column `tip_grd` at this point. SMOTE presumably needs complete, numeric
# input -- confirm, and if so run this only after imputation/encoding.
X_new, y_new = oversampling(X, y, p = 0.25)

In [23]:
# Percentage of missing values in each column.
100 * dat.isna().sum() / len(dat)

date                0.000000
severity           63.089341
mortality_ratio     6.934507
age                 1.296398
num_proc            4.127683
ambulatory         94.010273
origin             62.557329
expected_length     5.726778
tip_grd            41.631505
tip_adm            18.617379
exitus              0.000000
dtype: float64

In [24]:
from fancyimpute import IterativeImputer as MICE
def fill_missing_values(X, num_method = 'mean', cat_method = 'new_category',
                estimator = None):
    """Return a copy of ``X`` with missing values imputed.

    Column roles are taken from ``X`` itself: ``float64`` columns are treated
    as numerical and ``object`` columns as categorical. Columns of any other
    dtype are left untouched.

    Parameters
    ----------
    X : DataFrame
        Data to impute. It is not modified; a new frame is returned.
    num_method : {'mean', 'mice'}
        'mean' fills each numerical column with its own mean (NaNs ignored).
        'mice' fits the iterative imputer (``MICE``) on the numerical columns.
        Any other value leaves numerical columns unchanged.
    cat_method : {'new_category', 'mode'}
        'new_category' replaces missing categories with the string 'UNKNOWN'.
        'mode' replaces them with the most frequent value of the column.
        Any other value leaves categorical columns unchanged.
    estimator : object, optional
        Forwarded to ``MICE(estimator=...)`` when ``num_method == 'mice'``.

    Returns
    -------
    DataFrame
        Imputed copy of ``X`` with the same columns and index.
    """
    # Work on a copy so the caller's frame is never modified in place.
    X = X.copy()

    # Derive the column groups from the argument, not from a notebook global,
    # so the function works on any frame (e.g. a train-only subset).
    categorical_variables = X.columns[(X.dtypes == 'object').to_numpy()].tolist()
    numerical_variables = X.columns[(X.dtypes == 'float64').to_numpy()].tolist()

    if num_method == 'mean':
        if numerical_variables:
            means = X[numerical_variables].mean().to_dict()
            X = X.fillna(value = means)

    elif num_method == 'mice':
        if numerical_variables:
            imputer = MICE() if estimator is None else MICE(estimator = estimator)
            X[numerical_variables] = imputer.fit_transform(X[numerical_variables])

    if cat_method == 'new_category':
        if categorical_variables:
            X[categorical_variables] = X[categorical_variables].fillna(value = 'UNKNOWN')

    elif cat_method == 'mode':
        # Use the most frequent value of each column; a mean is undefined for
        # categorical data. Columns that are entirely missing have no mode and
        # are left as they are.
        modes = {}
        for column in categorical_variables:
            column_modes = X[column].mode()
            if not column_modes.empty:
                modes[column] = column_modes.iloc[0]
        if modes:
            X = X.fillna(value = modes)

    return X

In [25]:
# Replace missing values: numerical columns get the column mean, object columns get
# the 'UNKNOWN' category. `dat` is rebound to the imputed frame, so the raw values
# are no longer available after this cell.
dat = fill_missing_values(dat, 'mean', 'new_category')
dat

Unnamed: 0,date,severity,mortality_ratio,age,num_proc,ambulatory,origin,expected_length,tip_grd,tip_adm,exitus
0,2016-01,1.618125,0.408730,12596.0,21.0,0.000000,1.520088,151.0,Q,1.000000,0
1,2016-01,1.618125,0.306931,20973.0,22.0,0.069934,1.520088,99.0,Q,1.000000,0
2,2016-01,4.000000,0.278481,19611.0,19.0,0.069934,1.520088,87.0,UNKNOWN,1.000000,0
3,2016-01,3.000000,0.150289,13583.0,22.0,0.069934,1.520088,100.0,Q,1.271931,0
4,2016-01,1.000000,0.016573,18042.0,2.0,0.069934,1.520088,44.0,Q,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...
32701,2016-12,2.000000,0.028365,23619.0,2.0,0.069934,1.520088,2.0,UNKNOWN,1.000000,0
32702,2016-12,1.000000,0.000606,3935.0,1.0,0.069934,1.000000,2.0,M,1.000000,0
32703,2016-12,1.618125,0.040452,30163.0,4.0,0.069934,1.520088,2.0,M,1.271931,0
32704,2016-12,1.618125,0.000000,29012.0,4.0,0.069934,1.520088,0.0,UNKNOWN,1.000000,0


In [26]:
# Class balance: percentage of rows for each value of `exitus` (motivates oversampling)
100*dat.groupby(['exitus'])['exitus'].agg(['count'])/dat.shape[0]

Unnamed: 0_level_0,count
exitus,Unnamed: 1_level_1
0,96.159726
1,3.840274


In [27]:
# As we can see, the classes are heavily imbalanced; absolute counts per class:
dat['exitus'].value_counts()

exitus
0    31450
1     1256
Name: count, dtype: int64

### [3] Create Model

In [None]:
FILL

In [None]:
model =  FILL

# Evaluate Model

In [None]:
# [4] Train model
model.fit(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1), dat[dat['dataset'] == 'train'].exitus.values)


# [5] Predict
pred_train = model.predict_proba(dat[dat['dataset'] == 'train'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_val = model.predict_proba(dat[dat['dataset'] == 'val'].drop(['exitus', 'dataset'], axis = 1)) # predict!
pred_test = model.predict_proba(dat[dat['dataset'] == 'test'].drop(['exitus', 'dataset'], axis = 1)) # predict!


# [6] Compute metric
metric_train = metric(dat[dat['dataset'] == 'train'].exitus.values, pred_train[:,1])
metric_val = metric(dat[dat['dataset'] == 'val'].exitus.values, pred_val[:,1])
metric_test = metric(dat[dat['dataset'] == 'test'].exitus.values, pred_test[:,1])

In [None]:
# print error
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (metric_train, metric_val, metric_test))

Metric train = 0.96 - Metric val = 0.94 - Metric test = 0.93
