In [53]:
# Load libraries
import numpy as np
import pandas as pd

from timeit import default_timer

from sklearn.model_selection import train_test_split

# Random Forest Exercise

In this exercise you will have to implement code in the section *Fill with Your Code* to create a random forest to predict the target column of the Breast cancer dataset.

The code that is already written in this notebook **CANNOT BE CHANGED**. You can only add code in the *Fill with Your Code* section.

You must achieve in the last cell of this notebook an **F1-score over test of at least 0.35**.

## Key Hyperparameters

In [3]:
# [1] Import model
from sklearn.ensemble import RandomForestClassifier as model_constructor

## Data

You already know it is iris...

In [54]:
dat = pd.read_csv('../data/healthcare_missing.csv', sep = ",")
y = dat['exitus']
X = dat.drop(['exitus'], axis = 1)

In [7]:
dat

Unnamed: 0,date,severity,mortality_ratio,age,num_proc,ambulatory,origin,expected_length,tip_grd,tip_adm,exitus
0,2016-01,4,0.408730,12596,21,0,1,151,Q,1.0,0
1,2016-01,4,0.306931,20973,22,0,1,99,Q,1.0,0
2,2016-01,4,0.278481,19611,19,0,1,87,Q,1.0,0
3,2016-01,3,0.150289,13583,22,0,1,100,Q,1.0,0
4,2016-01,1,0.016573,18042,2,0,1,44,Q,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
32701,2016-12,2,0.028365,23619,2,0,1,2,M,1.0,0
32702,2016-12,1,0.000606,3935,1,0,1,2,M,1.0,0
32703,2016-12,2,0.040452,30163,4,0,1,2,M,1.0,0
32704,2016-12,2,0.000000,29012,4,0,1,0,M,1.0,0


In [11]:
dat.columns

Index(['date', 'severity', 'mortality_ratio', 'age', 'num_proc', 'ambulatory',
       'origin', 'expected_length', 'tip_grd', 'tip_adm', 'exitus'],
      dtype='object')

## Fill with Your Code

In [37]:
# One hot encoding
from sklearn.preprocessing import OneHotEncoder
dat['origin'].value_counts()

origin
1.0    9095
2.0    2268
4.0     230
8.0     208
3.0     185
6.0     165
9.0      95
Name: count, dtype: int64

In [55]:
categorical_vars = ['date', 'severity', 'origin', 'ambulatory', 'tip_grd', 'tip_adm']
non_categorical_vars = list(set(X.columns) - set(categorical_vars))
numerical_variables = list(set(dat.columns) - set(categorical_vars) - {'dataset', 'date', 'exitus'})
print(categorical_vars)
print(numerical_variables)

['date', 'severity', 'origin', 'ambulatory', 'tip_grd', 'tip_adm']
['age', 'mortality_ratio', 'num_proc', 'expected_length']


In [56]:
from sklearn.preprocessing import OneHotEncoder
# 3 defining the model
ohe = OneHotEncoder(sparse_output = False, drop='first')

# 4) Training model
ohe.fit(X[categorical_vars])

# 5) Predicting
dat_ohe = pd.DataFrame(ohe.fit_transform(X[categorical_vars]))

# Optional
dat_ohe.columns = ohe.get_feature_names_out()

# Combine numerical and categorical
dat = pd.concat((X[non_categorical_vars], dat_ohe), axis=1)

In [48]:
# Fill missing values
dat.isna().any()

age                 True
mortality_ratio     True
num_proc            True
expected_length     True
date_2016-02       False
date_2016-03       False
date_2016-04       False
date_2016-05       False
date_2016-06       False
date_2016-07       False
date_2016-08       False
date_2016-09       False
date_2016-10       False
date_2016-11       False
date_2016-12       False
severity_2.0       False
severity_3.0       False
severity_4.0       False
severity_nan       False
origin_2.0         False
origin_3.0         False
origin_4.0         False
origin_6.0         False
origin_8.0         False
origin_9.0         False
origin_nan         False
ambulatory_1.0     False
ambulatory_nan     False
tip_grd_Q          False
tip_grd_nan        False
tip_adm_2.0        False
tip_adm_3.0        False
tip_adm_nan        False
dtype: bool

In [57]:
from fancyimpute import IterativeImputer as MICE # pip install fancyimpute
numerical_vars = list(set(dat.columns) - {'exitus', 'dataset'})
model = MICE()
dat[numerical_vars] = model.fit_transform(dat[numerical_vars])

In [50]:
dat.isna().any()

age                False
mortality_ratio    False
num_proc           False
expected_length    False
date_2016-02       False
date_2016-03       False
date_2016-04       False
date_2016-05       False
date_2016-06       False
date_2016-07       False
date_2016-08       False
date_2016-09       False
date_2016-10       False
date_2016-11       False
date_2016-12       False
severity_2.0       False
severity_3.0       False
severity_4.0       False
severity_nan       False
origin_2.0         False
origin_3.0         False
origin_4.0         False
origin_6.0         False
origin_8.0         False
origin_9.0         False
origin_nan         False
ambulatory_1.0     False
ambulatory_nan     False
tip_grd_Q          False
tip_grd_nan        False
tip_adm_2.0        False
tip_adm_3.0        False
tip_adm_nan        False
dtype: bool

In [58]:
perc_values = [0.7, 0.15, 0.15]
X_train, X_valtest, y_train, y_valtest = train_test_split(dat, y, stratify = y, test_size=perc_values[1] + perc_values[2], random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, stratify = y_valtest, test_size= perc_values[2] / (perc_values[1] + perc_values[2]), random_state=1)

In [59]:
from sklearn.ensemble import RandomForestClassifier as model_constructor
from sklearn.metrics import f1_score as metric

# Random Forest
n_estimators_values = [10, 100, 1000]
max_features_values = [2, 5, 10, 50, 100]
max_samples_values = [10,100, 1000, X_train.shape[0]]

params_grid = {'max_features': max_features_values,
              'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}

num_iter = 1
grid_results = pd.DataFrame(columns = ('max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'))

for max_features in params_grid['max_features']:
    for n_estimators in params_grid['n_estimators']:
        for max_samples in params_grid['max_samples']:

                        # Print trace
                        print('Iteracion = ' + str(num_iter))

                        # [3] Define model
                        model = model_constructor(max_features = max_features,
                                                  n_estimators = n_estimators,
                                                  max_samples = max_samples,
                                                  random_state = 0)

                        # [4] Train model
                        model.fit(X_train, y_train)


                        # [5] Predict
                        pred_train = model.predict(X_train) # predict!
                        pred_val = model.predict(X_val) # predict!

                        # [6] Compute metric
                        metric_train = metric(y_train, pred_train)
                        metric_val = metric(y_val, pred_val)

                        # print error
                        print('Metric train = %.2f - Metric validation = %.2f.'
                              % (metric_train, metric_val))

                        # Save iteration results
                        grid_results.loc[num_iter]=[ max_features,
                                                    n_estimators,
                                                    max_samples,
                                                 metric_train,
                                                 metric_val]
                        num_iter += 1




Iteracion = 1
Metric train = 0.00 - Metric validation = 0.00.
Iteracion = 2
Metric train = 0.02 - Metric validation = 0.02.
Iteracion = 3
Metric train = 0.08 - Metric validation = 0.08.
Iteracion = 4
Metric train = 0.92 - Metric validation = 0.21.
Iteracion = 5
Metric train = 0.00 - Metric validation = 0.00.
Iteracion = 6
Metric train = 0.00 - Metric validation = 0.00.
Iteracion = 7
Metric train = 0.02 - Metric validation = 0.01.
Iteracion = 8
Metric train = 1.00 - Metric validation = 0.20.
Iteracion = 9
Metric train = 0.00 - Metric validation = 0.00.
Iteracion = 10
Metric train = 0.00 - Metric validation = 0.00.
Iteracion = 11
Metric train = 0.01 - Metric validation = 0.01.
Iteracion = 12
Metric train = 1.00 - Metric validation = 0.20.
Iteracion = 13
Metric train = 0.00 - Metric validation = 0.00.
Iteracion = 14
Metric train = 0.01 - Metric validation = 0.01.
Iteracion = 15
Metric train = 0.20 - Metric validation = 0.20.
Iteracion = 16
Metric train = 0.94 - Metric validation = 0.30.
I

In [60]:
grid_results = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, True])
best_model = grid_results.iloc[0]
best_model

max_features       50.000000
n_estimators     1000.000000
max_samples     22894.000000
metric_train        1.000000
metric_val          0.345865
Name: 48, dtype: float64

## Final Model

Validation has served its purpose, let's combine it with train to get more training data.

In [62]:
print('Old train data size = ' + str(X_train.shape))
print('Old train target size = ' + str(y_train.shape))

# Combine train and validación
X_train = np.concatenate((X_train, X_val), axis = 0)
y_train = np.concatenate((y_train, y_val), axis = 0)

print('New train data size = ' + str(X_train.shape))
print('New train target size = ' + str(y_train.shape))

Old train data size = (27800, 33)
Old train target size = (27800,)
New train data size = (32706, 33)
New train target size = (32706,)


In [65]:
# [3] Define model
model = model_constructor(criterion = 'gini',
                          max_depth = None,
                          min_samples_split = 2,
                          min_samples_leaf = 1,
                          max_features = int(best_model.max_features),
                          n_estimators =  int(best_model.n_estimators),
                          max_samples = int(best_model.max_samples),
                          random_state = 0) # Use same random_state as in training!!!

# [4] Train model
model.fit(X_train, y_train)

# [5] Predict
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

# [6] Compute metric
metric_train = metric(y_train, pred_train)
metric_test = metric(y_test, pred_test)





In [66]:
# print error
print('AUC train = %.2f - AUC test = %.2f'
      % (metric_train, metric_test))

AUC train = 0.99 - AUC test = 0.28
