In [1]:
# Load libraries
import numpy as np;
import pandas as pd

from timeit import default_timer

from sklearn.model_selection import train_test_split

# Random Forest Exercise

In this exercise you will have to implement code in the section *Fill with Your Code* to create a random forest that predicts the target column (`exitus`) of the healthcare dataset (`healthcare.csv`).

The code that is already written in this notebook **CANNOT BE CHANGED**. You can only add code in the *Fill with Your Code* section.

In the last cell of this notebook you must achieve an **F1-score of at least 0.35 on the test set**.

## Key Hyperparameters

In [2]:
# [1] Import model
# The classifier is imported under the neutral alias `model_constructor`;
# the grid-search and final-model cells build their estimators through it.
from sklearn.ensemble import RandomForestClassifier as model_constructor
# IPython help syntax (not plain Python): shows the documentation of the
# aliased class so its hyperparameters can be inspected.
?model_constructor

## Data

In [3]:
# NOTE(review): `google.colab` is presumably only importable inside the Colab
# runtime -- confirm before running this notebook elsewhere.
from google.colab import files
# Ask the user for a file. The result is indexed by file name
# ('healthcare.csv') when the CSV is parsed in the following cell.
uploaded = files.upload()

Saving healthcare.csv to healthcare.csv


In [12]:
import io

# Wrap the uploaded content in a file-like buffer so pandas can parse it
# as a semicolon-separated CSV.
csv_buffer = io.BytesIO(uploaded['healthcare.csv'])
dat = pd.read_csv(csv_buffer, sep=";")

# Target column to predict.
y = dat['exitus']

# Feature matrix: every column except the target and 'date'.
X = dat.drop(columns=['exitus', 'date'])

# Fill with Your Code

In [13]:
# `enable_iterative_imputer` must be imported before IterativeImputer: the
# estimator is still flagged experimental in scikit-learn. fancyimpute simply
# re-exports this same class, so no extra `pip install` is needed.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer as MICE
from sklearn.preprocessing import OneHotEncoder

### One Hot Encoding
categorical_vars = ['severity', 'origin', 'ambulatory', 'tip_grd', 'tip_adm']
# Preserve the column order of X. Building this list from a set would make
# the feature order depend on string hashing, i.e. change between kernel
# restarts, and the seeded random forest would no longer be reproducible.
non_categorical_vars = [col for col in X.columns if col not in categorical_vars]

# Define the encoder
ohe = OneHotEncoder(sparse_output = False)

# Fit on the categorical columns only and encode them in a single step.
# The encoded frame reuses X's index so the column-wise concat below aligns
# row by row, and takes the encoder's generated feature names as columns.
dat_ohe = pd.DataFrame(ohe.fit_transform(X[categorical_vars]),
                       columns = ohe.get_feature_names_out(),
                       index = X.index)

# Combine numerical and categorical
dat = pd.concat((X[non_categorical_vars], dat_ohe), axis=1)

### Fill missing values
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/validation/test split performed in the next cell, so held-out rows
# influence the imputed values. Consider fitting it on the training split only.
numerical_vars = [col for col in dat.columns if col not in ('exitus', 'dataset')]
# Named `imputer` (not `model`) so it is not confused with the classifier
# that later cells store in `model`.
imputer = MICE()
dat[numerical_vars] = imputer.fit_transform(dat[numerical_vars])



In [17]:
# Target fractions for the train / validation / test partitions.
perc_values = [0.7, 0.15, 0.15]
holdout_fraction = perc_values[1] + perc_values[2]

# First split: training set versus the combined validation+test hold-out,
# stratified on the target.
X_train, X_valtest, y_train, y_valtest = train_test_split(
    dat, y,
    test_size=holdout_fraction,
    stratify=y,
    random_state=1,
)

# Second split: divide the hold-out into validation and test, again
# stratified, using the test share relative to the hold-out size.
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest, y_valtest,
    test_size=perc_values[2] / holdout_fraction,
    stratify=y_valtest,
    random_state=1,
)

In [18]:
from itertools import product

from sklearn.ensemble import RandomForestClassifier as model_constructor
from sklearn.metrics import f1_score as metric

# Random Forest: exhaustive grid search over three hyperparameters.
# Every combination is trained on the training split and scored with the
# F1 metric on both the training and the validation split.
n_estimators_values = [10, 100, 1000]
max_features_values = [2, 5, 10]
max_samples_values = [100, 1000, X_train.shape[0]]

params_grid = {'max_features': max_features_values,
               'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}

# product() walks the grid in the same order as three nested loops
# (max_features outermost, max_samples innermost) without the deep nesting.
combinations = product(params_grid['max_features'],
                       params_grid['n_estimators'],
                       params_grid['max_samples'])

# One record per evaluated combination; the results frame is built once
# after the loop instead of being enlarged row by row.
grid_records = []

for num_iter, (max_features, n_estimators, max_samples) in enumerate(combinations, start = 1):

    # Print trace
    print('Iteracion = ' + str(num_iter))

    # [3] Define model
    model = model_constructor(max_features = max_features,
                              n_estimators = n_estimators,
                              max_samples = max_samples,
                              random_state = 0)

    # [4] Train model
    model.fit(X_train, y_train)

    # [5] Predict
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)

    # [6] Compute metric
    metric_train = metric(y_train, pred_train)
    metric_val = metric(y_val, pred_val)

    # Print scores
    print('Metric train = %.2f - Metric validation = %.2f.'
          % (metric_train, metric_val))

    # Save iteration results
    grid_records.append({'max_features': max_features,
                         'n_estimators': n_estimators,
                         'max_samples': max_samples,
                         'metric_train': metric_train,
                         'metric_val': metric_val})

# Rows are labelled with the 1-based iteration number, matching the trace
# printed above.
grid_results = pd.DataFrame(grid_records,
                            columns = ['max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'],
                            index = range(1, len(grid_records) + 1))




Iteracion = 1
Metric train = 0.01 - Metric validation = 0.00.
Iteracion = 2
Metric train = 0.20 - Metric validation = 0.21.
Iteracion = 3
Metric train = 0.93 - Metric validation = 0.34.
Iteracion = 4
Metric train = 0.00 - Metric validation = 0.00.
Iteracion = 5
Metric train = 0.23 - Metric validation = 0.26.
Iteracion = 6
Metric train = 1.00 - Metric validation = 0.35.
Iteracion = 7
Metric train = 0.00 - Metric validation = 0.00.
Iteracion = 8
Metric train = 0.23 - Metric validation = 0.25.
Iteracion = 9
Metric train = 1.00 - Metric validation = 0.35.
Iteracion = 10
Metric train = 0.08 - Metric validation = 0.11.
Iteracion = 11
Metric train = 0.22 - Metric validation = 0.25.
Iteracion = 12
Metric train = 0.94 - Metric validation = 0.33.
Iteracion = 13
Metric train = 0.01 - Metric validation = 0.02.
Iteracion = 14
Metric train = 0.23 - Metric validation = 0.25.
Iteracion = 15
Metric train = 1.00 - Metric validation = 0.36.
Iteracion = 16
Metric train = 0.00 - Metric validation = 0.01.
I

In [42]:
# Rank the evaluated combinations: highest validation score first and, among
# equal validation scores, the lowest training score first.
ranking_keys = ['metric_val', 'metric_train']
grid_results = grid_results.sort_values(by=ranking_keys, ascending=[False, True])

# The top-ranked row holds the hyperparameters used for the final model.
best_model = grid_results.iloc[0]
best_model

max_features      10.000000
n_estimators    1000.000000
max_samples     1000.000000
metric_train       0.267299
metric_val         0.291845
Name: 26, dtype: float64

## Final Model

Validation has served its purpose; let's combine it with the training set to get more training data.

In [28]:
print('Old train data size = ' + str(X_train.shape))
print('Old train target size = ' + str(y_train.shape))

# Combine train and validation.
# pd.concat (instead of np.concatenate) keeps the column names, so the final
# model is fitted and evaluated on inputs of the same kind (the test features
# remain a DataFrame) and scikit-learn does not warn about missing feature names.
# Assumes X_train / X_val / y_train / y_val are the pandas objects produced
# by the split cell, whose row labels are disjoint between the two splits.
# The guard makes the cell idempotent: if the validation rows are already
# part of X_train (the cell was run before), they are not appended again.
if not X_val.index.isin(X_train.index).all():
    X_train = pd.concat((X_train, X_val), axis = 0)
    y_train = pd.concat((y_train, y_val), axis = 0)

print('New train data size = ' + str(X_train.shape))
print('New train target size = ' + str(y_train.shape))

Old train data size = (22894, 25)
Old train target size = (22894,)
New train data size = (27800, 25)
New train target size = (27800,)


In [43]:
# [3] Define model
# Hyperparameters of the final forest: the tuned values come from the
# top-ranked grid-search row (cast to int because the row stores them as
# floats); the remaining ones are set explicitly.
final_params = dict(
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=int(best_model['max_features']),
    n_estimators=int(best_model['n_estimators']),
    max_samples=int(best_model['max_samples']),
    random_state=0,  # same random_state as in the grid search
)
model = model_constructor(**final_params)

# [4] Train model
model.fit(X_train, y_train)

# [5] Predict on the training data and on the held-out test data
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

# [6] Compute metric
metric_train = metric(y_train, pred_train)
metric_test = metric(y_test, pred_test)





In [None]:
# Report the final scores. `metric` is sklearn's f1_score, so these values
# are F1-scores (the quantity the exercise threshold refers to), not AUC.
print('F1 train = %.2f - F1 test = %.2f'
      % (metric_train, metric_test))