In [1]:
import sys
import logging

import numpy as np
import scipy as sp

# Reload edited modules automatically before each cell runs (autoreload mode 2).
%reload_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
# Draw figures inline in the notebook, using the high-DPI "retina" format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure context, style and figure size in ONE call. `sns.set` re-applies
# its default context ("notebook") every time it is called, so a separate
# `sns.set_context("poster")` issued before it is silently overridden.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
# Show up to 120 rows and 120 columns before pandas truncates a displayed frame.
pd.options.display.max_rows = 120
pd.options.display.max_columns = 120

# Send INFO-and-above log records to stdout so they show up in cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


def _rate_of_change(series):
    """Slope between the first and the last valid value of ``series``.

    Computes (last valid value - first valid value) divided by the difference
    of their index labels. Returns NaN when the series holds fewer than two
    valid values, which also keeps the division from ever being by zero.
    """
    first_idx = series.first_valid_index()
    last_idx = series.last_valid_index()
    if last_idx is None or last_idx == first_idx:
        return np.nan
    return (series.loc[last_idx] - series.loc[first_idx]) / (last_idx - first_idx)


def get_features(df):
    """Aggregate the rows of ``df`` into one feature row per ``pid``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "pid", "Time" and "Age". Every column other
        than "pid" and "Time" is aggregated. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``pid`` in order of first appearance. Columns, in order:
        "Age", then for every aggregated column ``c``: ``c_mean``, ``c_var``,
        ``c_min``, ``c_max``, ``c_n_meas`` (count of non-null values),
        ``c_last_measurement`` (last non-null value), ``c_diff`` (last minus
        first non-null value) and ``c_diff_by_time`` (that difference divided
        by the index distance between the two rows).

    Raises
    ------
    KeyError
        If "Time", "pid" or "Age" is missing from ``df``.
    """
    # Drop "Time" on a copy: removing it in place would silently change the
    # caller's frame and make a second call on the same frame fail.
    grouped = df.drop("Time", axis=1).groupby("pid", sort=False)

    # GroupBy.first()/last() skip missing values, i.e. they return the first /
    # last valid entry per column (NaN when a group has no valid entry).
    first_valid = grouped.first()
    last_valid = grouped.last()

    df_age = first_valid["Age"]
    df_mean = grouped.mean().add_suffix("_mean")
    df_var = grouped.var().add_suffix("_var")
    df_min = grouped.min().add_suffix("_min")
    df_max = grouped.max().add_suffix("_max")
    df_n_measurements = grouped.count().add_suffix("_n_meas")  # Number of measurements
    df_last_measurement = last_valid.add_suffix("_last_measurement")

    # Difference between first and last measurement: 0 when a pid has a single
    # valid value, NaN when it has none. Suffixed so the column names do not
    # collide with "Age" or with the per-time variant below.
    df_diff = (last_valid - first_valid).add_suffix("_diff")

    # Difference between first and last measurement divided by the distance of
    # their index labels.
    # NOTE(review): this equals elapsed time only if consecutive rows of a pid
    # are one time unit apart -- confirm, or derive it from "Time" instead.
    df_diff_by_time = grouped.agg(_rate_of_change).add_suffix("_diff_by_time")

    return pd.concat(
        [
            df_age,
            df_mean,
            df_var,
            df_min,
            df_max,
            df_n_measurements,
            df_last_measurement,
            df_diff,
            df_diff_by_time,
        ],
        axis=1,
    )



# Load the raw feature tables and the training labels.
df_train = pd.read_csv("../train_features.csv")
df_training_labels = pd.read_csv("../train_labels.csv")
df_test = pd.read_csv("../test_features.csv")

# Collapse each table to one feature row per pid.
# NOTE(review): the label rows are later used positionally against X_train,
# which presumes train_labels.csv lists pids in the same order -- confirm.
df_train_features = get_features(df_train)
df_test_features = get_features(df_test)

# Median imputation; the medians are learned on the training features only.
imputer = SimpleImputer(strategy="median").fit(df_train_features)
X_train, X_test = (
    imputer.transform(features)
    for features in (df_train_features, df_test_features)
)

# Standardise with mean/variance learned on the training matrix only.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Prediction frame: same columns as the labels, one row per test pid.
df_pred = pd.DataFrame(columns=df_training_labels.columns).assign(
    pid=df_test_features.index
)


In [5]:
# Label columns tuned with a regressor further down in this notebook.
VITALS = [
    "LABEL_RRate",
    "LABEL_ABPm",
    "LABEL_SpO2",
    "LABEL_Heartrate",
]

# Label columns tuned with a classifier further down in this notebook.
TESTS = [
    "LABEL_BaseExcess", "LABEL_Fibrinogen",
    "LABEL_AST", "LABEL_Alkalinephos",
    "LABEL_Bilirubin_total", "LABEL_Lactate",
    "LABEL_TroponinI", "LABEL_SaO2",
    "LABEL_Bilirubin_direct", "LABEL_EtCO2",
]

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from tqdm import tqdm

# Settings shared by every candidate classifier.
# No explicit "loss" is passed: for a binary target the estimator's default is
# the binary log-loss, while the older spelling "binary_crossentropy" is
# deprecated and rejected by recent scikit-learn releases.
params = {
    "early_stopping": True,
    "scoring": "roc_auc",  # metric the estimator monitors for early stopping
    "random_state": 0,  # fixes the early-stopping validation split across runs
}

hgbc = HistGradientBoostingClassifier(**params)

# Hyper-parameter grid: 4 * 3 * 3 = 36 candidates per target.
param_grid = {
    "learning_rate": [0.05, 0.1, 0.2, 0.3],
    "max_depth": [3, 5, None],
    "l2_regularization": [0, 0.01, 0.1],
}

# Maps each test label to (best mean CV ROC AUC, best parameter dict);
# entries stay None until their search has finished.
optimal_parameters_tests = dict.fromkeys(TESTS)

for target in tqdm(TESTS):
    # refit=False: only the CV scores are needed here, not a final fitted model.
    search = GridSearchCV(
        hgbc, param_grid, scoring="roc_auc", refit=False, n_jobs=-1, verbose=1
    )
    search.fit(X_train, df_training_labels[target])
    optimal_parameters_tests[target] = search.best_score_, search.best_params_


  0%|          | 0/10 [00:00<?, ?it/s]

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   4.6s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   8.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  10.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  12.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  18.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  19.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  23.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  26.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=  10.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  26.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  26.2s
[CV] END l2_regularization=0, lea

 10%|█         | 1/10 [03:55<35:19, 235.45s/it]

[CV] END l2_regularization=0.1, learning_rate=0.3, max_depth=None; total time=   5.4s
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   3.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   3.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   5.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   5.7s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   5.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   6.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   6.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   4.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   3.9s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  10.5s
[CV] END l2_regularization=0,

 20%|██        | 2/10 [06:13<23:45, 178.13s/it]

[CV] END l2_regularization=0.1, learning_rate=0.3, max_depth=None; total time=   2.9s
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   2.6s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   3.0s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   4.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   3.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  13.0s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  13.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  13.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  14.6s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  15.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  15.7s
[CV] END l2_regularization=0, le

 30%|███       | 3/10 [09:51<22:54, 196.42s/it]

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   9.7s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  11.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  15.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  15.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  15.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  15.6s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  15.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  20.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   6.8s
[CV] END l2_regularization=0, learning_rate=0.1, max_depth=3; total time=   5.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  15.9s
[CV] END l2_regularization=0, lear

 40%|████      | 4/10 [14:07<21:58, 219.80s/it]

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   3.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   4.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   6.0s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  15.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  16.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  16.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  11.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=  10.7s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  18.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  16.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  20.8s
[CV] END l2_regularization=0, lea

 50%|█████     | 5/10 [17:20<17:31, 210.23s/it]

[CV] END l2_regularization=0.1, learning_rate=0.3, max_depth=None; total time=   4.7s
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   3.9s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   4.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   7.7s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   9.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   5.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  10.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  14.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   4.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  14.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  17.6s
[CV] END l2_regularization=0,

 60%|██████    | 6/10 [20:15<13:12, 198.14s/it]

[CV] END l2_regularization=0.1, learning_rate=0.3, max_depth=None; total time=   5.1s
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   4.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   9.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   9.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  11.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  11.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  13.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  10.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   8.6s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  19.0s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   7.6s
[CV] END l2_regularization

 70%|███████   | 7/10 [22:39<09:01, 180.49s/it]

[CV] END l2_regularization=0.1, learning_rate=0.3, max_depth=None; total time=   4.9s
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   4.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   6.6s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   7.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   4.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  16.6s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  16.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=  10.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  17.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=  11.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=  19.4s
[CV] END l2_regularization=0,

 80%|████████  | 8/10 [26:38<06:38, 199.24s/it]

[CV] END l2_regularization=0.1, learning_rate=0.3, max_depth=None; total time=   5.3s
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   3.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   3.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   4.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   4.2s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   4.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   5.9s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   4.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   9.0s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   5.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   4.1s
[CV] END l2_regularization

 90%|█████████ | 9/10 [29:00<03:01, 181.27s/it]

[CV] END l2_regularization=0.1, learning_rate=0.3, max_depth=None; total time=   5.5s
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   4.0s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   5.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   5.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   6.5s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   7.4s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   7.6s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=3; total time=   8.3s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=5; total time=   7.8s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   5.1s
[CV] END l2_regularization=0, learning_rate=0.05, max_depth=None; total time=   9.0s
[CV] END l2_regularization

100%|██████████| 10/10 [31:39<00:00, 189.92s/it]

[CV] END l2_regularization=0.1, learning_rate=0.3, max_depth=None; total time=   7.1s





In [18]:
# Show the (best CV score, best parameters) pair found for each test label.
optimal_parameters_tests

{'LABEL_BaseExcess': (0.9342983600778322,
  {'l2_regularization': 0.1, 'learning_rate': 0.1, 'max_depth': 5}),
 'LABEL_Fibrinogen': (0.8128441521536149,
  {'l2_regularization': 0.1, 'learning_rate': 0.2, 'max_depth': 3}),
 'LABEL_AST': (0.7524438831358076,
  {'l2_regularization': 0, 'learning_rate': 0.05, 'max_depth': None}),
 'LABEL_Alkalinephos': (0.7570558914238925,
  {'l2_regularization': 0.1, 'learning_rate': 0.05, 'max_depth': None}),
 'LABEL_Bilirubin_total': (0.7530578825813742,
  {'l2_regularization': 0.1, 'learning_rate': 0.05, 'max_depth': None}),
 'LABEL_Lactate': (0.8127950310067316,
  {'l2_regularization': 0.01, 'learning_rate': 0.1, 'max_depth': 3}),
 'LABEL_TroponinI': (0.9002289805428259,
  {'l2_regularization': 0.1, 'learning_rate': 0.1, 'max_depth': 5}),
 'LABEL_SaO2': (0.8377286252938518,
  {'l2_regularization': 0.01, 'learning_rate': 0.1, 'max_depth': 5}),
 'LABEL_Bilirubin_direct': (0.7849908940572896,
  {'l2_regularization': 0.1, 'learning_rate': 0.05, 'max_depth

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from tqdm import tqdm

# Settings shared by every candidate classifier.
# No explicit "loss" is passed: for a binary target the estimator's default is
# the binary log-loss, while the older spelling "binary_crossentropy" is
# deprecated and rejected by recent scikit-learn releases.
params = {
    "early_stopping": True,
    "scoring": "roc_auc",  # metric the estimator monitors for early stopping
    "random_state": 0,  # fixes the early-stopping validation split across runs
}

hgbc = HistGradientBoostingClassifier(**params)

# Hyper-parameter grid: 3 * 3 * 3 = 27 candidates.
param_grid = {
    "learning_rate": [0.05, 0.1, 0.2],
    "max_depth": [3, 5, None],
    "l2_regularization": [0, 0.01, 0.1],
}

# refit=False: only the CV scores are needed here, not a final fitted model.
search = GridSearchCV(
    hgbc, param_grid, scoring="roc_auc", refit=False, n_jobs=-1, verbose=1
)
search.fit(X_train, df_training_labels["LABEL_Sepsis"])

# Best mean CV ROC AUC and the parameter combination that achieved it.
search.best_score_, search.best_params_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


(0.736347619274185,
 {'l2_regularization': 0.01, 'learning_rate': 0.1, 'max_depth': 3})

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor
from tqdm import tqdm

# Settings shared by every candidate regressor.
params = {
    "loss": "squared_error",
    "early_stopping": True,
    "scoring": "r2",  # metric the estimator monitors for early stopping
    "random_state": 0,  # fixes the early-stopping validation split across runs
}

hgbr = HistGradientBoostingRegressor(**params)

# Hyper-parameter grid: 4 * 3 * 3 = 36 candidates per target.
param_grid = {
    "learning_rate": [0.05, 0.1, 0.2, 0.3],
    "max_depth": [3, 5, None],
    "l2_regularization": [0, 0.01, 0.1],
}

# Maps each vital-sign label to (best mean CV R^2, best parameter dict);
# entries stay None until their search has finished.
optimal_parameters_vitals = dict.fromkeys(VITALS)

for target in tqdm(VITALS):
    # refit=False: only the CV scores are needed here, not a final fitted model.
    search = GridSearchCV(
        hgbr, param_grid, scoring="r2", refit=False, n_jobs=-1, verbose=1
    )
    search.fit(X_train, df_training_labels[target])
    optimal_parameters_vitals[target] = search.best_score_, search.best_params_

  0%|          | 0/4 [00:00<?, ?it/s]

Fitting 5 folds for each of 36 candidates, totalling 180 fits


 25%|██▌       | 1/4 [03:34<10:43, 214.53s/it]

Fitting 5 folds for each of 36 candidates, totalling 180 fits


 50%|█████     | 2/4 [08:26<08:40, 260.37s/it]

Fitting 5 folds for each of 36 candidates, totalling 180 fits


 75%|███████▌  | 3/4 [11:53<03:55, 235.95s/it]

Fitting 5 folds for each of 36 candidates, totalling 180 fits


100%|██████████| 4/4 [16:32<00:00, 248.24s/it]


In [16]:
# Show the (best CV score, best parameters) pair found for each vital-sign label.
optimal_parameters_vitals

{'LABEL_RRate': (0.4460806715384072,
  {'l2_regularization': 0, 'learning_rate': 0.05, 'max_depth': 5}),
 'LABEL_ABPm': (0.640279410412732,
  {'l2_regularization': 0, 'learning_rate': 0.05, 'max_depth': None}),
 'LABEL_SpO2': (0.4190096673787148,
  {'l2_regularization': 0.01, 'learning_rate': 0.1, 'max_depth': 5}),
 'LABEL_Heartrate': (0.6756403531004095,
  {'l2_regularization': 0, 'learning_rate': 0.05, 'max_depth': 5})}