In [2]:
import sys
import logging

import numpy as np
import scipy as sp

%reload_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
sns.set_context("poster")
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_style("whitegrid")

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


def get_features(df):
    df.drop("Time", axis=1, inplace=True)
    df_mean = df.groupby("pid", sort=False).mean().add_suffix("_mean")
    df_var = df.groupby("pid", sort=False).var().add_suffix("_var")
    df_min = df.groupby("pid", sort=False).min().add_suffix("_min")
    df_max = df.groupby("pid", sort=False).max().add_suffix("_max")
    df_n_measurements = (
        df.groupby("pid", sort=False)
        .apply(lambda x: 12 - x.isna().sum())
        .add_suffix("_n_meas")
        .drop("pid_n_meas", axis=1)
    )  # Number of measurements
    df_age = df_mean.loc[:, "Age_mean"]

    return pd.concat(
        [df_age, df_mean, df_var, df_min, df_max, df_n_measurements], axis=1
    )


# get data
df_train = pd.read_csv("../train_features.csv")
df_training_labels = pd.read_csv("../train_labels.csv")
df_test = pd.read_csv("../test_features.csv")

# get features
df_train_features = get_features(df_train)
df_test_features = get_features(df_test)

# impute missing values
imputer = SimpleImputer(strategy="median")
imputer.fit(df_train_features)
X_train = imputer.transform(df_train_features)
X_test = imputer.transform(df_test_features)

# center and descale
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# prepare df for predictions
df_pred = pd.DataFrame(columns=df_training_labels.columns)
df_pred["pid"] = df_test_features.index


Subtask 1

In [6]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score

TESTS = [
    "LABEL_BaseExcess",
    "LABEL_Fibrinogen",
    "LABEL_AST",
    "LABEL_Alkalinephos",
    "LABEL_Bilirubin_total",
    "LABEL_Lactate",
    "LABEL_TroponinI",
    "LABEL_SaO2",
    "LABEL_Bilirubin_direct",
    "LABEL_EtCO2",
]

params = {
    "loss": "binary_crossentropy",
    "learning_rate": 0.1,
    "l2_regularization": 0.0,
    "early_stopping": True,
    "scoring": "roc_auc",
}

hgbc = HistGradientBoostingClassifier(**params)

for target in TESTS + ["LABEL_Sepsis"]:
    print(f"CV errors for {target}")
    print(
        cross_val_score(
            hgbc, X_train, df_training_labels[target], scoring="roc_auc", cv=5
        ),
        "\n",
    )



CV errors for LABEL_BaseExcess
[0.93383006 0.92872756 0.93152583 0.93558327 0.93849733] 

CV errors for LABEL_Fibrinogen
[0.79864308 0.80659177 0.80904376 0.79839545 0.8050816 ] 

CV errors for LABEL_AST
[0.76048711 0.7580191  0.7420094  0.74849295 0.74564647] 

CV errors for LABEL_Alkalinephos
[0.76960417 0.75850277 0.74748664 0.7536181  0.75336436] 

CV errors for LABEL_Bilirubin_total
[0.76503722 0.75662011 0.74773635 0.7412922  0.74727235] 

CV errors for LABEL_Lactate
[0.81289042 0.81658908 0.81283765 0.81296717 0.80119153] 

CV errors for LABEL_TroponinI
[0.89995641 0.90351263 0.89731943 0.8963107  0.89989585] 

CV errors for LABEL_SaO2
[0.8440725  0.82647348 0.83108418 0.82276841 0.84138759] 

CV errors for LABEL_Bilirubin_direct
[0.80018004 0.77117631 0.74904421 0.74816552 0.73721775] 

CV errors for LABEL_EtCO2
[0.93805185 0.93726335 0.92872535 0.9498511  0.94334724] 

CV errors for LABEL_Sepsis
[0.71258108 0.72734126 0.74093073 0.75651822 0.69483436] 



In [None]:
from sklearn.model_selection import GridSearchCV

params = {
    "loss": "binary_crossentropy",
    "early_stopping": True,
    "scoring": "roc_auc",
}

hgbc = HistGradientBoostingClassifier(**params)

param_grid = {
    "learning_rate": [0.1, 0.2, 0.5],
    "max_depth": [3, None],
    "l2_regularization": [0, 0.01, 0.1],
}

search = GridSearchCV(hgbc, param_grid, scoring="roc_auc", verbose=True)
search.fit(X_train, df_training_labels[TESTS[0]])


Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(estimator=HistGradientBoostingClassifier(early_stopping=True,
                                                      loss='binary_crossentropy',
                                                      scoring='roc_auc'),
             param_grid={'l2_regularization': [0, 0.01, 0.1],
                         'learning_rate': [0.1, 0.2, 0.5],
                         'max_depth': [3, None]},
             scoring='roc_auc', verbose=True)

In [None]:
search.best_params_


{'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': None}

In [None]:
search.best_score_


0.9330862610364049

In [None]:
from sklearn.svm import SVC

clf = SVC(kernel="rbf", gamma="scale", class_weight="balanced")

# for target in TESTS + ['LABEL_Sepsis']:
#     print(f'CV errors for {target}')
#     print(cross_val_score(clf, X_train, df_training_labels[target], scoring='roc_auc', cv=5), '\n')

# takes ages; dataset probably too big to run this implementation like this


In [None]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss="log", fit_intercept=False, n_jobs=-1, early_stopping=True)

for target in TESTS + ["LABEL_Sepsis"]:
    print(f"CV errors for {target}")
    print(
        cross_val_score(
            clf, X_train, df_training_labels[target], scoring="roc_auc", cv=5
        ),
        "\n",
    )



CV errors for LABEL_BaseExcess
[0.83164455 0.8602841  0.82329057 0.79464032 0.86610868] 

CV errors for LABEL_Fibrinogen
[0.65954208 0.66302927 0.57255308 0.51897455 0.58915682] 

CV errors for LABEL_AST
[0.64800855 0.61825647 0.61884181 0.61170565 0.62699166] 

CV errors for LABEL_Alkalinephos
[0.65311203 0.5962931  0.66137412 0.5770946  0.64309942] 

CV errors for LABEL_Bilirubin_total
[0.61113812 0.60830941 0.58579653 0.61693017 0.65217813] 

CV errors for LABEL_Lactate
[0.6571548  0.69166597 0.67480205 0.63426768 0.653827  ] 

CV errors for LABEL_TroponinI
[0.6590435  0.75712864 0.71961456 0.79173494 0.69828419] 

CV errors for LABEL_SaO2
[0.76981421 0.73426982 0.68322625 0.72658153 0.65774973] 

CV errors for LABEL_Bilirubin_direct
[0.64554958 0.42442389 0.61756543 0.64364742 0.60336058] 

CV errors for LABEL_EtCO2
[0.82021527 0.82077552 0.81944488 0.85837147 0.85952807] 

CV errors for LABEL_Sepsis
[0.55901113 0.51789156 0.57195981 0.51756211 0.53993042] 



Subtask 3

In [11]:
from sklearn.ensemble import HistGradientBoostingRegressor

VITALS = ["LABEL_RRate", "LABEL_ABPm", "LABEL_SpO2", "LABEL_Heartrate"]

hgbr = HistGradientBoostingRegressor()

for target in VITALS:
    print(f"CV errors for {target}")
    print(
        cross_val_score(
            hgbr, X_train, df_training_labels[target], scoring="r2", cv=5
        ),
        "\n",
    )



CV errors for LABEL_RRate
[0.43348137 0.42409089 0.40568175 0.4213123  0.41635368] 

CV errors for LABEL_ABPm
[0.6216108  0.62937353 0.63552043 0.58787866 0.61717387] 

CV errors for LABEL_SpO2
[0.38499524 0.34560792 0.36687893 0.39328909 0.38563194] 

CV errors for LABEL_Heartrate
[0.64504049 0.6425895  0.61686668 0.62563988 0.64793856] 



In [13]:
from sklearn.model_selection import GridSearchCV

params = {
    "early_stopping": True,
    "scoring": "r2",
}

hgbr = HistGradientBoostingRegressor(**params)

param_grid = {
    "learning_rate": [0.1, 0.2, 0.5],
    "max_depth": [3, None],
    "l2_regularization": [0, 0.01, 0.1],
}

search = GridSearchCV(hgbr, param_grid, scoring="r2", verbose=True)
search.fit(X_train, df_training_labels[VITALS[0]])


Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(estimator=HistGradientBoostingRegressor(early_stopping=True,
                                                     scoring='r2'),
             param_grid={'l2_regularization': [0, 0.01, 0.1],
                         'learning_rate': [0.1, 0.2, 0.5],
                         'max_depth': [3, None]},
             scoring='r2', verbose=True)

In [14]:
search.best_params_, search.best_score_

({'l2_regularization': 0.1, 'learning_rate': 0.1, 'max_depth': 3},
 0.4211937651213347)

In [20]:
from sklearn.model_selection import GridSearchCV

params = {
    "early_stopping": True,
    "scoring": "r2",
}

hgbr = HistGradientBoostingRegressor(**params, verbose=True)

param_grid = {
    "learning_rate": [0.1, 0.2, 0.5],
    "max_depth": [3, 5],
    "l2_regularization": [0, 0.01, 0.1],
}

search = GridSearchCV(hgbr, param_grid, scoring="r2")
search.fit(X_train, df_training_labels[VITALS[1]])

search.best_params_, search.best_score_

Binning 0.019 GB of training data: 0.362 s
Binning 0.002 GB of validation data: 0.041 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 8 leaves, max depth = 3, train score: 0.10751, val score: 0.11094, in 0.183s
[2/100] 1 tree, 8 leaves, max depth = 3, train score: 0.19493, val score: 0.20121, in 0.056s
[3/100] 1 tree, 8 leaves, max depth = 3, train score: 0.26654, val score: 0.27587, in 0.039s
[4/100] 1 tree, 8 leaves, max depth = 3, train score: 0.32477, val score: 0.33561, in 0.056s
[5/100] 1 tree, 8 leaves, max depth = 3, train score: 0.37206, val score: 0.38542, in 0.114s
[6/100] 1 tree, 8 leaves, max depth = 3, train score: 0.41128, val score: 0.42642, in 0.073s
[7/100] 1 tree, 8 leaves, max depth = 3, train score: 0.44335, val score: 0.46004, in 0.032s
[8/100] 1 tree, 8 leaves, max depth = 3, train score: 0.46994, val score: 0.48741, in 0.057s
[9/100] 1 tree, 8 leaves, max depth = 3, train score: 0.49139, val score: 0.50956, in 0.045s
[10/100] 1 tree, 8 leaves, max depth = 3, 

({'l2_regularization': 0.1, 'learning_rate': 0.1, 'max_depth': 5},
 0.61945466044295)