In [1]:
import scipy.stats as ss, pandas as pd, numpy as np

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score

In [2]:
# Number of hyperparameter configurations drawn per model (used by the `.rvs(N)` calls below).
N = 15
# Seed handed to np.random.seed before each sampling step so the draws are repeatable.
SEED = 1

class DictDist():
    """Joint sampler over a dict of named distributions.

    Every value in ``dict_of_rvs`` must expose ``rvs(n)`` and return an
    indexable sequence holding ``n`` draws.
    """

    def __init__(self, dict_of_rvs):
        self.dict_of_rvs = dict_of_rvs

    def rvs(self, n):
        """Return a list of ``n`` dicts, each mapping every key to one draw.

        The distributions are sampled one after another, in the dict's
        iteration order, each being asked for all ``n`` values at once.
        """
        draws = {}
        for name, dist in self.dict_of_rvs.items():
            draws[name] = dist.rvs(n)
        # Transpose "key -> n draws" into "n dicts of key -> single draw".
        return [
            {name: values[idx] for name, values in draws.items()}
            for idx in range(n)
        ]
    
class Choice():
    """Uniform sampler over a fixed, indexable collection of options."""

    def __init__(self, options):
        self.options = options

    def rvs(self, n):
        """Return a list of ``n`` options, each picked independently via ``ss.randint``."""
        # Draw n integer positions in [0, len(options)) and look each one up.
        positions = ss.randint(0, len(self.options)).rvs(n)
        picked = []
        for pos in positions:
            picked.append(self.options[pos])
        return picked

*   For each model, prepare a list of `{parameter: value}` dictionaries sampled from a predetermined search space

In [9]:
########################################################
################### CLASSIFICATION #####################
########################################################
# Logistic Regression
# Search space: C is picked from 10,000 log-spaced values between 1e-3 and 1e3;
# solver and max_iter are single-option choices, i.e. fixed.
LR_dist = DictDist({
    'C': Choice(np.geomspace(1e-3, 1e3, 10000)),
    'penalty': Choice(['l1', 'l2']),
    'solver': Choice(['lbfgs']),
    'max_iter': Choice([500])
})
# Re-seed NumPy's global RNG right before sampling so the N draws are repeatable.
np.random.seed(SEED)
LR_hyperparams_list = LR_dist.rvs(N)
# Force the 'l2' penalty whenever the solver is 'lbfgs' (scikit-learn's lbfgs solver
# does not support 'l1'). Since 'lbfgs' is the only solver option above, every
# sampled 'l1' ends up overwritten.
for i in range(N):
    if LR_hyperparams_list[i]['solver'] == 'lbfgs': LR_hyperparams_list[i]['penalty'] = 'l2'

# Random Forest
# Every hyperparameter is an integer drawn with scipy.stats.randint(low, high);
# the upper bound is exclusive (e.g. n_estimators falls in 50..499).
RF_dist = DictDist({
    'n_estimators': ss.randint(50, 500),
    'max_depth': ss.randint(2, 10),
    'min_samples_split': ss.randint(2, 75),
    'min_samples_leaf': ss.randint(1, 50),
})
# Re-seed before sampling so this list does not depend on the draws made for other models.
np.random.seed(SEED)
RF_hyperparams_list = RF_dist.rvs(N)

# Support Vector
# Every entry has a single option, so this describes one fixed configuration.
# probability=True is needed because run_basic scores models through predict_proba.
SV_dist = DictDist({
    'C': Choice([0.1]),
    'class_weight': Choice(['balanced']), 
    'verbose': Choice([0]),
    'probability': Choice([True]),
})
np.random.seed(SEED)
# Only one sample is drawn (not N): all draws would be identical anyway.
SV_hyperparams_list = SV_dist.rvs(1)


########################################################
##################### PREDICTION #######################
########################################################
# XGBRegressor
# n_estimators and max_depth are sampled integers (upper bounds exclusive);
# eta and verbosity are fixed single-option choices.
XG_dist = DictDist({
    'n_estimators': ss.randint(50, 500),
    'eta': Choice([0.1]),
    'verbosity': Choice([0]),
    'max_depth': ss.randint(2, 10),
})
# Re-seed before sampling so the N draws are repeatable.
np.random.seed(SEED)
# NOTE(review): XG_hyperparams_list is not consumed by any cell visible in this part of the notebook.
XG_hyperparams_list = XG_dist.rvs(N)

In [4]:
# LOAD DATA

# Feature tables: the first CSV column becomes the row index and the first three
# rows form a 3-level column header (later addressed through the 'LEVEL2' level
# and through tuples such as (variable, 'mask') / (variable, 'mean')).
x_train = pd.read_csv('../data/X_train.csv', index_col=0, header=[0, 1, 2])
x_valid = pd.read_csv('../data/X_valid.csv', index_col=0, header=[0, 1, 2])
x_test = pd.read_csv('../data/X_test.csv', index_col=0, header=[0, 1, 2])
# Labels used when run_basic is called with target 'mort_icu'.
y_train_cls = pd.read_csv('../data/Y_train.csv') 
y_valid_cls = pd.read_csv('../data/Y_valid.csv') 
# Labels used when run_basic is called with target 'los_icu'; note the different directory.
# NOTE(review): label files are read without index_col, so labels are presumably
# paired with feature rows by position — confirm the row order matches the X files.
y_train_los = pd.read_csv('../LoS/Y_train.csv')
y_valid_los =  pd.read_csv('../LoS/Y_valid.csv')

In [None]:
# PREPROCESSING

# For every LEVEL2 variable, sum its 'mask' columns per training row (presumably
# 0/1 observation flags — TODO confirm); a sum of 0 is replaced by NaN so that
# isna() later flags rows in which the variable was never observed.
# The frame is built in a single constructor call instead of one column insert
# per variable, which avoids pandas' fragmentation PerformanceWarning on wide tables.
nulls_data = pd.DataFrame({
    col: x_train.loc[:, (col, 'mask')].sum(axis=1).replace({0: np.nan})
    for col in x_train.columns.get_level_values('LEVEL2').unique()
})

def filter_min_max(data, max_missing=5000, nulls=None):
    """Summarise each sufficiently observed variable by its per-row min and max.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns can be selected with ``(variable, 'mask')`` and
        ``(variable, 'mean')``.
    max_missing : int, optional
        A variable is kept only if its column in ``nulls`` has at most this
        many NaN entries. Defaults to 5000.
    nulls : pandas.DataFrame, optional
        One column per variable, with NaN marking rows that lack the variable.
        Defaults to the module-level ``nulls_data`` table.

    Returns
    -------
    pandas.DataFrame
        Columns ``<variable>_min`` and ``<variable>_max`` for every kept
        variable, in the column order of ``nulls``.
    """
    if nulls is None:
        nulls = nulls_data  # module-level table derived from the training split

    missing_per_variable = nulls.isna().sum()
    kept_variables = missing_per_variable[missing_per_variable <= max_missing].index

    summaries = {}
    for col in kept_variables:
        # Compute mask * mean once and reuse it for both statistics.
        # NOTE(review): where mask is 0 the product is 0, so unobserved entries
        # contribute 0 to the min/max — confirm this is intended.
        masked = data.loc[:, (col, 'mask')].multiply(data.loc[:, (col, 'mean')])
        summaries[col + '_min'] = masked.min(axis=1)
        summaries[col + '_max'] = masked.max(axis=1)

    # A single constructor call avoids the fragmented frame that repeated
    # column insertion produces.
    return pd.DataFrame(summaries)

# Apply the same min/max summarisation to every split. Which variables are kept
# is decided inside filter_min_max from nulls_data, which is computed from x_train only.
# NOTE: the summarised frames (X_*) differ from the raw frames (x_*) only by letter case.
X_train = filter_min_max(x_train)
X_valid = filter_min_max(x_valid)
X_test = filter_min_max(x_test)

In [6]:
def run_basic(model, hyperparams_list, X_flat_train, X_flat_dev, X_flat_test, target):
    """Select hyperparameters by validation ROC AUC, then refit and predict.

    Each candidate is fitted on the training split inside an
    imputer -> scaler -> model pipeline and scored on the validation split.
    The best candidate is handed to ``run_only_final``.

    Parameters
    ----------
    model : callable
        Estimator class (or factory), invoked as ``model(**hyperparams)``.
    hyperparams_list : sequence of dict
        Candidate keyword-argument sets; must not be empty.
    X_flat_train, X_flat_dev, X_flat_test
        Feature tables of the training, validation and test splits.
    target : {"los_icu", "mort_icu"}
        Label column name; it also selects which module-level label frames
        (``y_*_los`` or ``y_*_cls``) are used.

    Returns
    -------
    tuple
        ``(y_score, y_pred)`` exactly as returned by ``run_only_final``.

    Raises
    ------
    ValueError
        If ``target`` is not recognised or ``hyperparams_list`` is empty.
    """
    if target not in ("los_icu", "mort_icu"):
        raise ValueError("Unknown target %r; expected 'los_icu' or 'mort_icu'" % (target,))
    if len(hyperparams_list) == 0:
        # Without candidates there is nothing to select and the final fit would
        # fail later with an unrelated-looking error.
        raise ValueError("hyperparams_list must contain at least one candidate")

    # NOTE(review): scoring below always uses predict_proba and ROC AUC, so the
    # 'los_icu' branch only works if that column is a binary label and `model`
    # is a probabilistic classifier — confirm before using it with a regressor.
    if target == "los_icu":
        y_train, y_valid = y_train_los, y_valid_los
    else:
        y_train, y_valid = y_train_cls, y_valid_cls

    # np.inf (lowercase) is used because the np.Inf alias no longer exists in NumPy 2.x.
    best_s, best_hyperparams = -np.inf, None
    for i, hyperparams in enumerate(hyperparams_list):
        print("On sample %d / %d (hyperparams = %s)" % (i+1, len(hyperparams_list), repr((hyperparams))))
        pipeline = make_pipeline(SimpleImputer(), StandardScaler(), model(**hyperparams))
        pipeline.fit(X_flat_train, y_train[target])

        # Column 1 of predict_proba is the score of the second class.
        s = roc_auc_score(y_valid[target], pipeline.predict_proba(X_flat_dev)[:, 1])
        if s > best_s:
            best_s, best_hyperparams = s, hyperparams
            print("New Best Score: %.2f @ hyperparams = %s" % (100*best_s, repr((best_hyperparams))))

    return run_only_final(model, best_hyperparams, X_flat_train, X_flat_dev, X_flat_test, y_train, y_valid, target)


def run_only_final(model, best_hyperparams, X_flat_train, X_flat_dev, X_flat_test, y_train, y_valid, target):
    """Refit on train + validation data and predict the test split.

    Builds an imputer -> scaler -> ``model(**best_hyperparams)`` pipeline,
    fits it on the row-wise concatenation of the training and validation
    splits (using the ``target`` label column), prints the test-set outputs
    and returns them.

    Returns
    -------
    tuple
        ``(y_score, y_pred)``: column 1 of ``predict_proba`` and the output
        of ``predict``, both computed on ``X_flat_test``.
    """
    estimator = model(**best_hyperparams)
    pipeline = make_pipeline(SimpleImputer(), StandardScaler(), estimator)

    # Stack the two labelled splits so the final model sees all labelled rows.
    all_features = pd.concat([X_flat_train, X_flat_dev])
    all_labels = pd.concat([y_train, y_valid])[target]
    pipeline.fit(all_features, all_labels)

    test_proba = pipeline.predict_proba(X_flat_test)
    y_score = test_proba[:, 1]
    y_pred = pipeline.predict(X_flat_test)

    print("===== score ===== \n", y_score)
    print("===== pred  ===== \n", y_pred)

    return y_score, y_pred

In [None]:
# Classification

# Run hyperparameter selection + final fit for each classifier on 'mort_icu'.
# Outputs are stored per model class name, because `score`/`pred` on their own
# only ever hold the results of the most recently processed model.
classification_outputs = {}
for model, params in [
        (SVC, SV_hyperparams_list), 
        (RandomForestClassifier, RF_hyperparams_list), 
        (LogisticRegression, LR_hyperparams_list)
]:
    (score, pred) = run_basic(model, params, X_train, X_valid, X_test, 'mort_icu')
    classification_outputs[model.__name__] = (score, pred)