In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from config import get_base_config
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from collections import defaultdict
import pandas as pd
import numpy as np

# Capture the kernel's current working directory through the IPython `%pwd`
# magic; the data path used when loading the CSV is built from this value.
root_dir = %pwd
import warnings
# Installs a blanket 'ignore' filter, so no warning of any category is shown
# for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings
# raised while fitting the estimators - confirm that is intended.
warnings.filterwarnings('ignore')


In [2]:
# How many random seeds the experiment is repeated over (seeds 0 .. n_seeds-1).
n_seeds = 15

# Name of the label column in the prepared data frame.
target_col = 'readmitted'

In [3]:
# Base project configuration. Only the disabled raw-data route described below
# consumes it, but it is kept available for later cells.
config = get_base_config()

# Load the already-prepared dataset from disk, using the first CSV column as
# the index.
# Disabled alternative: rebuild the frame from raw data with
# `load_data(config)` and `build_data_prep_pipe(config)`, concatenating the
# pipeline's `fit_transform` output column-wise (`pd.concat(..., axis=1)`).
# Neither helper is imported in this notebook.
prepped_csv_path = root_dir + "/data/prepped/prepped_data.csv"
df = pd.read_csv(prepped_csv_path, index_col=0)

# Binarise the target: 'YES' becomes 1, every other value becomes 0.
df[target_col] = df[target_col].eq('YES').astype(int)

# Every non-float column except the target is treated as categorical.
non_float_cols = df.select_dtypes(exclude=['float']).columns
categorical_cols = [name for name in non_float_cols if name != target_col]

# One-hot encode the categorical columns; all remaining columns (the float
# features and the target itself) are passed through unchanged. The
# transformer is fitted on the full frame.
ohe = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), categorical_cols)],
    remainder='passthrough',
).fit(df)

# Boolean mask that locates the target among the transformed output columns.
# The assertion guards against the suffix match hitting more than one column.
feature_names = ohe.get_feature_names_out()
onehot_target_mask = np.asarray(
    [name.endswith(target_col) for name in feature_names],
    dtype=bool,
)
assert onehot_target_mask.sum() == 1

In [4]:
# Repeated cross-validation experiment.
#
# For every seed, a stratified 80/20 split of `df` is drawn and only the 80%
# training portion is used (the held-out part is discarded here). Each
# estimator is then scored with 5-fold `cross_val_predict` on that portion and
# one ROC AUC per (estimator, seed) is stored in `aucs`.


def _to_dense(matrix):
    """Return `matrix` as a dense ndarray.

    `ColumnTransformer.transform` yields a scipy sparse matrix only when the
    stacked output is sparse enough (see its `sparse_threshold` parameter);
    otherwise it already returns a dense array, which has no `.toarray()`.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)


# Estimator classes keyed by display name; dict order fixes the run order.
# Every class is instantiated with `random_state=seed` inside the loop.
estimator_classes = {
    'RandomForest': RandomForestClassifier,
    'SVM': LinearSVC,
}

aucs = defaultdict(list)
for seed in range(n_seeds):

    train_set, _ = train_test_split(df, test_size=.2, random_state=seed, stratify=df[target_col])

    # The fitted transformer passes the target column through, so the label is
    # split off from the transformed matrix with the precomputed column mask.
    train_set = ohe.transform(train_set)
    y_train = _to_dense(train_set[:, onehot_target_mask]).astype(int).squeeze()
    X_train = _to_dense(train_set[:, ~onehot_target_mask])
    del train_set  # release the combined matrix before model fitting

    for estimator_type, estimator_class in estimator_classes.items():
        estimator = estimator_class(random_state=seed)

        print(f"Running cross-val for {estimator_type}, Seed {seed}", end="")

        # Use class probabilities when the estimator offers them; otherwise
        # fall back to the raw decision scores.
        method = 'predict_proba' if hasattr(estimator, 'predict_proba') else 'decision_function'
        y_score = cross_val_predict(estimator, X_train, y_train, cv=5, method=method).squeeze()
        if method == 'predict_proba':
            # Keep only the last column of the probability matrix, i.e. the
            # positive class (label 1).
            y_score = y_score[:, -1]

        auc = roc_auc_score(y_true=y_train, y_score=y_score)
        aucs[estimator_type].append(auc)

        print(f" -> AUC={auc:.3f}")

# Summary across seeds. `np.std` is called with its default settings, which
# gives the population standard deviation (ddof=0).
for estimator_type, auc_values in aucs.items():
    print(f"{estimator_type} AUC Mean={np.mean(auc_values):.3f}, SD={np.std(auc_values):.3f}")

Running cross-val for RandomForest, Seed 0 -> AUC=0.600
Running cross-val for SVM, Seed 0 -> AUC=0.580
Running cross-val for RandomForest, Seed 1 -> AUC=0.612
Running cross-val for SVM, Seed 1 -> AUC=0.555
Running cross-val for RandomForest, Seed 2 -> AUC=0.606
Running cross-val for SVM, Seed 2 -> AUC=0.588
Running cross-val for RandomForest, Seed 3 -> AUC=0.606
Running cross-val for SVM, Seed 3 -> AUC=0.561
Running cross-val for RandomForest, Seed 4 -> AUC=0.602
Running cross-val for SVM, Seed 4 -> AUC=0.537
Running cross-val for RandomForest, Seed 5 -> AUC=0.605
Running cross-val for SVM, Seed 5 -> AUC=0.611
Running cross-val for RandomForest, Seed 6 -> AUC=0.605
Running cross-val for SVM, Seed 6 -> AUC=0.543
Running cross-val for RandomForest, Seed 7 -> AUC=0.605
Running cross-val for SVM, Seed 7 -> AUC=0.531
Running cross-val for RandomForest, Seed 8 -> AUC=0.600
Running cross-val for SVM, Seed 8 -> AUC=0.577
Running cross-val for RandomForest, Seed 9 -> AUC=0.604
Running cross-val