Baseline Modeling

Derived from B02: ML Examples

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the training data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever come from a trusted source.
df = pd.read_pickle('demo_training.pkl')

In [3]:
# Categorical columns to one-hot encode. The outcome column
# 'hospital_expire_flag' is encoded together with the features; with
# drop_first=True its first level is dropped, and the remaining indicator
# ('hospital_expire_flag_1.0' in the column listing below) is what the next
# cell uses as the target.
cat_cols = ['first_wardid', 'first_careunit', 'hospital_expire_flag',
            'admission_type', 'admission_location', 'insurance', 'diagnosis']

# This cell replaces `df`, so a second run would find the raw columns already
# gone. Fail with an actionable message instead of an opaque lookup error.
missing_cols = [col for col in cat_cols if col not in df.columns]
if missing_cols:
    raise KeyError(
        f'{missing_cols} not found in df; re-run the data-loading cell '
        'before encoding again'
    )

cat_vars = df[cat_cols]
cat_dummies = pd.get_dummies(cat_vars, drop_first=True)

# Swap the raw categorical columns for their indicator columns.
df = pd.concat([df.drop(columns=cat_cols), cat_dummies], axis=1)
df.columns

Index(['los', 'ACET325', 'CALG1I', 'D5W1000', 'D5W250', 'FURO40I', 'HEPA5I',
       'INSULIN', 'KCL20P', 'KCL20PM', 'KCLBASE2', 'LR1000', 'MAG2PM',
       'METO25', 'MORP2I', 'NACLFLUSH', 'NS1000', 'NS250', 'NS500', 'VANC1F',
       'VANCOBASE', 'Dialysis', 'Imaging', 'Intubation/Extubation',
       'Invasive Lines', 'Peripheral Lines', 'Procedures',
       'Significant Events', 'Ventilation', 'first_wardid_52',
       'first_wardid_Other', 'first_careunit_CSRU', 'first_careunit_MICU',
       'first_careunit_SICU', 'first_careunit_TSICU',
       'hospital_expire_flag_1.0', 'admission_type_Other',
       'admission_type_URGENT', 'admission_location_EMERGENCY ROOM ADMIT',
       'admission_location_Other',
       'admission_location_PHYS REFERRAL/NORMAL DELI',
       'admission_location_TRANSFER FROM HOSP/EXTRAM', 'insurance_Medicaid',
       'insurance_Medicare', 'insurance_Other', 'insurance_Private',
       'diagnosis_CV Failure', 'diagnosis_Organ Failure', 'diagnosis_Other',
       '

In [4]:
# Separate the outcome indicator from the feature matrix.
target_col = 'hospital_expire_flag_1.0'
y = df[target_col]
X = df.drop(columns=[target_col])

In [5]:
# Hold-out split, stratified on the target so both splits keep the same class
# proportions. The split size is left at the library default; the seed is
# fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1125,
)

In [6]:
# Learn the per-feature scaling parameters from the training split only, then
# apply that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [7]:
# Class proportions of the held-out labels (y_test is already a Series, so no
# re-wrapping is needed).
y_test.value_counts(normalize=True)

0    0.676471
1    0.323529
Name: hospital_expire_flag_1.0, dtype: float64

In [8]:
# Candidate baseline classifiers, keyed by the short label used when reporting
# scores. The variable name (sic) is kept because the fitting loop refers to it.
sl_modles = dict(
    # Reference model that predicts from the class prior only.
    dummy = DummyClassifier(strategy='prior'),
    knn = KNeighborsClassifier(),
    lr = LogisticRegression(),
    svc = SVC(),
    # The MLP's weight initialisation and batch sampling are random; pin the
    # seed (same value as the train/test split) so its score is reproducible.
    nn = MLPClassifier(max_iter=500, random_state=1125),
)

In [9]:
# Fit each baseline on the scaled training split and print its score on the
# held-out split, formatted to three decimals.
# NOTE(review): the scores saved in this notebook's output are near-perfect for
# several models; worth checking the feature columns for target leakage and
# the size of the hold-out set — TODO confirm.
for name, clf in sl_modles.items():
    # fit() returns the estimator itself, so scoring can be chained onto it.
    score = clf.fit(X_train, y_train).score(X_test, y_test)
    print(f'{name}: {score:.3f}')

dummy: 0.676
knn: 0.971
lr: 1.000
svc: 0.971
nn: 1.000
