Baseline Modeling

Derived from B02: ML Examples

In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [50]:
# Read the pickled training frame; the path is relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
training_pickle = 'demo_training.pkl'
df = pd.read_pickle(training_pickle)

In [51]:
# Columns to one-hot encode, listed once so the selection and the drop below
# cannot drift apart. The outcome column hospital_expire_flag is encoded here
# too; the later cells read its indicator as 'hospital_expire_flag_1.0'.
categorical_cols = ['first_wardid', 'first_careunit', 'hospital_expire_flag',
                    'admission_type', 'admission_location', 'insurance', 'diagnosis']

cat_vars = df[categorical_cols]
# drop_first=True omits the first level of every encoded variable.
cat_dummies = pd.get_dummies(cat_vars, drop_first=True)

# Swap the raw categorical columns for their indicator columns.
# df is rebound here, so re-running this cell without reloading the data
# raises a KeyError (the raw columns are already gone).
df = df.drop(categorical_cols, axis=1)
df = pd.concat([df, cat_dummies], axis=1)

In [52]:
# Separate the outcome indicator (y) from the remaining predictor columns (X).
target_col = 'hospital_expire_flag_1.0'
y = df[target_col]
X = df.drop(columns=[target_col])

In [53]:
# Hold out a test set with the library's default test size; stratifying on y
# keeps the class proportions the same in both partitions, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1125,
)

In [54]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions. X_train / X_test are rebound to
# the scaled versions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [55]:
# Relative frequency of each class in the held-out labels; the share of the
# largest class is the accuracy a majority-class guesser would achieve.
test_class_shares = pd.Series(y_test).value_counts(normalize=True)
test_class_shares

0    0.676471
1    0.323529
Name: hospital_expire_flag_1.0, dtype: float64

In [65]:
# Baseline classifiers keyed by the short label printed next to each score.
# The name `sl_modles` (sic) is kept because the following cell reads it.
#
# The randomised learners (MLP, random forest, XGBoost) are seeded so the
# reported scores are reproducible after a kernel restart; the seed is the
# same value the train/test split uses.
model_seed = 1125
sl_modles = dict(
    dummy=DummyClassifier(strategy='prior'),  # predicts the most frequent class
    knn=KNeighborsClassifier(),
    lr=LogisticRegression(),
    svc=SVC(),
    nn=MLPClassifier(max_iter=500, random_state=model_seed),
    clf=RandomForestClassifier(n_estimators=100, random_state=model_seed),
    xgb=XGBClassifier(random_state=model_seed),
)

In [66]:
# Fit every baseline on the scaled training data and print its score on the
# held-out set, formatted to three decimals.
# NOTE(review): several models report 1.000 on the test set; that is
# suspicious and may indicate a feature leaking the outcome. TODO confirm.
for name, clf in sl_modles.items():
    # fit() returns the estimator itself, so scoring can be chained.
    score = clf.fit(X_train, y_train).score(X_test, y_test)
    print(f'{name}: {score:.3f}')

dummy: 0.676
knn: 0.971
lr: 1.000
svc: 0.971
nn: 1.000
clf: 1.000
xgb: 1.000
