Import Packages

In [1]:
# load data
import matplotlib.pyplot

# filesystem paths
from pathlib import Path

# Add directory above current directory to path
import sys; sys.path.insert(0, '..')
#from submodules.load_data import load_data

# data manipulation
import pandas as pd

# data splitting
from sklearn.model_selection import train_test_split

# data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# model
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# k-fold cross validation
from sklearn.model_selection import cross_validate

# serializing, compressing, and loading the models
import joblib

# performance
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt

# displaying plots
from IPython import display
import matplotlib.image as mpimg
import glob
from skimage.util import montage
import numpy as np

  from pandas import MultiIndex, Int64Index


Load the data

In [3]:
# Read the labeled patient vital-signs table directly from its CSV export.
# (Function-based alternative, currently disabled: data = load_data())
csv_path = "../../data/dataSepsis/csv_format/pat_vitals_labeled-dataSepsis.csv"
data = pd.read_csv(filepath_or_buffer=csv_path, sep=",")

Split the Dataset

In [9]:
# Hold out 20% of the rows as a test set.
#   X (features): every column of `data` except the ones listed in excluded_columns
#   y (labels):   the "isSepsis" column
#   X_train / y_train: the training part of the features / labels
#   X_test  / y_test:  the held-out test part of the features / labels
# random_state fixes the shuffle so the same split is produced on every run.
# stratify keeps the proportion of isSepsis values the same in the train and test sets.
excluded_columns = ["patient_id", "record_date", "record_time", "EtCO2", "isSepsis"]
sepsis_features = data.drop(columns=excluded_columns)
sepsis_labels = data["isSepsis"]

X_train, X_test, y_train, y_test = train_test_split(
    sepsis_features,
    sepsis_labels,
    test_size=0.20,
    random_state=42,
    stratify=sepsis_labels,
)

Clean the data
1. Transform missing values
    - impute mean, median or other calculation for missing attributes
1. Scale the data
    - ML algorithms don't work well when numeric attributes have very different scales (e.g. HR max 184, pH max 7.67)
    - normalization (MinMaxScaler) bounds the values to a specific range (e.g. 0-1)
    - standardization (StandardScaler) is less affected by outliers and does not bound values to a fixed range

Instead of preparing data manually, write functions to:

1. reproduce transformations easily on any dataset (e.g., data refresh)
1. build a library of functions to reuse in future projects
1. use the functions on a live data stream to transform new data before inference

In [12]:
# Median imputation: every missing value is replaced by the median of its column.
imputer = SimpleImputer(strategy="median")

# Learn the per-column medians from the training features (kept on the imputer
# as imputer.statistics_) and show the imputed training array as the cell output.
# The fitted imputer can later fill nulls in new data once the system goes live.
imputer.fit(X_train).transform(X_train)

array([[ 61.  ,  98.5 ,  36.9 , ...,  73.  ,  63.  ,  19.  ],
       [ 76.  ,  98.  ,  36.2 , ...,  86.33,  62.  ,  24.  ],
       [ 75.  ,  99.  ,  36.3 , ...,  70.  ,  62.  ,  16.  ],
       ...,
       [ 83.  ,  95.  ,  38.2 , ...,  66.  ,  48.  ,  18.  ],
       [ 75.  , 100.  ,  38.2 , ...,  74.  ,  56.  ,  13.  ],
       [ 87.  , 100.  ,  37.8 , ...,  59.33,  62.  ,  26.  ]])

In [13]:
# Re-use the already fitted imputer: fill the gaps in the training features
# with the medians it learned. The result is a plain NumPy array.
N = imputer.transform(X_train)

# Wrap the array in a DataFrame again, restoring the original column names
# and row index, then preview the first rows.
M = pd.DataFrame(data=N, index=X_train.index, columns=X_train.columns)
M.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp
26908,61.0,98.5,36.9,108.0,73.0,63.0,19.0
5174,76.0,98.0,36.2,145.0,86.33,62.0,24.0
15997,75.0,99.0,36.3,96.0,70.0,62.0,16.0
13058,72.0,100.0,36.1,102.0,64.0,49.0,18.0
23132,79.0,95.0,38.3,119.0,92.0,80.0,13.0


In [15]:
# Two scaling options:
#   - normalization (MinMaxScaler) bounds the values to a specific range (e.g. 0-1)
#   - standardization (StandardScaler) is less affected by outliers and does not bound to a range
# Standardization is the one applied here.
scaler = StandardScaler()

# Fit the scaler on the imputed training array and scale it, then put the
# result back into a DataFrame with the original labels for a quick preview.
O = scaler.fit(N).transform(N)
P = pd.DataFrame(data=O, index=X_train.index, columns=X_train.columns)
P.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp
26908,-0.740025,0.374434,-0.400639,-0.641028,-0.551583,-0.028341,0.190379
5174,-0.17614,0.192386,-0.934153,0.998721,0.252568,-0.109121,1.157798
15997,-0.213733,0.556483,-0.857937,-1.172838,-0.732562,-0.109121,-0.390072
13058,-0.32651,0.92058,-1.010369,-0.906933,-1.094521,-1.15926,-0.003105
23132,-0.063364,-0.899906,0.66639,-0.153535,0.594619,1.344918,-0.970524


Create the pipeline
- It is common to apply several transformation steps in a specific order (e.g. fill the nulls before applying the scaling)

In [18]:
# Preprocessing intended to be shared by all the estimators/algorithms:
# median imputation first, then standardization.
standard_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
pipeline = Pipeline(steps=standard_steps)

# Fit both steps on the training features; the transformed result is the
# data the models are trained on.
X_train_prepared = pipeline.fit_transform(X_train)

In [19]:
# Variant for neural networks, which sometimes expect inputs on a 0-1
# normalized scale and perform better with it: median imputation, then min-max scaling.
minmax_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('minMax', MinMaxScaler()),
]
pipeline_minmax = Pipeline(steps=minmax_steps)

# Fit on the training features; this transformed data is used to train the MLP.
X_train_prepared_m = pipeline_minmax.fit_transform(X_train)

In [None]:
# Compress and save both fitted preprocessing pipelines so the same
# transformations can be re-applied later.
# NOTE(review): the CSV is read from "../../data/..." while these files are
# written under "../data/..." - confirm this output directory is the intended one.
transform_dir = Path("../data/transform")

# joblib.dump does not create missing directories; without this the cell
# fails with FileNotFoundError on a fresh checkout.
transform_dir.mkdir(parents=True, exist_ok=True)

# compress defaults to 0 (no compression), so request it explicitly;
# joblib.load detects the compression transparently when reading the file back.
joblib.dump(pipeline, str(transform_dir / "pipeline.pkl"), compress=3)
joblib.dump(pipeline_minmax, str(transform_dir / "pipeline_minmax.pkl"), compress=3)