In [None]:
import os
import sys
import re
from tempfile import mkdtemp

print(sys.version_info)

home_dir = os.getenv("HOME")
print(os.getenv("PYTHONPATH"))

import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import hashlib

from sklearn import metrics, tree
from sklearn.preprocessing import StandardScaler, Imputer, LabelEncoder
from sklearn.metrics import precision_recall_curve, average_precision_score, brier_score_loss, make_scorer
from sklearn.linear_model import LogisticRegression, ElasticNet, ElasticNetCV, ARDRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib

#from fancyimpute import SoftImpute
from ehr_utils import *
#from xgboost import XGBClassifier

# Configurations
- main_filtered_f - this is the "main" data file, which we assume contains all features needed for prediction

- test_or_case_id_f - this file contains all the OR_CASE_IDs that we want to predict the mortality risk for

- exp_prefix - the name of one of the directories under "paper"; it determines which set of features is used to predict mortality.

- dir_to_save_files - this should usually be the full path to the directory that contains EHR_MAIN_FEATURES.csv and this is where all output results will be saved

- data_dir - this directory contains a file per each categorical variable, and each file contains the allowable values for each variable

- model_dir - this is the directory that contains the binary ML models

In [None]:
# Verbosity: leave False so that IDs are not printed or displayed.
verbose = False

# scikit-learn version that was used to create the model; it is expected to
# appear in the pickled model's file name.
sk_version = "0.21.3"

# Column used as the target variable for the model.
TARGET_VARIABLE = 'INPT_DEATH_YN'

# Inclusive cohort bounds for ASA status and age.
MIN_ASA_STATUS, MAX_ASA_STATUS = 1, 5
MIN_AGE, MAX_AGE = 18, 89

# --- Path settings ---
# Main data file. Alternative inputs kept for reference:
#   "main_Nov21_2017_Feb_13_2018.filtered.main.txt"
#   "/opt/genomics/workingdir/blhill/test_main_3.txt"
#   "/opt/genomics/workingdir/blhill/vali_test.txt"
main_filtered_f = "/opt/data/workingdir/blhill/main_merged_w_akin_spo2.filtered.main.txt"
# File listing the OR_CASE_IDs to predict for.
test_or_case_id_f = "/opt/data/workingdir/blhill/or_case_id_032018.txt"

# Directory containing the code repository.
repo_dir = "/opt/data/workingdir/blhill/code/github/PreopMortalityPrediction"

# The experiment prefix selects the feature set; results are saved under
# <repo_dir>/paper/<exp_prefix>.
exp_prefix = "preop_no_lab_times"
dir_to_save_files = os.path.join(repo_dir, "paper", exp_prefix)

# Directory containing the acceptable values for the categorical variables.
data_dir = os.path.join(repo_dir, "data")

# Directory containing the pickled models. Alternative location:
#   os.path.join("/opt/genomics/workingdir/blhill/mortality_models", exp_prefix)
model_dir = dir_to_save_files

# Create the output directory if needed. exist_ok=True replaces the separate
# existence check (no check-then-create race) and keeps the cell re-runnable.
os.makedirs(dir_to_save_files, exist_ok=True)

# Read in data to data frame

In [None]:
# Read the pipe-delimited main data file; its first row holds the column names.
# NOTE(review): os.path.join ignores home_dir when main_filtered_f is an
# absolute path, as the currently configured value is.
main_data_path = os.path.join(home_dir, main_filtered_f)
df = pd.read_csv(main_data_path, sep="|", header=0)

# Report the frame's shape and its column count.
print(df.shape)
print(df.shape[1])
if verbose:
    display(df.head(20))

# Make sure the target column is boolean.
# NOTE(review): astype(bool) turns NaN and any non-empty string into True --
# confirm the column is coded 0/1 (not e.g. "Y"/"N") in the input file.
df[TARGET_VARIABLE] = df[TARGET_VARIABLE].astype(bool)

# Hash OR_CASE_ID values and keep only the surgeries listed in the test ID file (patients after March 2018)

In [None]:
def get_sha256_hash(x):
    """Return the uppercase hexadecimal SHA-256 digest of ``str(x)``.

    The value is converted to its string form and UTF-8 encoded before hashing.
    """
    encoded = str(x).encode('utf-8')
    return hashlib.sha256(encoded).hexdigest().upper()

# Replace every OR_CASE_ID with the uppercase SHA-256 hex digest of its string form.
df["OR_CASE_ID"] = df["OR_CASE_ID"].apply(get_sha256_hash)

In [None]:
# Read the list of surgeries to test on. The file has no header row, so the
# IDs end up in column 0 of the resulting frame.
test_or_case_ids = pd.read_csv(test_or_case_id_f, header=None)

In [None]:
# Keep only the rows whose OR_CASE_ID appears in the first column of the test ID list.
df = df[df["OR_CASE_ID"].isin(test_or_case_ids.iloc[:,0])]

# drop any rows that are exact copies of another row

In [None]:
# Print the shape before and after removing rows that exactly duplicate another row.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

# read in features file

In [None]:
# Each column of EHR_MAIN_FEATURES.csv is a named list of variables; the
# columns have different lengths, so the NaN padding is dropped per column.
FEATURES_PATH = os.path.join(dir_to_save_files, 'EHR_MAIN_FEATURES.csv')
features_df = pd.read_csv(FEATURES_PATH)

features_dict = {}
for name in features_df.columns:
    features_dict[name] = list(features_df[name].dropna())
print(features_dict.keys())

# Pull the individual lists out of the dictionary by name.
(final_features,
 cat_to_drop,
 outcome_vars,
 feat_to_drop,
 cat_vars,
 contin_vars,
 bool_outcome_vars) = (
    features_dict[key]
    for key in ('final_features', 'cat_to_drop', 'outcome_vars', 'feat_to_drop',
                'cat_vars', 'contin_vars', 'bool_outcome_vars')
)

In [None]:
# Cast the continuous variables to float and the categorical variables to
# object dtype.
df[contin_vars] = df[contin_vars].astype(float)
df[cat_vars] = df[cat_vars].astype(object)
# ASA_STATUS is cast to float last, so it ends up numeric even if it is also
# listed among the categorical variables.
df["ASA_STATUS"] = df["ASA_STATUS"].astype(float)

In [None]:
# Preview the first rows only when verbose output is enabled: the frame still
# holds ID columns at this point, and `verbose = False` is meant to hide IDs.
if verbose:
    display(df.head(10))

In [None]:
# case_list = [553325]
# df = df[df.OR_CASE_ID.isin(case_list)]

In [None]:
# Allow up to 1000 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 1000)

In [None]:
# Show the frame only when verbose output is enabled: it still holds ID
# columns at this point, and `verbose = False` is meant to hide IDs.
if verbose:
    display(df)

In [None]:
# Total number of missing entries across all final feature columns.
df[final_features].isna().values.sum()

In [None]:
# Display the final feature columns for all remaining rows.
df.loc[:, final_features]

In [None]:
#df.apply(lambda x: get_sha256_hash(tuple(x[final_features])), axis = 1)

In [None]:
# Remove HRS_ADMSN_TO_SURGERY from the feature list (nothing to do when it is
# already absent) and make sure it is recorded among the dropped features.
try:
    final_features.remove('HRS_ADMSN_TO_SURGERY')
except ValueError:
    pass

if 'HRS_ADMSN_TO_SURGERY' not in feat_to_drop:
    # list.append mutates the list in place and returns None; assigning its
    # result back would replace feat_to_drop with None.
    feat_to_drop.append('HRS_ADMSN_TO_SURGERY')

# Get acceptable values for categorical variables and filter

In [None]:
def read_acceptable_vals(filename):
    """Read ``filename`` from ``data_dir`` and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; the values are returned as strings, one per line of the file.
    """
    values_path = os.path.join(data_dir, filename)
    with open(values_path) as values_file:
        return [line.strip() for line in values_file]

# Load the acceptable values of each categorical variable, one file per variable.
(pre_surg_location_vals,
 pat_class_vals,
 hcup_code_vals,
 gender_vals,
 case_srv_name_vals) = map(read_acceptable_vals, [
    "PRE_SURG_LOCATION_unique_values.txt",
    "PAT_CLASS_unique_values.txt",
    "HCUP_CODE_unique_values.txt",
    "GENDER_unique_values.txt",
    "CASE_SRV_NAME_unique_values.txt",
])

In [None]:
# Show the acceptable CASE_SRV_NAME values that were read from file.
print(case_srv_name_vals)

In [None]:
# Keep only rows whose categorical values are listed in the acceptable-value
# files, printing the remaining shape after each filter.
df = df[df.PRE_SURG_LOCATION.isin(pre_surg_location_vals)]
print(df.shape)
df = df[df.PAT_CLASS.isin(pat_class_vals)]
print(df.shape)
# The acceptable values are read from a text file as strings, while HCUP_CODE
# is compared as float. Convert them explicitly so the match does not rely on
# pandas implicitly coercing strings to numbers (blank lines are skipped).
hcup_code_floats = [float(val) for val in hcup_code_vals if val]
df = df[df.HCUP_CODE.astype(float).isin(hcup_code_floats)]
print(df.shape)
df = df[df.GENDER.isin(gender_vals)]
print(df.shape)
df = df[df.CASE_SRV_NAME.isin(case_srv_name_vals)]
print(df.shape)

# Filter out surgeries that don't occur in RR or SM operating rooms

In [None]:
# Show which location groups are present, then keep only the RR and SM
# operating rooms and print the shape before and after.
# (Wider alternative: ['RR OR', 'SM OR','SM SC','SM OB OR','RR OB OR'])
print(df.LOCATION_GROUP.unique())
print(df.shape)
in_rr_or_sm_or = df['LOCATION_GROUP'].isin(['RR OR', 'SM OR'])
df = df[in_rr_or_sm_or]
print(df.shape)

# Filter out surgeries that were not INPATIENT, SAME DAY ADMIT, EMERGENCY, or OVERNIGHT RECOVERY

In [None]:
# Keep only the patient classes of interest; the class is stored in the
# PAT_CLASS column (an earlier variant used the name PATIENT_CLASS).
print("Shape before filtering out outpatient surgeries:", df.shape)
kept_pat_classes = ['INPATIENT', 'SAME DAY ADMIT', 'EMERGENCY', 'OVERNIGHT RECOVERY']
is_kept_class = df['PAT_CLASS'].isin(kept_pat_classes)
df = df[is_kept_class]
print("Shape after filtering out outpatient surgeries:", df.shape)

# Filter based on ASA status, age

In [None]:
# ASA status filter: keep rows inside the inclusive [MIN_ASA_STATUS,
# MAX_ASA_STATUS] range. An AttributeError (no ASA_STATUS attribute on the
# frame) skips this filter silently.
try:
    print("Shape before filtering out based on ASA_STATUS:", df.shape)
    print("ASA_STATUS mean:", df.ASA_STATUS.mean())
    asa_status = df["ASA_STATUS"]
    asa_in_range = (asa_status >= MIN_ASA_STATUS) & (asa_status <= MAX_ASA_STATUS)
    df = df[asa_in_range]
    print("Shape after filtering out based on ASA_STATUS:", df.shape)
    print("ASA_STATUS mean:", df.ASA_STATUS.mean())
except AttributeError:
    pass
print("===================================")

# Age filter: report mean/std before and after keeping the inclusive
# [MIN_AGE, MAX_AGE] range.
ages_before = df.AGE_LT_90
print("Mean age:", ages_before.mean())
print("STD age:", ages_before.std())
age_in_range = (ages_before >= MIN_AGE) & (ages_before <= MAX_AGE)
df = df[age_in_range]
ages_after = df.AGE_LT_90
print("Mean age:", ages_after.mean())
print("STD age:", ages_after.std())
print("Shape after filtering out based on AGE_LT_90:", df.shape)

# check demographic distributions

In [None]:
# print "Number of Patients:", df.shape[0]
# print "Patients with in-hospital mortality: {} ({}%)".format(df.INPT_DEATH_YN.value_counts()[1], df.INPT_DEATH_YN.value_counts(normalize="True")[1]*100)
# print "Mean age:", df.AGE_LT_90.mean(), " std:", df.AGE_LT_90.std()
# Count the female patients once, and guard the percentage against an empty
# frame (all rows may have been filtered out above), which would otherwise
# raise ZeroDivisionError.
n_patients = df.shape[0]
n_female = int((df["GENDER"] == "F").sum())
pct_female = n_female / float(n_patients) * 100 if n_patients else float("nan")
print("Number of female patients: {} ({}%)".format(n_female, pct_female))

# try:
#     print "Number of patients in RR OR: {} ({}%)".format(df.LOCATION_GROUP.value_counts()["RR OR"], df.LOCATION_GROUP.value_counts(normalize="True")["RR OR"]*100)
#     print "Number of patients in SM OR: {} ({}%)".format(df.LOCATION_GROUP.value_counts()["SM OR"], df.LOCATION_GROUP.value_counts(normalize="True")["SM OR"]*100)
#     #print "Number of patients in SM SC: {} ({}%)".format(df.LOCATION_GROUP.value_counts()["SM SC"], df.LOCATION_GROUP.value_counts(normalize="True")["SM SC"]*100)
# except AttributeError:
#     pass
# print("="*40)
# try:
#     print "ASA Status:", df.ASA_STATUS.value_counts()
#     print "ASA Status (%):", (df.ASA_STATUS.value_counts()/df.shape[0])*100
# except AttributeError:
#     pass

# print("="*40)
# print "Mean age of mortalities:", df[df["INPT_DEATH_YN"] == 1].AGE_LT_90.mean(), " std:", df[df["INPT_DEATH_YN"] == 1].AGE_LT_90.std()
# print("Number of female mortalities: {} ({}%)".format(df[df["GENDER"] == "F"]["INPT_DEATH_YN"].sum(), 
#                                                      df[df["GENDER"] == "F"]["INPT_DEATH_YN"].sum()/float(df["INPT_DEATH_YN"].sum())*100))
# print("Number of male mortalities: {} ({}%)".format(df[df["GENDER"] == "M"]["INPT_DEATH_YN"].sum(), 
#                                                      df[df["GENDER"] == "M"]["INPT_DEATH_YN"].sum()/float(df["INPT_DEATH_YN"].sum())*100))
# try:
#     print("="*40)
#     print("Number of mortalities stratified by location")
#     print(df.groupby("LOCATION_GROUP")["INPT_DEATH_YN"].sum())                                
#     print(df.groupby("LOCATION_GROUP")["INPT_DEATH_YN"].sum()/float(df["INPT_DEATH_YN"].sum())*100)
# except AttributeError:
#     pass
# print("="*40)
# print("Number of mortalities stratified by ASA status")
# print(df.groupby("ASA_STATUS")["INPT_DEATH_YN"].sum())
# print(df.groupby("ASA_STATUS")["INPT_DEATH_YN"].sum()/float(df["INPT_DEATH_YN"].sum())*100)

# if verbose:
#     print(df["CASE_SRV_NAME"].value_counts())
#     print(df["CASE_SRV_NAME"].value_counts()/df.shape[0])*100
#     print("="*40)
#     print(df[df["INPT_DEATH_YN"] == 1]["CASE_SRV_NAME"].value_counts())
#     print(df[df["INPT_DEATH_YN"] == 1]["CASE_SRV_NAME"].value_counts()/df["INPT_DEATH_YN"].sum())*100

# Remove outlier values

In [None]:
# string_cols = ['PRE_SURG_LOCATION', 'CASE_SRV_NAME_GROUP', 'CASE_SRV_NAME', 'PRIMARY_CPT',
#                                   'GENDER', 'HCUP_DESC', 'CPT_DESC', 'PAT_CLASS', 'OR_CASE_ID', 'ADMSN_ID']
# dff = df.drop(['PRE_SURG_LOCATION', 'CASE_SRV_NAME_GROUP', 'CASE_SRV_NAME', 'PRIMARY_CPT',
#                                   'GENDER', 'HCUP_DESC', 'CPT_DESC', 'PAT_CLASS', 'OR_CASE_ID', 'ADMSN_ID'], axis=1)
# #dff = df.select_dtypes(include=['float64'])
# dff = df[contin_vars]
# df_string_cols = df[df.columns.difference(dff.columns.values)]
# print df_string_cols.columns.values
# #display(dff.describe())
# print (np.abs(st.zscore(dff, axis=1)) > 3)
# #print dff.sub(dff.mean()).div(dff.std()).abs().lt(3)
# df_no_outliers = dff[dff.sub(dff.mean()).div(dff.std()).abs().lt(4)]
# df_no_outliers[df_string_cols.columns.values] = df_string_cols
# if verbose:
#     display(df_no_outliers.describe(include="all"))
# df = df_no_outliers

# Remove variables related to lab times (i.e. *.HRS_2_SURGERY)

In [None]:
#remove variables that have to do with time
# cols_to_keep_no_hrs2surgery = [c for c in df.columns if not c.endswith(".HRS_2_SURGERY")]
# print cols_to_keep_no_hrs2surgery
# print len(cols_to_keep_no_hrs2surgery)
# df=df[cols_to_keep_no_hrs2surgery]

# Remove unnecessary features 

In [None]:
# Save these columns for checking predictions over time; they are captured
# here, before df is narrowed down to the model features.
admsn_surgery_number = df["ADMSN_SURGERY_NUMBER"]
print(admsn_surgery_number.shape)
# Case and admission IDs of the remaining rows, in row order.
or_case_id_number = df["OR_CASE_ID"]
admsn_ids = df['ADMSN_ID']

In [None]:
# Keep only the final feature columns plus the target column.
df = df[final_features + [TARGET_VARIABLE]]

# One-hot encode categorical variables

In [None]:
# One-hot encode the categorical variables one at a time. All k indicator
# columns are kept per variable; the drop_first (k-1 dummies) variant is left
# disabled below.
for var in cat_vars:
    print(var)
    try:
        #df = pd.get_dummies(df, columns=[var], drop_first=True)
        df = pd.get_dummies(df, columns=[var])
    except ValueError:
        pass
    except KeyError:
        print(var, 'already dropped')

# Remove any categorical (string-valued) column that is still present.
for var in cat_vars:
    try:
        df = df.drop(columns=var)
    except (ValueError, KeyError):
        print(var, 'already dropped')

# Remove features we don't want to include

In [None]:
# Drop the columns listed in cat_to_drop, reporting those that are not present.
for cat in cat_to_drop:
    try:
        df = df.drop(columns=cat)
    except KeyError:
        print(cat, 'already dropped')

# List the remaining columns in sorted order together with their dtypes.
for col in sorted(df.columns):
    print(col, "\t\t", df[col].dtype)

# Remove target variables from data frame

In [None]:
print(df[TARGET_VARIABLE].mean())
print("Column names:", df.columns.values)

# Capture the labels BEFORE the outcome columns are dropped. Reading
# INPT_DEATH_YN after the drop always raised KeyError, so input_death_yn was
# never assigned and a spurious "already dropped" message was printed.
y = np.ravel(df[TARGET_VARIABLE])
input_death_yn = df['INPT_DEATH_YN'] if 'INPT_DEATH_YN' in df.columns else None

# Remove the target and every other outcome variable from the feature frame;
# errors='ignore' skips columns that are not present.
df.drop(TARGET_VARIABLE, axis=1, inplace=True, errors='ignore')
df.drop(outcome_vars, axis=1, inplace=True, errors='ignore')

# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
# default strategy: mean
# Report the per-column missing counts and the number of positive labels.
print(df.isnull().sum())
print(y.sum())

# Make sure we have all features that we used to train model

In [None]:
# Read the list of feature names for this experiment. The file has no header
# row, so the names end up in column 0.
final_feature_list = pd.read_csv(os.path.join(dir_to_save_files, exp_prefix + "_final_feature_list.txt"), header=None)
print(final_feature_list.shape)

In [None]:
# Every feature in the final feature list must exist as a column; a feature
# that is absent from the frame is reported and added as an all-NaN column.
for col in final_feature_list[0]:
    if col in df.columns.values:
        continue
    print(col, "missing from dataframe")
    df[col] = np.nan

In [None]:
# Number of entries in the final feature list.
print(final_feature_list[0].shape)

In [None]:
# Column names currently in the frame, before they are reordered to the final feature list.
df.columns.values

In [None]:
# Select exactly the columns named in the final feature list, in that order.
df = df[final_feature_list[0]]

# Standardize the test data using the saved (pickled) scaler

In [None]:
class StandardizeWithNaN(TransformerMixin, BaseEstimator):
    '''Column-wise standardizer for data that contains missing values.

    The per-column mean and standard deviation are computed while ignoring NaN
    entries; transform() subtracts the mean and divides by the standard
    deviation.
    '''

    def __init__(self):
        # Placeholders; fit() replaces them with the per-column statistics.
        self.X_mean, self.X_std = [], []

    def fit(self, X, y=None):
        '''Store the NaN-ignoring mean and standard deviation of every column of X.

        y is accepted for scikit-learn API compatibility and is not used.
        Returns self.
        '''
        column_axis = 0
        self.X_mean = np.nanmean(X, axis=column_axis)
        self.X_std = np.nanstd(X, axis=column_axis)
        return self

    def transform(self, X):
        '''Return X centered on the stored means and scaled by the stored standard deviations.'''
        centered = X - self.X_mean
        return centered / self.X_std

In [None]:
# Use the pickled scaler from model_dir rather than fitting a new
# StandardizeWithNaN on this data (the refit variants were left disabled).
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(os.path.join(model_dir, "StandardizeWithNaN.pkl"), "rb") as scaler_file:
    scaler = pickle.load(scaler_file)
X_test = scaler.transform(df)
y_test = y

# Impute missing values

In [None]:
class SoftImputeEstimator(TransformerMixin, BaseEstimator):
    '''This estimator is for wrapping the SoftImpute algorithm.

    Parameters
    ----------
    max_iters : int
        Passed through to SoftImpute.
    verbose : bool
        Passed through to SoftImpute.

    NOTE(review): SoftImpute is not imported in this notebook (the fancyimpute
    import is commented out), so transform() only works if the name is made
    available some other way, e.g. by the ehr_utils star import -- confirm
    before using this class.
    '''
    def __init__(self, max_iters=200, verbose=True):
        self.max_iters = max_iters
        self.verbose = verbose
        # Call counters, reported each time fit()/transform() runs.
        self.fit_count = 0
        self.transform_count = 0

    def fit(self, X, y=None):
        '''No-op fit: only counts and reports the call. Returns self.'''
        self.fit_count += 1
        print("SoftImputeEstimator fit count: {}".format(self.fit_count))
        return self

    def transform(self, X):
        '''Return X with its missing values filled in by SoftImpute.

        X must provide a pandas-style ``replace`` method. Both +inf and -inf
        are treated as missing. If SoftImpute raises ValueError, X is returned
        unchanged as a NumPy array.
        '''
        self.transform_count += 1
        print("SoftImputeEstimator transform count: {}".format(self.transform_count))
        try:
            # Treat infinities of either sign as missing so they are imputed
            # together with the NaNs.
            X_missing_as_nan = X.replace([np.inf, -np.inf], np.nan)
            return SoftImpute(max_iters=self.max_iters, verbose=self.verbose).complete(X_missing_as_nan)
        # ValueError raised if no values need to be imputed
        except ValueError:
            return np.array(X)

In [None]:
print("imputing X_test")
# Load the pickled mean imputer (used here in place of SoftImputeEstimator).
# The `with` block closes the file handle after loading.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(os.path.join(model_dir, "MeanImputer.pkl"), "rb") as imputer_file:
    si = pickle.load(imputer_file)
# Where the stored fill value of a column is NaN, use 0 as the fill value instead.
si.statistics_[np.isnan(si.statistics_)] = 0.
print(si.statistics_)
# Standardizing divides by the column standard deviation, so a zero deviation
# can produce -inf as well as +inf; map both to NaN so they are imputed too.
X_test = si.transform(X_test.replace([np.inf, -np.inf], np.nan))

# Load model(s)

In [None]:
# Load the pickled model(s) into a list. The loop runs once; it keeps the shape
# of an earlier per-fold version that stored models under "fold_<i>" keys.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
models = []
for i in range(1):
    model_file_name = "Random Forest_train_sk{}.pkl".format(sk_version)
    # The `with` block closes the file handle once the model is loaded.
    with open(os.path.join(model_dir, model_file_name), "rb") as model_file:
        model = pickle.load(model_file, encoding='latin1')
    print("Loaded", model_file_name)
    models.append(model)

In [None]:
# Display the first loaded model.
models[0]

# Predict classes and get probability of labels

In [None]:
# For every loaded model, collect the predicted classes and the class
# probabilities for the test data (one list entry per model).
model_predictions = []
model_probs = []
for model in models:
    model_predictions.append(model.predict(np.array(X_test)))
    model_probs.append(model.predict_proba(X_test))

In [None]:
# model_probs_df = pd.DataFrame.from_dict({k: probs[:,1] for k, probs in model_probs.items()})
# model_probs_df.std(axis=1).plot()
# plt.show()

# If we have true labels, see how well the model did 

In [None]:
# Display names for the loaded models, passed to the plotting helpers.
model_names = ["Random Forest"]

In [None]:
#plot_cross_val_roc_curve(model_probs, y_test, "test_cross_val_roc.png")

In [None]:
#plot_accuracy_roc_auc(models, model_names, model_predictions, model_probs, y_train, y_test, "test_accuracy_roc_auc.tif")

In [None]:
# Plot the ROC curve on the test labels; the last argument is the output image path.
# NOTE(review): plot_roc_curve is not defined in this notebook -- presumably it
# comes from the ehr_utils star import; confirm.
plot_roc_curve(models, model_names, model_probs, y_test, os.path.join(dir_to_save_files,"test_roc_curve.png"))

In [None]:
# Plot the precision-recall curve on the test labels; the last argument is the output image path.
# NOTE(review): plot_precision_recall_curve is not defined in this notebook --
# presumably it comes from the ehr_utils star import; confirm.
plot_precision_recall_curve(models, model_names, model_probs, y_test, os.path.join(dir_to_save_files, "test_precision_recall_curve.png"))

In [None]:
# Show both lengths side by side: a notebook cell only displays its last
# expression, so the first length was computed and silently discarded.
len(model_probs[0]), len(or_case_id_number)

In [None]:
# When verbose, print one (second-column probability, OR_CASE_ID, label) tuple per row.
if verbose:
    second_column_probs = model_probs[0][:,1]
    for i in zip(second_column_probs, or_case_id_number, y):
        print(i)

In [None]:
# Histogram of the second-column probabilities for the rows whose label is
# True (swap the mask to `y == False` for the other group).
probs_label_true = model_probs[0][y == True,1]
fig, ax = plt.subplots()
ax.hist(probs_label_true, bins=20)
ax.set_xlabel("Probability")
ax.set_ylabel("Patient Count")
plt.show()

In [None]:
# Mean and standard deviation of the second-column probabilities, first for
# the rows labelled False and then for the rows labelled True.
second_column_probs = model_probs[0][:,1]
for label in (False, True):
    group_probs = second_column_probs[y == label]
    print(group_probs.mean())
    print(group_probs.std())