In [1]:
import pandas as pd
import pickle

from sklearn.preprocessing import RobustScaler

# Features

In [2]:
# Columns rescaled with RobustScaler below; all other columns are left as loaded.
to_standardize = ['age', 'chloride_input_meq', 'fluid_net_input_ml', 'heartrate_max',
                  'sysbp_min', 'diasbp_min', 'resprate_max', 'weight', 'mingcs', 'day_1_chl']


def _load_features(path):
    """Read one feature CSV and apply the cleaning shared by both splits.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by (subject_id, hadm_id, icustay_id), without the
        `sodium_max` / `bicarbonate_min` columns, with `chloride_max` renamed
        to `day_1_chl` and every missing value replaced by 0.

    Raises
    ------
    ValueError
        If the (subject_id, hadm_id, icustay_id) key is duplicated
        (`verify_integrity=True`).
    """
    features = pd.read_csv(path)
    features = features.set_index(['subject_id', 'hadm_id', 'icustay_id'], verify_integrity=True)
    features = features.drop(columns=['sodium_max', 'bicarbonate_min'])
    # Renamed so that a later `chloride_max` outcome column cannot collide with it.
    features = features.rename(columns={'chloride_max': 'day_1_chl'})
    # Intended to fill comorbidities.
    # NOTE(review): fillna(0) zero-fills *every* column, including any missing
    # values in the columns that are scaled below -- confirm this is intended.
    return features.fillna(0)


df_train = _load_features('../data_collection/train.csv')
df_test = _load_features('../data_collection/test.csv')
print("Rows in df_train: %s" % len(df_train))
print("Rows in df_test: %s" % len(df_test))

# Compare the column labels themselves (and their order), not only how many
# there are: two splits with the same number of differently named or ordered
# columns are just as inconsistent.
if list(df_train.columns) != list(df_test.columns):
    raise ValueError("Inconsistent feature columns")

# Fit on the training split only and reuse the fitted scaler for the test split.
scaler = RobustScaler().fit(df_train[to_standardize])
df_train[to_standardize] = scaler.transform(df_train[to_standardize])
df_test[to_standardize] = scaler.transform(df_test[to_standardize])

print("Features: %s" % len(df_train.columns))

Rows in df_train: 23330
Rows in df_test: 9999
Features: 36


# Outcomes
Use day-2 outcomes: hyperchloremia is defined as a maximum chloride of at least 110 on ICU day 2.

In [3]:
# Chloride data from day 2
df = pd.read_csv('../data_collection/chloride.csv')
# Keep only ICU day 2 rows that actually have a chloride measurement.
# A plain boolean mask is used instead of a `query` string so the filter does
# not depend on the expression parser accepting a method call.
df = df[(df['icu_day'] == 2) & df['chloride_max'].notnull()]
# verify_integrity=True raises if a stay has more than one day-2 row.
df = df.set_index(['subject_id', 'hadm_id', 'icustay_id'], verify_integrity=True)
# Explicit copy so that adding the label column never writes into a view of `df`.
df_chloride = df[['chloride_max']].copy()
# Binary outcome: 1 when the day-2 maximum chloride is at least 110, else 0.
df_chloride['chl_110'] = (df_chloride['chloride_max'] >= 110).astype(int)
print("Rows in df_chloride: %s" % len(df_chloride))

Rows in df_chloride: 35942


In [4]:
# Attach the day-2 outcome columns to each split; a left join keeps every
# stay, leaving NaN where `df_chloride` has no row for it.
# NOTE(review): the join also brings in the raw day-2 `chloride_max` column
# (NaN where missing) -- make sure downstream modelling does not treat it as a feature.
df_train = df_train.join(df_chloride, how='left')
df_test = df_test.join(df_chloride, how='left')

print("Training rows w/ no chloride data on day 2: %s" % df_train['chl_110'].isna().sum())
print("Testing rows w/ no chloride data on day 2: %s" % df_test['chl_110'].isna().sum())
# Stays without a day-2 chloride value are labelled as not hyperchloremic (0).
df_train = df_train.fillna({'chl_110': 0})
df_test = df_test.fillna({'chl_110': 0})


def _outcome_summary(label, n_rows, n_positive):
    """Format one line of the class-balance report.

    Parameters
    ----------
    label : str
        Name of the data set, left-aligned in a 12-character field.
    n_rows : int
        Number of rows in the data set.
    n_positive : int
        Number of rows with `chl_110 == 1`.

    Returns
    -------
    str
        The formatted line; the percentage is reported as 0.00 for an empty
        data set instead of dividing by zero.
    """
    percent = 100 * float(n_positive) / n_rows if n_rows else 0.0
    return "%-12s: %5s rows (%4s/%.2f%% hyperchloremic on day 2)" % (
        label, n_rows, n_positive, percent)


# Vectorised Series.sum() computed once per split, instead of repeatedly
# iterating each Series with the built-in sum().
n_train, n_test = len(df_train), len(df_test)
pos_train = int(df_train['chl_110'].sum())
pos_test = int(df_test['chl_110'].sum())

print(_outcome_summary('Training set', n_train, pos_train))
print(_outcome_summary('Testing set', n_test, pos_test))
print(_outcome_summary('Total set', n_train + n_test, pos_train + pos_test))

Training rows w/ no chloride data on day 2: 8459
Testing rows w/ no chloride data on day 2: 3513
Training set: 23330 rows (1362/5.84% hyperchloremic on day 2)
Testing set :  9999 rows ( 630/6.30% hyperchloremic on day 2)
Total set   : 33329 rows (1992/5.98% hyperchloremic on day 2)


In [5]:
# Write the prepared splits (features plus outcome columns) next to the notebook.
for frame, csv_name in ((df_train, 'train_data.csv'), (df_test, 'test_data.csv')):
    frame.to_csv(csv_name)

# Store the fitted scaler together with the list of columns it was fitted on,
# as a single (columns, scaler) tuple.
scaler_bundle = (to_standardize, scaler)
with open('./error_analysis/scaler.pickle', 'wb') as handle:
    pickle.dump(scaler_bundle, handle)