# Supervised Learning of Actions - Logistic Regression
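This notebook fits a multi-class logistic regression model that predicts the action taken (a joint bin of IV fluid input and vasopressor dose) from the pre-processed patient state in the MIMIC sepsis data.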

In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# Directory that holds the input CSV and receives the exported artefacts.
# NOTE: the value ends with '/', and later cells append paths starting with '/',
# so the resulting paths contain '//' (harmless on POSIX file systems).
exportdir='/data/localhost/taufiq/export-dir/'

In [2]:
import logging
# Send log records of level INFO and above to 'logistic-regression.log'
# (path is relative to the current working directory).
logging.basicConfig(filename='logistic-regression.log', level=logging.INFO)

In [3]:
# Size of one treatment dimension; the joint action space is nra**2 (see `nact`).
nra = 5
# Number of random train/test repetitions; one classifier is fitted per repetition.
nr_reps = 50

In [4]:
# Load the exported 1-hourly MIMIC table and preview its first rows.
mimic_csv_path = exportdir + '/MIMIC-1hourly-length-2.csv'
MIMICtable = pd.read_csv(mimic_csv_path)
MIMICtable.head()

Unnamed: 0,bloc,icustay_id,charttime,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,mortality_90d,...,mechvent,Shock_Index,PaO2_FiO2,median_dose_vaso,max_dose_vaso,input_total,input_1hourly,output_total,output_1hourly,cumulated_balance
0,1,65537,7139723000.0,0.0,22799.40478,4.0,1.0,0.0,0.0,0.0,...,0.0,0.6875,363.888889,0.18,0.18,4020.49355,29.994183,7235.0,3605.0,-3184.512267
1,2,65537,7139727000.0,0.0,22799.40478,4.0,1.0,0.0,0.0,0.0,...,0.0,0.602564,430.555556,0.0,0.0,4050.487733,515.407667,10760.0,3525.0,-6194.1046
2,1,3,7245400000.0,0.0,17639.826435,0.0,0.0,0.0,0.0,1.0,...,0.0,0.6639,599.999991,0.0,0.0,6267.0,0.0,9490.0,4705.0,-3223.0
3,2,3,7245403000.0,0.0,17639.826435,0.0,0.0,0.0,0.0,1.0,...,0.0,0.67234,339.999995,0.0,0.0,6267.0,10.0,14095.0,4605.0,-7818.0
4,1,65544,7056917000.0,0.0,24000.4364,3.0,0.0,0.0,0.0,0.0,...,0.0,0.770925,938.636364,0.0,0.0,11332.0,0.0,8210.0,4105.0,3122.0


In [5]:
#################   Convert training data and compute conversion factors    ######################
logging.info('Processing raw data')

# 44 state columns are selected here: 3 binary, 32 standardised as-is and
# 9 standardised after a log transform.
colbin = ['gender', 'mechvent', 're_admission']
colnorm = [
    'age', 'Weight_kg', 'GCS', 'HR', 'SysBP', 'MeanBP', 'DiaBP', 'RR', 'Temp_C', 'FiO2_1',
    'Potassium', 'Sodium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium',
    'Hb', 'WBC_count', 'Platelets_count', 'PTT', 'PT', 'Arterial_pH', 'paO2', 'paCO2',
    'Arterial_BE', 'HCO3', 'Arterial_lactate', 'SOFA', 'SIRS', 'Shock_Index', 'PaO2_FiO2',
    'cumulated_balance',
]
collog = ['SpO2', 'BUN', 'Creatinine', 'SGOT', 'SGPT', 'Total_bili', 'INR',
          'output_total', 'output_1hourly']

# Work on a copy so the raw table stays untouched.
MIMICraw = MIMICtable[colbin + colnorm + collog].copy()


def _standardise(values):
    """Return *values* minus their mean, divided by their standard deviation."""
    return (values - values.mean()) / values.std()


# Binary indicators: subtract 0.5 (a 0/1 column becomes -0.5/+0.5).
for name in colbin:
    MIMICraw[name] = MIMICraw[name] - 0.5

# Columns standardised directly.
for name in colnorm:
    MIMICraw[name] = _standardise(MIMICraw[name])

# Columns standardised on the log scale; the 0.1 offset keeps log() finite at zero.
for name in collog:
    MIMICraw[name] = _standardise(np.log(0.1 + MIMICraw[name]))

logging.info('Raw data processed')

## Include the information for doses taken until time $t$

In [6]:
logging.info('Adding input values to X')

# Add three features describing the treatment recorded on the previous row of the
# SAME ICU stay. A row whose predecessor belongs to another stay (or that has no
# predecessor at all) gets 0.0.
# shift() moves every value one row down, so this replaces a row-by-row iterrows()
# loop that relied on `index + 1` label arithmetic.
stay_ids = MIMICtable['icustay_id']
continues_stay = stay_ids.eq(stay_ids.shift())

MIMICraw['last_input'] = MIMICtable['input_1hourly'].shift().where(continues_stay, 0.0)
MIMICraw['total_input_before'] = MIMICtable['input_total'].shift().where(continues_stay, 0.0)
MIMICraw['last_vaso_dose'] = MIMICtable['max_dose_vaso'].shift().where(continues_stay, 0.0)


def log_normalise(colname):
    """Standardise column *colname* of the module-level MIMICraw on the log scale.

    The column is replaced in place by (log(0.1 + x) - mean) / std, where mean and
    std are taken over the log-transformed column.

    Args:
        colname: name of an existing MIMICraw column.
    """
    log_values = np.log(0.1 + MIMICraw[colname])
    dmu = log_values.mean()
    dsigma = log_values.std()
    MIMICraw[colname] = (log_values - dmu)/dsigma


log_normalise('last_input')
log_normalise('total_input_before')
# NOTE(review): the vaso dose takes non-binary values (e.g. 0.18), yet it is only
# shifted by 0.5 like the binary columns - confirm this is the intended scaling.
MIMICraw['last_vaso_dose'] = MIMICraw['last_vaso_dose'] - 0.5

logging.info('Input values added to X')

In [7]:
# Write the pre-processed feature matrix to disk (without the row index) and preview it.
features_csv_path = exportdir + '/MIMICraw-logistic_reg-1hourly.csv'
MIMICraw.to_csv(features_csv_path, index=False)
MIMICraw.head()

Unnamed: 0,gender,mechvent,re_admission,age,Weight_kg,GCS,HR,SysBP,MeanBP,DiaBP,...,Creatinine,SGOT,SGPT,Total_bili,INR,output_total,output_1hourly,last_input,total_input_before,last_vaso_dose
0,-0.5,-0.5,0.5,-0.188164,0.977001,0.373302,-0.575778,-0.480229,0.084172,-0.02942,...,-0.258526,4.799837,5.034709,2.516924,-0.69271,0.24821,0.323737,-0.466274,-0.713778,-0.5
1,-0.5,-0.5,0.5,-0.188164,0.645357,0.673538,-0.934044,-0.255514,-0.047822,0.309388,...,-1.255075,-1.15138,-0.792918,-0.71189,0.538002,0.380034,0.315833,1.833942,1.404393,-0.32
2,-0.5,-0.5,-0.5,-1.051718,-0.629896,-0.82764,-0.410425,-0.098214,0.249165,0.444911,...,0.592298,0.646722,0.730208,1.169624,0.160168,0.33832,0.417525,-0.466274,-0.713778,-0.5
3,-0.5,-0.5,-0.5,-1.051718,-0.140325,-0.82764,-0.465543,-0.233043,0.183168,0.37715,...,-0.993943,-0.561408,-0.114526,-0.386553,-0.925742,0.469704,0.409959,-0.466274,1.493079,-0.5
4,-0.5,-0.5,-0.5,0.012852,-0.329836,0.673538,0.002959,-0.412815,-0.608798,-0.673156,...,-0.993943,-0.149953,-0.300658,-1.205811,0.538002,0.290199,0.36948,-0.466274,-0.713778,-0.5


## Create Actions


In [9]:
from sklearn.cluster import KMeans
from scipy.stats import rankdata

logging.info('Creating action bins')
nact = nra**2

# --- IV input: level 1 = no fluid, levels 2..5 = rank-based quarter of the non-zero inputs ---
input_1hourly_nonzero = MIMICtable.loc[MIMICtable['input_1hourly']>0, 'input_1hourly']
iol_ranked = rankdata(input_1hourly_nonzero)/len(input_1hourly_nonzero) # excludes zero fluid (will be action 1)
iof = np.floor((iol_ranked + 0.2499999999)*4) # converts iv volume in 4 actions
io = np.ones(len(MIMICtable)) # array of ones, by default
io[MIMICtable['input_1hourly']>0] = iof + 1 # where more than zero fluid given: save actual action

# --- Vaso dose: level 1 = zero dose, levels 2..5 = rank-based quarter of the non-zero doses ---
vc = MIMICtable['max_dose_vaso'].copy()
vc_nonzero = MIMICtable.loc[MIMICtable['max_dose_vaso']!=0, 'max_dose_vaso']
vc_ranked = rankdata(vc_nonzero)/len(vc_nonzero)
vcf = np.floor((vc_ranked + 0.2499999999)*4) # converts to 4 bins
vcf[vcf==0] = 1  # guard kept from the original; ranks start at 1, so the floor is normally >= 1
vc[vc!=0] = vcf + 1
vc[vc==0] = 1

# median dose of drug in all bins (position k-1 holds the median of level k)
ma1 = [MIMICtable.loc[io==level, 'input_1hourly'].median() for level in range(1, nra + 1)]
ma2 = [MIMICtable.loc[vc==level, 'max_dose_vaso'].median() for level in range(1, nra + 1)]

med = pd.DataFrame(data={'IV':io, 'VC': vc})
med = med.astype({'IV': 'int32', 'VC': 'int32'})

# Distinct (IV, VC) level pairs in order of first appearance; that order defines the action ids.
uniqueValues = med.drop_duplicates().reset_index(drop=True)

# Median doses belonging to each distinct pair (same row order as uniqueValues).
uniqueValueDoses = pd.DataFrame({
    'IV': [ma1[level - 1] for level in uniqueValues['IV']],
    'VC': [ma2[level - 1] for level in uniqueValues['VC']],
})

# Action id = 1-based position of the row's (IV, VC) pair in uniqueValues.
# A dict lookup replaces the per-row boolean search over uniqueValues.
action_of = {
    pair: number
    for number, pair in enumerate(zip(uniqueValues['IV'], uniqueValues['VC']), start=1)
}
actionbloc = pd.DataFrame(
    {'action_bloc': [action_of[pair] for pair in zip(med['IV'], med['VC'])]},
    index=med.index,
)
actionbloc = actionbloc.astype({'action_bloc':'int32'})

logging.info('Action bins created')

## Fitting models

In [12]:
from sklearn.linear_model import LogisticRegression
icuuniqueids = MIMICtable['icustay_id'].unique()

logging.info('Fitting models')

# Stays drawn into group `crossval` (out of 5) form the held-out test set.
crossval = 1
# One record per repetition; collected in a list because DataFrame.append copies the
# whole frame on every call and no longer exists in pandas >= 2.0.
fitted_models = []

# NOTE(review): np.random is not seeded, so the splits - and the `clf` / `testidx`
# left over from the last repetition - differ between runs.
for model in tqdm(range(nr_reps)):
    logging.info('Model: ' + str(model))
    # Assign whole ICU stays (not single rows) to one of 5 random groups.
    grp = np.floor(5*np.random.rand(len(icuuniqueids))+1)
    trainidx = icuuniqueids[grp != crossval]
    testidx = icuuniqueids[grp == crossval]
    # Row masks are computed once and reused for every table below.
    in_train = MIMICtable['icustay_id'].isin(trainidx)
    in_test = MIMICtable['icustay_id'].isin(testidx)
    X = MIMICraw.loc[in_train]
    Xtestmimic = MIMICraw[in_test]
    # The next four are not used by the fit; kept because they are notebook-level variables.
    blocs = MIMICtable.loc[in_train, 'bloc']
    bloctestmimic = MIMICtable.loc[in_test, 'bloc']
    ptid = MIMICtable.loc[in_train, 'icustay_id']
    ptidtestmimic = MIMICtable.loc[in_test, 'icustay_id']
    Y = actionbloc.loc[in_train, 'action_bloc']
    Ytest = actionbloc.loc[in_test, 'action_bloc']
    clf = LogisticRegression(random_state=0, max_iter=100000).fit(X, Y)
    acc_train = clf.score(X, Y)
    acc_test = clf.score(Xtestmimic, Ytest)
    fitted_models.append({'model': model, 'regressor': clf, 'acc_train': acc_train, 'acc_test': acc_test})

# Summary table: repetition number, fitted estimator, train and test accuracy.
modelsDf = pd.DataFrame(fitted_models, columns=['model', 'regressor', 'acc_train', 'acc_test'])
logging.info('Model fitting done!')

100%|██████████| 50/50 [03:24<00:00,  4.09s/it]


In [13]:
modelsDf

Unnamed: 0,acc_test,acc_train,model,regressor
0,0.618385,0.652975,0.0,"LogisticRegression(C=1.0, class_weight=None, d..."
1,0.634802,0.64906,1.0,"LogisticRegression(C=1.0, class_weight=None, d..."
2,0.638714,0.649951,2.0,"LogisticRegression(C=1.0, class_weight=None, d..."
3,0.622422,0.652396,3.0,"LogisticRegression(C=1.0, class_weight=None, d..."
4,0.613359,0.654401,4.0,"LogisticRegression(C=1.0, class_weight=None, d..."
5,0.626842,0.65442,5.0,"LogisticRegression(C=1.0, class_weight=None, d..."
6,0.600126,0.658765,6.0,"LogisticRegression(C=1.0, class_weight=None, d..."
7,0.634677,0.650533,7.0,"LogisticRegression(C=1.0, class_weight=None, d..."
8,0.642241,0.649431,8.0,"LogisticRegression(C=1.0, class_weight=None, d..."
9,0.639192,0.648877,9.0,"LogisticRegression(C=1.0, class_weight=None, d..."


In [35]:
# Predicted class probabilities for the test row at position 19, one entry per class
# in clf.classes_; `clf` is whichever classifier was fitted last.
clf.predict_proba(Xtestmimic)[19]

array([9.36346801e-05, 4.65469534e-01, 9.17067293e-03, 1.33221260e-01,
       1.65813031e-01, 2.22008615e-01, 4.42408568e-05, 1.48056069e-04,
       2.53082406e-04, 4.45431535e-04, 4.59199156e-05, 2.96626929e-04,
       1.19000148e-03, 9.18589947e-05, 7.64069487e-04, 3.10141550e-04,
       1.37078966e-04, 8.63979701e-06, 6.82950131e-05, 1.91582252e-06,
       3.56490461e-06, 1.38325531e-05, 2.51828419e-04, 3.65083935e-05,
       1.12159486e-04])

In [37]:
import pickle

# Persist the most recently fitted classifier.
# `clf` is the estimator; `model` is only the integer counter of the fitting loop,
# so pickling `model` would store a bare number instead of the regression.
filename = '/data/localhost/taufiq/export-dir/logistic-reg-fitted'
with open(filename, 'wb') as model_file:  # the with-block closes the file even on error
    pickle.dump(clf, model_file)

In [45]:
# Column-wise concatenation aligned on the row index: the MIMIC table followed by
# the 'action_bloc' column.
actions = pd.concat([MIMICtable, actionbloc],axis=1)

In [63]:
# Keep only the rows with bloc == 1 in the features, the raw table and the actions.
is_bloc_one = MIMICtable['bloc'] == 1
MIMICraw_t0 = MIMICraw[is_bloc_one]
MIMICt0 = MIMICtable[is_bloc_one]
actionbloc_t0 = actionbloc[is_bloc_one]

In [81]:
# For every test stay: the probability that `clf` assigns to the action actually
# recorded on the stay's bloc == 1 row.

# predict_proba columns follow clf.classes_, which holds only the labels seen in
# training; look the column up instead of assuming label k sits at position k-1.
class_column = {label: position for position, label in enumerate(clf.classes_)}

prob_records = []
for idx in testidx:
    stay_rows = MIMICt0['icustay_id'] == idx
    x_t0 = MIMICraw_t0.loc[stay_rows]
    a = actionbloc_t0.loc[stay_rows, 'action_bloc'].values[0]
    column = class_column.get(a)
    # An action that never occurred in the training fold has no column: probability 0.
    prob = clf.predict_proba(x_t0)[0][column] if column is not None else 0.0
    prob_records.append({'icustay_id': idx, 'action': a, 'prob': prob})

# Built in one go; DataFrame.append inside the loop is quadratic and was removed in pandas 2.0.
probs = pd.DataFrame(prob_records, columns=['icustay_id', 'action', 'prob'])

In [84]:
# Export the per-stay probabilities; since `index` is not disabled, the DataFrame
# index is written as the first (unnamed) CSV column.
probs.to_csv("/data/localhost/taufiq/export-dir/probability-data.csv")

In [58]:
# Number of rows in the pre-processed feature matrix.
len(MIMICraw)

15316