# Supervised Learning of Actions - Logistic Regression
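This notebook fits a logistic regression model to the MIMIC sepsis data: given the normalised patient features at a time step, the model predicts which discretised treatment action (IV-fluid / vasopressor dose bin) was taken.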

In [1]:
import pandas as pd
import numpy as np
# Directory that holds the CSV files this notebook reads (MIMICtable.csv)
# and writes (MIMICraw-logistic_reg.csv).
exportdir = '/Users/faaiz/exportdir'

In [2]:
import logging
# Route log records of level INFO and above to a log file.
logging.basicConfig(
    level=logging.INFO,
    filename='logistic-regression.log',
)

In [3]:
# nra     -- squared to obtain the action count (nact) when the action bins are built.
# nr_reps -- number of random train/test splits, i.e. models fitted, in the fitting loop.
nra, nr_reps = 5, 50

In [4]:
# Load the exported MIMIC table from the export directory and preview its first rows.
mimic_table_csv = exportdir + '/MIMICtable.csv'
MIMICtable = pd.read_csv(mimic_table_csv)
MIMICtable.head()

Unnamed: 0,bloc,icustay_id,charttime,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,mortality_90d,...,mechvent,Shock_Index,PaO2_FiO2,median_dose_vaso,max_dose_vaso,input_total,input_4hourly,output_total,output_4hourly,cumulated_balance
0,1.0,1006.0,4330306000.0,0.0,25154.170995,3.0,0.0,1.0,1.0,1.0,...,0.0,0.908732,257.5,0.0,0.0,0.0,0.0,8166.0,4083.0,-8166.0
1,3.0,1006.0,4330335000.0,0.0,25154.170995,3.0,0.0,1.0,1.0,1.0,...,0.0,0.728457,294.285714,0.0,0.0,0.0,0.0,12249.0,4083.0,-12249.0
2,6.0,1006.0,4330378000.0,0.0,25154.170995,3.0,0.0,1.0,1.0,1.0,...,0.0,0.912791,104.727273,0.0,0.0,275.0,275.0,15932.0,3683.0,-15657.0
3,1.0,1204.0,4794583000.0,1.0,29424.868472,5.0,0.0,0.0,0.0,1.0,...,0.0,0.504678,116.666667,0.0,0.0,5036.666667,36.666667,5025.0,2445.0,11.666667
4,2.0,1204.0,4794597000.0,1.0,29424.868472,5.0,0.0,0.0,0.0,1.0,...,0.0,0.528184,116.666667,0.0,0.0,5708.720883,672.054217,7140.0,2115.0,-1431.279117


In [5]:
# ----------------------------------------------------------------------------
# Build the feature table MIMICraw from MIMICtable and normalise every column.
# ----------------------------------------------------------------------------
logging.info('Processing raw data')

# Feature groups, each normalised differently below
# (3 shifted + 32 z-scored + 9 log-z-scored = 44 columns in total).
colbin = ['gender', 'mechvent', 're_admission']
colnorm = [
    'age', 'Weight_kg', 'GCS', 'HR', 'SysBP', 'MeanBP', 'DiaBP', 'RR', 'Temp_C', 'FiO2_1',
    'Potassium', 'Sodium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium',
    'Hb', 'WBC_count', 'Platelets_count', 'PTT', 'PT', 'Arterial_pH', 'paO2', 'paCO2',
    'Arterial_BE', 'HCO3', 'Arterial_lactate', 'SOFA', 'SIRS', 'Shock_Index', 'PaO2_FiO2',
    'cumulated_balance',
]
collog = [
    'SpO2', 'BUN', 'Creatinine', 'SGOT', 'SGPT', 'Total_bili', 'INR',
    'output_total', 'output_4hourly',
]

# Work on a copy so MIMICtable keeps the untransformed values.
MIMICraw = MIMICtable[colbin + colnorm + collog].copy()

# Group 1: shift by 0.5 (values 0/1 become -0.5/+0.5).
for col in colbin:
    MIMICraw[col] = MIMICraw[col] - 0.5

# Group 2: z-score, using the mean and standard deviation of the whole column.
for col in colnorm:
    cmu, csigma = MIMICraw[col].mean(), MIMICraw[col].std()
    MIMICraw[col] = (MIMICraw[col] - cmu)/csigma

# Group 3: z-score of log(0.1 + x); the 0.1 offset keeps log() finite at x == 0.
for col in collog:
    log_values = np.log(0.1 + MIMICraw[col])
    dmu, dsigma = log_values.mean(), log_values.std()
    MIMICraw[col] = (log_values - dmu)/dsigma

logging.info('Raw data processed')

## Include information about the doses given before time $t$

In [6]:
logging.info('Adding input values to X')

# For every row, carry over three values from the row directly above it,
# but only when that row belongs to the same ICU stay (same icustay_id).
# Rows that start a new stay (including the very first row) get 0.0.
#
# This is done positionally with shift(1) rather than with a Python-level
# iterrows() loop and `index + 1` label arithmetic, so it is vectorised and
# does not rely on MIMICtable having a 0..N-1 integer index.
same_stay_as_prev = MIMICtable['icustay_id'].eq(MIMICtable['icustay_id'].shift(1))

# Fluid given in the previous row's 4-hour window.
MIMICraw['last_input'] = MIMICtable['input_4hourly'].shift(1).where(same_stay_as_prev, 0.0)
# Cumulative fluid input recorded on the previous row.
MIMICraw['total_input_before'] = MIMICtable['input_total'].shift(1).where(same_stay_as_prev, 0.0)
# Maximum vasopressor dose recorded on the previous row.
MIMICraw['last_vaso_dose'] = MIMICtable['max_dose_vaso'].shift(1).where(same_stay_as_prev, 0.0)

def log_normalise(colname, frame=None, offset=0.1):
    """Replace a column, in place, by the z-score of its shifted logarithm.

    The column becomes ``(log(offset + x) - mean) / std`` where mean and std
    are taken over ``log(offset + x)`` for the whole column.

    Parameters
    ----------
    colname : column label to transform.
    frame : DataFrame to modify. When omitted, the notebook-level ``MIMICraw``
        table is used, which is what callers passing only ``colname`` get.
    offset : constant added before taking the logarithm so that zero values
        stay finite (default 0.1).

    Returns
    -------
    None. ``frame[colname]`` is overwritten.
    """
    if frame is None:
        # Item assignment below mutates the table without rebinding the name,
        # so no ``global`` declaration is needed.
        frame = MIMICraw
    log_values = np.log(offset + frame[colname])
    dmu = log_values.mean()
    dsigma = log_values.std()
    frame[colname] = (log_values - dmu)/dsigma

# The two fluid-input columns get the log z-score treatment.
for input_col in ('last_input', 'total_input_before'):
    log_normalise(input_col)

# NOTE(review): last_vaso_dose is only shifted by 0.5, the same treatment the
# colbin columns receive, although it is filled from max_dose_vaso -- confirm
# that a plain shift is intended for a dose value.
MIMICraw['last_vaso_dose'] = MIMICraw['last_vaso_dose'] - 0.5

logging.info('Input values added to X')

In [11]:
# Write the feature table to the export directory (index column omitted),
# then preview its first rows.
features_csv = exportdir + '/MIMICraw-logistic_reg.csv'
MIMICraw.to_csv(features_csv, index=False)
MIMICraw.head()

Unnamed: 0,gender,mechvent,re_admission,age,Weight_kg,GCS,HR,SysBP,MeanBP,DiaBP,...,Creatinine,SGOT,SGPT,Total_bili,INR,output_total,output_4hourly,last_input,total_input_before,last_vaso_dose
0,-0.5,-0.5,-0.5,0.12386,-0.870259,0.477695,0.57958,-0.387404,-0.589746,-0.423179,...,-0.502852,0.013289,-0.722894,-1.088681,1.01587,-0.127732,0.564887,-1.281663,-1.798569,-0.5
1,-0.5,-0.5,-0.5,0.12386,-0.108596,0.477695,0.745287,1.203395,1.488158,1.434798,...,-0.502852,0.276745,0.396998,0.067177,1.01587,0.046116,0.564887,-1.281663,-1.798569,-0.5
2,-0.5,-0.5,-0.5,0.12386,4.297972,0.477695,0.789346,-0.199873,0.280315,0.743097,...,-0.502852,-0.230115,-0.442009,-0.147049,-0.741218,0.158831,0.530796,-1.281663,-1.798569,-0.5
3,0.5,-0.5,-0.5,1.041991,-1.050288,0.477695,-0.842034,1.476385,-0.467412,-1.759465,...,1.651291,0.013289,-0.442009,-0.423234,0.08984,-0.335917,0.395338,-1.281663,-1.798569,-0.5
4,0.5,-0.5,-0.5,1.041991,-1.050288,0.477695,-1.464791,0.055757,-1.281419,-1.83087,...,1.651291,0.013289,-0.442009,-0.423234,0.08984,-0.1853,0.347398,0.247181,0.58598,-0.5


## Create Actions


In [8]:
from sklearn.cluster import KMeans
from scipy.stats import rankdata

logging.info('Creating action bins')
nact = nra**2

# --- IV fluid level per row (1..5) -------------------------------------------
# Level 1 means no fluid. Rows with fluid are ranked among themselves; the
# rank fraction in (0, 1] is mapped by floor((f + 0.25) * 4) to 1..4 (quartiles)
# and then shifted to levels 2..5.
input_4hourly_nonzero = MIMICtable.loc[MIMICtable['input_4hourly']>0, 'input_4hourly']
iol_ranked = rankdata(input_4hourly_nonzero)/len(input_4hourly_nonzero) # excludes zero fluid (will be action 1)
iof = np.floor((iol_ranked + 0.2499999999)*4) # converts iv volume in 4 actions
io = np.ones(len(MIMICtable)) # array of ones, by default
io[(MIMICtable['input_4hourly']>0).values] = iof + 1 # where more than zero fluid given: save actual action

# --- Vasopressor level per row (1..5), same scheme on max_dose_vaso ----------
vc = MIMICtable['max_dose_vaso'].copy()
vc_nonzero = MIMICtable.loc[MIMICtable['max_dose_vaso']!=0, 'max_dose_vaso']
vc_ranked = rankdata(vc_nonzero)/len(vc_nonzero)
vcf = np.floor((vc_ranked + 0.2499999999)*4) # converts to 4 bins
# The expression above is already >= 1 for rank fractions in (0, 1], so this
# looks like a no-op safeguard; kept unchanged.
vcf[vcf==0] = 1
vc[vc!=0] = vcf + 1
vc[vc==0] = 1

# median dose of drug in all bins (list position k-1 holds level k)
ma1 = [MIMICtable.loc[io==level, 'input_4hourly'].median() for level in range(1, 6)]
ma2 = [MIMICtable.loc[vc==level, 'max_dose_vaso'].median() for level in range(1, 6)]

# One (IV level, VC level) pair per row, and the distinct pairs in order of
# first appearance.
med = pd.DataFrame(data={'IV':io, 'VC': vc})
med = med.astype({'IV': 'int32', 'VC': 'int32'})
uniqueValues = med.drop_duplicates().reset_index(drop=True)

# Median doses that correspond to each distinct pair. Built in one go instead
# of enlarging an empty frame cell by cell.
uniqueValueDoses = pd.DataFrame({
    'IV': [ma1[iv_level - 1] for iv_level in uniqueValues['IV']],
    'VC': [ma2[vc_level - 1] for vc_level in uniqueValues['VC']],
})

# Action id of a pair = its position in uniqueValues + 1. A dict lookup
# replaces a per-row boolean scan of uniqueValues and per-row frame enlargement;
# the resulting ids are the same.
action_of_pair = {
    (iv_level, vc_level): position + 1
    for position, iv_level, vc_level in zip(uniqueValues.index, uniqueValues['IV'], uniqueValues['VC'])
}
actionbloc = pd.DataFrame(
    {'action_bloc': [action_of_pair[pair] for pair in zip(med['IV'], med['VC'])]},
    index=med.index,
)
actionbloc = actionbloc.astype({'action_bloc':'int32'})

logging.info('Action bins created')

## Fitting models

In [9]:
from sklearn.linear_model import LogisticRegression
icuuniqueids = MIMICtable['icustay_id'].unique()

logging.info('Fitting models')

# One dict per fitted model. The frame is built once after the loop because
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call before that).
model_rows = []
# Group label that is held out as the test set in every repetition.
crossval = 1

for model in range(nr_reps):
    logging.info('Model: ' + str(model))
    # Assign every ICU stay to one of 5 groups uniformly at random, so all
    # rows of a stay land on the same side of the split; group `crossval` is
    # the test set (about one fifth of the stays).
    # NOTE(review): this uses the global NumPy RNG, which is not seeded in this
    # cell, so the splits differ between runs unless it is seeded elsewhere.
    grp = np.floor(5*np.random.rand(len(icuuniqueids))+1)
    trainidx = icuuniqueids[grp != crossval]
    testidx = icuuniqueids[grp == crossval]

    # Row masks over MIMICtable, computed once and reused for every selection.
    train_rows = MIMICtable['icustay_id'].isin(trainidx)
    test_rows = MIMICtable['icustay_id'].isin(testidx)

    X = MIMICraw.loc[train_rows]
    Xtestmimic = MIMICraw[test_rows]
    blocs = MIMICtable.loc[train_rows, 'bloc']
    bloctestmimic = MIMICtable.loc[test_rows, 'bloc']
    ptid = MIMICtable.loc[train_rows, 'icustay_id']
    ptidtestmimic = MIMICtable.loc[test_rows, 'icustay_id']
    Y = actionbloc.loc[train_rows, 'action_bloc']
    Ytest = actionbloc.loc[test_rows, 'action_bloc']

    # Fit the classifier on the training stays and record accuracy on both sides.
    clf = LogisticRegression(random_state=0, max_iter=100000).fit(X, Y)
    acc_train = clf.score(X, Y)
    acc_test = clf.score(Xtestmimic, Ytest)
    model_rows.append({'model': model, 'regressor': clf, 'acc_train': acc_train, 'acc_test': acc_test})

# Columns: model, regressor, acc_train, acc_test (one row per repetition).
modelsDf = pd.DataFrame(model_rows)

logging.info('Model fitting done!')

In [10]:
# Preview the first five fitted models and their train/test accuracies.
modelsDf.head(n=5)

Unnamed: 0,acc_test,acc_train,model,regressor
0,0.242857,0.861963,0.0,"LogisticRegression(max_iter=100000, random_sta..."
1,0.337349,0.744125,1.0,"LogisticRegression(max_iter=100000, random_sta..."
2,0.190789,0.834395,2.0,"LogisticRegression(max_iter=100000, random_sta..."
3,0.360656,0.80814,3.0,"LogisticRegression(max_iter=100000, random_sta..."
4,0.306452,0.839181,4.0,"LogisticRegression(max_iter=100000, random_sta..."
