In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from scipy.stats import chi2_contingency

##  Define useful functions (Copied from association.ipynb)

In [2]:
def run_chi2(index, columns):
    """Display results of a chi-square test of independence.

    Cross-tabulates the two inputs with ``pd.crosstab``, displays the
    resulting contingency table, and prints the test statistic, the degrees
    of freedom and the p-value returned by ``scipy.stats.chi2_contingency``.
    Nothing is returned.

    @param index: Values to group by in the rows
    @param columns: Values to group by in the columns
    """
    # Single-argument print(...) produces the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Chi-square test of independence with Yates' continuity correction")
    c_table = pd.crosstab(index, columns)
    display(c_table)
    # correction=True is scipy's default; per the scipy documentation the
    # Yates adjustment is only applied when the table has 1 degree of freedom.
    chi2, p, dof, _expected = chi2_contingency(c_table, correction=True)
    print("X-squared = %s" % chi2)
    print("dof       = %s" % dof)
    print("p-value   = %s" % p)

def run_glm(df):
    """Display results (odds ratios) of a binomial GLM regression.

    Fits the last column of ``df`` on all preceding columns plus an
    intercept added by ``sm.add_constant``, prints the statsmodels summary,
    and displays the exponentiated coefficients together with the
    exponentiated bounds from ``result.conf_int()``. Nothing is returned.

    @param df: DataFrame with columns [exog1, exog2, ..., endog]
    """
    endog = df[df.columns[-1]]
    exog = sm.add_constant(df[df.columns[:-1]])
    glm = sm.GLM(endog, exog, family=sm.families.Binomial())  # == sm.Logit(endog, exog)
    result = glm.fit()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(result.summary())

    odds = pd.concat([result.params, result.conf_int()], axis=1)
    odds.columns = ['Odds Ratio', '2.5%', '97.5%']
    # Remove the intercept by its label rather than by position: when the
    # predictors already contain a constant column, add_constant leaves the
    # frame unchanged and slicing off the first row would discard a real
    # predictor instead.
    odds = odds.drop('const', errors='ignore')
    display(np.exp(odds))

In [3]:
def _load_labelled_index(csv_path, label):
    """Read one prediction-group CSV and reduce it to its index plus a label.

    The file is indexed by (subject_id, hadm_id, icustay_id) with duplicate
    keys rejected, its describe() table is displayed, and then every data
    column is discarded so that only the index and a constant
    'predicted_label' column remain.

    @param csv_path: Path of the CSV file to read
    @param label: Value written to the 'predicted_label' column of every row
    """
    features = pd.read_csv(csv_path).set_index(
        ['subject_id', 'hadm_id', 'icustay_id'], verify_integrity=True)
    display(features.describe())
    labelled = features.drop(columns=features.columns)
    labelled['predicted_label'] = label
    return labelled


true_negatives = _load_labelled_index('true_negatives.csv', 0)
false_positives = _load_labelled_index('false_positives.csv', 1)

## v Copied from association.ipynb v ##

# ICU mortality: keep stays with both timestamps present and derive the
# length of stay in whole days.
df = (
    pd.read_csv('../../data_collection/icu_mort.csv')
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
    .filter(['intime', 'outtime', 'in_icu_mort'])
    .query('intime.notnull() & outtime.notnull()')
    .assign(LOS=lambda d: (pd.to_datetime(d['outtime']) - pd.to_datetime(d['intime'])).dt.days)
)
df_mort = df.filter(['LOS', 'in_icu_mort'])  # LOS is still needed for the MODS flag below

# MODS on day 7: od_sum counts the SOFA subscores above 1 on the day-7 row;
# mods is 1 when more than one subscore exceeds 1, or when the stay is
# shorter than 7 days and ended in in-ICU death.
sofa_subscores = ['cardiovascular', 'cns', 'coagulation', 'liver', 'renal', 'respiration']
df = (
    pd.read_csv('../../data_collection/sofa_pan.csv')
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
    .filter(['day'] + sofa_subscores)
    .query('day == 7')
    .assign(od_sum=lambda d: sum((d[name] > 1).astype(int) for name in sofa_subscores))
    .filter(['od_sum'])
    .join(df_mort, how='inner')
    .assign(mods=lambda d: ((d['od_sum'] > 1)
                            | ((d['LOS'] < 7) & (d['in_icu_mort'] == 1))).astype(int))
)
df_mods = df.filter(['mods'])

df_mort = df_mort.filter(['in_icu_mort'])  # LOS has served its purpose; keep only the outcome

# New AKI: turn 'dif' into a 0/1 flag.
# NOTE(review): what the unary minus does depends on the dtype of 'dif' — confirm.
df = (
    pd.read_csv('../../data_collection/new_aki.csv', index_col=0)
    .set_index(['icustay_id'], verify_integrity=True)
    .filter(['dif'])
    .assign(new_aki=lambda d: (-d['dif']).astype(bool).astype(int))
)
df_aki = df.filter(['new_aki'])

Unnamed: 0,day_1_chl,age,gender,chloride_input_meq,fluid_net_input_ml,EPAP,IPAP,LPM,MeanAirwayPressure,heartrate_max,...,paralysis,other_neurological,chronic_pulmonary,diabetes_complicated,renal_failure,solid_tumor,obesity,fluid_electrolyte,drug_abuse,depression
count,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,...,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0,15170.0
mean,101.661767,62.772373,0.393144,161.510062,1153.785993,0.021358,0.017271,0.742123,0.282334,84.683784,...,0.00646,0.041595,0.101055,0.055438,0.094001,0.009097,0.0265,0.130784,0.019842,0.051945
std,4.127728,17.306544,0.488465,171.805348,2856.617263,0.144579,0.130283,0.437481,0.45015,11.30237,...,0.080117,0.199669,0.301411,0.228841,0.29184,0.094946,0.160621,0.337176,0.139461,0.221923
min,67.0,18.008219,0.0,0.0,-25400.0,0.0,0.0,0.0,0.0,41.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,100.0,51.396576,0.0,27.208685,-440.508332,0.0,0.0,0.0,0.0,81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,102.0,63.805479,0.0,119.602238,685.000013,0.0,0.0,1.0,0.0,81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,105.0,76.536301,1.0,243.574441,2229.874996,0.0,0.0,1.0,1.0,81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,109.0,91.4,1.0,1805.671099,29647.746105,1.0,1.0,1.0,1.0,175.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,day_1_chl,age,gender,chloride_input_meq,fluid_net_input_ml,EPAP,IPAP,LPM,MeanAirwayPressure,heartrate_max,...,paralysis,other_neurological,chronic_pulmonary,diabetes_complicated,renal_failure,solid_tumor,obesity,fluid_electrolyte,drug_abuse,depression
count,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,...,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0,6798.0
mean,107.369815,66.050485,0.494557,310.68741,2571.24906,0.005149,0.003383,0.562224,0.534128,85.314357,...,0.008238,0.027361,0.058841,0.011474,0.026478,0.010591,0.012798,0.0812,0.009415,0.024419
std,1.465823,16.748693,0.500007,271.026305,3649.160309,0.071574,0.058072,0.49615,0.498871,12.440171,...,0.090394,0.163145,0.235344,0.106508,0.160565,0.102375,0.11241,0.273163,0.096578,0.154357
min,99.0,18.145205,0.0,0.0,-26887.633361,0.0,0.0,0.0,0.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,106.0,55.241781,0.0,130.8,201.9576,0.0,0.0,0.0,0.0,81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,108.0,68.152054,0.0,257.722285,1788.652699,0.0,0.0,1.0,1.0,81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,109.0,79.178082,1.0,410.844656,4147.981501,0.0,0.0,1.0,1.0,81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,109.0,91.4,1.0,2509.5,28450.143305,1.0,1.0,1.0,1.0,216.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Predicted Label and ICU Mortality

In [4]:
# Inner joins keep only the labelled stays that also have a mortality record.
df_1 = false_positives.join(df_mort, how='inner')
df_2 = true_negatives.join(df_mort, how='inner')

# pd.concat stacks the rows exactly as DataFrame.append did; append was
# removed in pandas 2.0, while concat works on old and new versions alike.
df_combined = pd.concat([df_1, df_2])

In [5]:
# Contingency table rows are predicted_label, columns are in_icu_mort.
run_chi2(df_combined['predicted_label'], df_combined['in_icu_mort'])

Chi-square test of independence with Yates' continuity correction


in_icu_mort,0,1
predicted_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,14244,926
1,6311,487


X-squared = 8.584295472559893
dof       = 1
p-value   = 0.0033907455357023854


In [6]:
# run_glm models the last column of df_combined (in_icu_mort) on the
# remaining column (predicted_label).
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:            in_icu_mort   No. Observations:                21968
Model:                            GLM   Df Residuals:                    21966
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -5239.3
Date:                Tue, 03 Sep 2019   Deviance:                       10479.
Time:                        08:36:09   Pearson chi2:                 2.20e+04
No. Iterations:                     6   Covariance Type:             nonrobust
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -2.7332      0.034    -80.594      0.000      -2.800      -2.667
predicted_label     0.1714      0.058

  return ptp(axis=axis, out=out, **kwargs)


Unnamed: 0,Odds Ratio,2.5%,97.5%
predicted_label,1.187003,1.05949,1.329862


# Predicted Label and MODS on Day 7

In [7]:
# Left joins keep every labelled stay; stays missing from df_mods come back
# with NaN and are counted as mods = 0 by the fillna below.
# NOTE(review): if sofa_pan.csv has no day-7 row for stays that ended before
# day 7, in-ICU deaths with LOS < 7 also end up as mods = 0 here, even though
# the mods definition is meant to flag them — confirm against the data.
df_1 = false_positives.join(df_mods, how='left')
df_2 = true_negatives.join(df_mods, how='left')

# pd.concat stacks the rows exactly as DataFrame.append did; append was
# removed in pandas 2.0, while concat works on old and new versions alike.
df_combined = pd.concat([df_1, df_2])
df_combined = df_combined.fillna({'mods': 0})

In [8]:
# Contingency table rows are predicted_label, columns are mods.
run_chi2(df_combined['predicted_label'], df_combined['mods'])

Chi-square test of independence with Yates' continuity correction


mods,0.0,1.0
predicted_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,14654,516
1,6560,238


X-squared = 0.11200678022690563
dof       = 1
p-value   = 0.7378709106133279


In [9]:
# run_glm models the last column of df_combined (mods) on the remaining
# column (predicted_label).
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                   mods   No. Observations:                21968
Model:                            GLM   Df Residuals:                    21966
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -3283.3
Date:                Tue, 03 Sep 2019   Deviance:                       6566.6
Time:                        08:36:09   Pearson chi2:                 2.20e+04
No. Iterations:                     6   Covariance Type:             nonrobust
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -3.3464      0.045    -74.711      0.000      -3.434      -3.259
predicted_label     0.0299      0.080

Unnamed: 0,Odds Ratio,2.5%,97.5%
predicted_label,1.030338,0.881241,1.20466


# Predicted Label and New AKI on Day 7

In [10]:
# Inner joins keep only the labelled stays that also appear in df_aki.
df_1 = false_positives.join(df_aki, how='inner')
df_2 = true_negatives.join(df_aki, how='inner')

# pd.concat stacks the rows exactly as DataFrame.append did; append was
# removed in pandas 2.0, while concat works on old and new versions alike.
df_combined = pd.concat([df_1, df_2])

In [11]:
# Contingency table rows are predicted_label, columns are new_aki.
run_chi2(df_combined['predicted_label'], df_combined['new_aki'])

Chi-square test of independence with Yates' continuity correction


new_aki,0,1
predicted_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10290,4586
1,4198,2498


X-squared = 87.55684876072894
dof       = 1
p-value   = 8.189190236249293e-21


In [12]:
# run_glm models the last column of df_combined (new_aki) on the remaining
# column (predicted_label).
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                new_aki   No. Observations:                21572
Model:                            GLM   Df Residuals:                    21570
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13612.
Date:                Tue, 03 Sep 2019   Deviance:                       27225.
Time:                        08:36:09   Pearson chi2:                 2.16e+04
No. Iterations:                     5   Covariance Type:             nonrobust
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -0.8082      0.018    -45.518      0.000      -0.843      -0.773
predicted_label     0.2890      0.031

Unnamed: 0,Odds Ratio,2.5%,97.5%
predicted_label,1.335154,1.256735,1.418466
