This notebook generates the final ROC (discrimination) and calibration plots for the IMC models.

In [None]:
%matplotlib inline
%load_ext rpy2.ipython

import os
import glob
from os import listdir, makedirs
from os.path import isfile, join, exists

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from scipy import interp
from scipy.interpolate import interp1d

In [None]:
class LogRegModel(object):
    """Logistic-regression model whose terms are read from a text file.

    The model is an intercept plus an ordered list of term objects
    (``LinearTerm`` / ``RCSTerm``).  Column ``j`` of any data matrix passed
    to the methods below is fed to ``self.terms[j]``, so the columns must be
    in the same order as ``self.names``.

    Parameters
    ----------
    fn : str
        Path of the coefficient file to parse.
    model_format : str
        ``'MICE'`` parses a table whose header row contains an ``est``
        column; ``'GLM'`` parses a table that starts at the ``(Intercept)``
        row.  Any other value leaves the model empty.
    """

    def __init__(self, fn, model_format='MICE'):
        self.intercept = 0
        self.names = []
        self.terms = []
        if model_format == 'MICE':
            self.loadTermsMICE(fn)
        elif model_format == 'GLM':
            self.loadTermsGLM(fn)

    def setIntercept(self, b0):
        """Set the intercept coefficient."""
        self.intercept = b0

    def addTerm(self, t):
        """Append term ``t`` and record its variable name."""
        self.terms.append(t)
        self.names.append(t.name)

    def linfeat(self, x):
        """Expand each row of ``x`` into the linear feature vector.

        Returns a list of rows; each row starts with 1.0 (intercept) followed
        by the features of every term evaluated on its own column.
        """
        zmat = []
        for i in range(0, len(x)):
            xrow = x[i]
            zrow = [1.0]
            for j in range(0, len(self.terms)):
                zrow += self.terms[j].linearFeatures(xrow[j])
            zmat.append(zrow)
        return zmat

    def lincoeff(self):
        """Return the flat coefficient list matching ``linfeat`` rows."""
        coeff = [self.intercept]
        for t in self.terms:
            coeff += t.coeffs
        return coeff

    def sigmoid(self, v):
        """Logistic function 1 / (1 + exp(-v))."""
        return 1.0 / (1.0 + np.exp(-v))

    def predict(self, x):
        """Return a numpy array with the predicted probability of each row."""
        z = self.linfeat(x)
        theta = self.lincoeff()
        return np.array([self.sigmoid(np.dot(zrow, theta)) for zrow in z])

    def loatVarTypes(self, data_fn, dict_fn):
        """Assign ``vtyp`` to every term from a data header and a dictionary.

        ``data_fn`` is a CSV whose first line lists the variable names;
        ``dict_fn`` is a CSV whose second field on each non-empty line is the
        type of the variable at the same position in that header.

        Raises ``ValueError`` if a term name is not in the data header.
        """
        with open(data_fn) as f:
            header = f.readline()
        # Strip each name: the last column otherwise keeps the trailing
        # newline and can never be matched against a term name.
        var = [name.strip() for name in header.split(',')]

        vtyp = []
        with open(dict_fn) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                _, t = line.split(',')[0:2]
                vtyp.append(t.strip())

        for t in self.terms:
            pos = var.index(t.name)
            t.vtyp = vtyp[pos]

    # Correctly spelled alias; the original (misspelled) name is kept so that
    # existing callers continue to work.
    loadVarTypes = loatVarTypes

    def saveOddRatios(self, x, fn):
        """Write one ``<name> <odds ratio>`` line per coefficient to ``fn``.

        Each coefficient is multiplied by the scale returned by the term's
        ``varRanges`` for its column of ``x`` (scales below 1 are inverted)
        before being exponentiated.  ``x`` must support ``x[:, t]``.
        """
        theta = np.array(self.lincoeff())
        scale = [1.0] * len(theta)

        ts = 1  # position 0 is the intercept, which is left unscaled
        for t, term in enumerate(self.terms):
            for r in term.varRanges(x[:, t]):
                scale[ts] = r
                if scale[ts] < 1:
                    scale[ts] = 1.0 / scale[ts]
                ts += 1

        odds = np.exp(theta * np.array(scale))
        ts = 1
        with open(fn, 'w') as f:
            for term in self.terms:
                for nam in term.varNames():
                    f.write(nam + ' ' + str(odds[ts]) + '\n')
                    ts += 1

    def getFormula(self, digits):
        """Return the linear predictor as text, rounded to ``digits``."""
        formula = str(round(self.intercept, digits))
        for term in self.terms:
            formula = formula + term.getFormula(digits)
        return formula

    def saveRanges(self, x, fn):
        """Write ``<name> <min> <max>`` of every term's contribution to ``fn``.

        The contribution is ``term.value`` evaluated on the term's column for
        every row of ``x``.
        """
        nrows = len(x)
        nvars = len(self.terms)
        values = np.zeros((nrows, nvars))
        for i in range(0, nrows):
            xrow = x[i]
            for t in range(0, nvars):
                values[i, t] = self.terms[t].value(xrow[t])

        with open(fn, 'w') as f:
            for t in range(0, nvars):
                mint = min(values[:, t])
                maxt = max(values[:, t])
                f.write(self.terms[t].name + ' ' + str(mint) + ' ' + str(maxt) + '\n')

    def saveRCSTerms(self, x, d):
        """Plot every RCS term over the observed range of its column.

        One ``rcs_<name>.pdf`` is saved in directory ``d`` per term whose
        ``isRCS`` flag is set; the curve uses 100 evenly spaced points between
        the column minimum and maximum.
        """
        for t in range(0, len(self.terms)):
            term = self.terms[t]
            if not term.isRCS:
                continue
            xmin = x[:, t].min()
            xmax = x[:, t].max()
            xvalues = np.linspace(xmin, xmax, 100)
            yvalues = [term.value(xt) for xt in xvalues]
            fig, ax = plt.subplots()
            plt.plot(xvalues, yvalues)
            plt.xlabel(term.name, labelpad=20)
            plt.title('RCS term for ' + term.name)
            fig.savefig(os.path.join(d, 'rcs_' + term.name + '.pdf'))

    def _parseCoefficient(self, s, rcsCoeffs):
        """Parse one ``<term name> <estimate>`` string and register the term.

        ``s`` is the stripped left part of a table row ending with the
        estimate.  ``rcsCoeffs`` is the coefficient list of the RCS term that
        is currently being assembled (or None); the possibly updated list is
        returned so the caller can pass it to the next row.
        """
        valueStr = s.split()[-1]
        value = float(valueStr)
        # The estimate is the last token of the stripped string, so the term
        # name is everything in front of it.  Searching for the first
        # occurrence of the digits instead would cut the name short whenever
        # the same digits also appear inside it (e.g. in the knot list).
        var = s[:len(s) - len(valueStr)].strip()

        if var.startswith('rcs'):
            # Expected shape: rcs(<name>, <order>, c(<k1>, <k2>, ...))<name>'...
            pos1 = var.rfind(')')
            rcsString = var[4:pos1]
            # Split at the knot vector "c(" rather than at any letter "c", so
            # that variable names containing a "c" are parsed correctly.
            pieces = rcsString.rsplit('c(', 1)
            part1 = pieces[0].split(',')
            varName = part1[0].strip()
            rcsOrder = int(part1[1].strip())
            knotStr = pieces[1].replace("(", "").replace(")", "").split(",")
            rcsKnots = [float(k) for k in knotStr]
            # The number of primes after the name selects the coefficient.
            coeffOrder = var.count("'")

            if coeffOrder == 0:
                rcsCoeffs = [0.0] * (rcsOrder - 1)
            if rcsCoeffs:
                rcsCoeffs[coeffOrder] = value

            # The term is complete once its last coefficient has been read.
            if coeffOrder == rcsOrder - 2:
                self.addTerm(RCSTerm(varName, rcsOrder, rcsCoeffs, rcsKnots))
        elif var == '(Intercept)':
            self.setIntercept(value)
        else:
            self.addTerm(LinearTerm(var, value))
        return rcsCoeffs

    def loadTermsMICE(self, fn):
        """Load intercept and terms from a table with an ``est`` header column.

        Reading stops at the first row that starts with a space or that does
        not contain both a name and an estimate (including blank lines).
        """
        with open(fn) as ifn:
            lines = ifn.readlines()

        # Rows are cut just before the column holding the final "t" of "est".
        # NOTE(review): if estimates are right-aligned under "est" this drops
        # their last digit ("+ 3" would keep it) -- confirm against a real file.
        pos = lines[0].index('est') + 2

        rcsCoeffs = None
        for line in lines[1:]:
            s = line[0:pos].strip()
            if line[0] == ' ' or len(s.split()) < 2:
                break
            rcsCoeffs = self._parseCoefficient(s, rcsCoeffs)

    def loadTermsGLM(self, fn):
        """Load intercept and terms from a table starting at ``(Intercept)``.

        The first line of the file is skipped, as are all lines before the
        ``(Intercept)`` row.  The end of that row's estimate fixes the column
        at which every following row is cut.  Reading stops at the first row
        that starts with a space, starts with ``---`` or lacks an estimate
        (including blank lines).
        """
        with open(fn) as ifn:
            lines = ifn.readlines()

        rcsCoeffs = None
        reading = False
        pos = 0
        for line in lines[1:]:
            if '(Intercept)' in line:
                reading = True
                # Locating the column through the header ('Estimate') breaks
                # easily when the file is not perfectly formatted, so the end
                # of the intercept's own estimate is used instead.
                val = line.split()[1]
                pos = line.index(val) + len(val)

            if not reading:
                continue

            s = line[0:pos].strip()
            v = s.split()
            if line[0] == ' ' or len(v) < 2 or v[0] == '---':
                break
            rcsCoeffs = self._parseCoefficient(s, rcsCoeffs)
class ModelTerm(object):
    """Base class for one predictor term of the regression model.

    Attributes:
        isRCS:  True only for spline terms (set by subclasses).
        name:   name of the variable the term is built on.
        vtyp:   variable type, one of 'float' (default), 'int' or 'category'.
        coeffs: list of coefficients, one per linear feature of the term.
    """

    def __init__(self, name):
        self.isRCS = False
        self.name = name
        self.vtyp = 'float'
        self.coeffs = []

    def linearFeatures(self, x):
        """Return the feature values of ``x``; the base term yields zeros."""
        return [0.0] * len(self.coeffs)

    def varRanges(self, x):
        """Return the scale used to express the coefficient as an odds ratio.

        The result is a one-element list: 1 for categorical variables, the
        largest power of ten not exceeding ``max(x)`` for integer variables
        and the inter-quartile range of ``x`` for floating-point variables.

        Raises ``ValueError`` for an unrecognised ``vtyp`` instead of
        silently returning None.
        """
        if self.vtyp == 'category':
            return [1]
        elif self.vtyp == 'int':
            largest = max(x)
            if largest <= 0:
                # log10 is undefined here and would yield a scale of 0 (or
                # NaN); fall back to a unit scale.
                return [1.0]
            n = np.floor(np.log10(largest))
            return [np.power(10, n)]
        elif self.vtyp == 'float':
            return [np.percentile(x, 75) - np.percentile(x, 25)]
        raise ValueError("Unknown variable type '%s' for term '%s'"
                         % (self.vtyp, self.name))

    def getFormula(self, digits):
        """Return the textual form of the term; empty for the base term."""
        return ''

    def varNames(self):
        """Return the display name of every coefficient of the term."""
        return [self.name]

    def value(self, x):
        """Return the term's contribution to the linear predictor at ``x``."""
        return np.dot(self.coeffs, self.linearFeatures(x))
    
class LinearTerm(ModelTerm):
    """Term that enters the linear predictor as ``coefficient * x``."""

    def __init__(self, name, c):
        super(LinearTerm, self).__init__(name)
        # A linear term carries exactly one coefficient.
        self.coeffs = [c]

    def linearFeatures(self, x):
        """The only feature of a linear term is the raw value itself."""
        return [x]

    def getFormula(self, digits):
        """Return ``' + <|c|> <name>'`` or ``' - <|c|> <name>'``."""
        coef = self.coeffs[0]
        if coef > 0:
            prefix = ' + '
        else:
            prefix = ' - '
        magnitude = str(round(abs(coef), digits))
        return prefix + magnitude + ' ' + self.name

    def __str__(self):
        header = "Linear term for " + self.name
        detail = "  Coefficient: " + str(self.coeffs[0])
        return "\n".join([header, detail])

class RCSTerm(ModelTerm):
    """Cubic-spline term (flagged ``isRCS``) defined by an order and knots.

    Parameters
    ----------
    name : str
        Variable name.
    k : int
        Order of the term; the term has ``k - 1`` coefficients.
    c : sequence of float
        Coefficients: ``c[0]`` multiplies the raw value, ``c[i]`` (i >= 1)
        multiplies the i-th spline basis function.
    kn : sequence of float
        Knot positions.
    """

    def __init__(self, name, k, c, kn):
        ModelTerm.__init__(self, name)
        self.isRCS = True
        self.order = k
        # Copy so later changes to the caller's lists do not affect the term.
        self.coeffs = list(c)
        self.knots = list(kn)

    def cubic(self, u):
        """Return ``max(u, 0) ** 3`` (works element-wise on arrays)."""
        t = np.maximum(0, u)
        return t * t * t

    def rcs(self, x, term):
        """Evaluate spline basis function number ``term`` (1-based) at ``x``.

        The basis combines truncated cubics at knot ``term - 1`` and at the
        last two knots, divided by the squared distance between the first and
        the last knot.
        """
        k = len(self.knots) - 1
        j = term - 1
        t = self.knots
        c = (t[k] - t[0]) * (t[k] - t[0])
        value = +self.cubic(x - t[j]) \
                -self.cubic(x - t[k - 1]) * (t[k] - t[j])/(t[k] - t[k-1]) \
                +self.cubic(x - t[k]) * (t[k - 1] - t[j])/(t[k] - t[k-1])
        return value / c

    def rcsform(self, term, digits):
        """Return basis function ``term`` times its coefficient as text.

        ``digits`` holds the rounding precision of the three cubic pieces.
        The result contains three ``%s`` placeholders that the caller fills
        with the variable name.
        """
        k = len(self.knots) - 1
        j = term - 1
        t = self.knots
        c = (t[k] - t[0]) * (t[k] - t[0])

        c0 = self.coeffs[term] / c
        sign0 = ' + ' if 0 < c0 else ' - '
        s = sign0 + str(round(abs(c0), digits[0])) + ' max(%s - ' + str(round(t[j], 3)) + ', 0)^3'

        # The middle piece is subtracted in rcs(), hence the inverted sign.
        c1 = self.coeffs[term] * (t[k] - t[j])/(c * (t[k] - t[k-1]))
        sign1 = ' - ' if 0 < c1 else ' + '
        s += sign1 + str(round(abs(c1), digits[1])) + ' max(%s - ' + str(round(t[k - 1], 3)) + ', 0)^3'

        c2 = self.coeffs[term] * (t[k - 1] - t[j])/(c * (t[k] - t[k-1]))
        sign2 = ' + ' if 0 < c2 else ' - '
        # Print the magnitude only: the sign is already carried by sign2, and
        # a negative value would otherwise be rendered as " - -0.12".
        s += sign2 + str(round(abs(c2), digits[2])) + ' max(%s - ' + str(round(t[k], 3)) + ', 0)^3'

        return s

    def linearFeatures(self, x):
        """Return ``[x, rcs(x, 1), ..., rcs(x, order - 2)]``."""
        feat = [0.0] * (self.order - 1)
        feat[0] = x
        for t in range(1, self.order - 1):
            feat[t] = self.rcs(x, t)
        return feat

    def varRanges(self, x):
        """Return the inter-quartile range of every feature over ``x``."""
        rang = [0.0] * (self.order - 1)
        rang[0] = np.percentile(x, 75) - np.percentile(x, 25)
        for i in range(1, self.order - 1):
            y = self.rcs(x, i)
            rang[i] = np.percentile(y, 75) - np.percentile(y, 25)
        return rang

    def varNames(self):
        """Return the name followed by 0, 1, ... primes, one per coefficient."""
        nam = [''] * (self.order - 1)
        nam[0] = self.name
        for i in range(1, self.order - 1):
            nam[i] = self.name + ("'" * i)
        return nam

    def getFormula(self, digits):
        """Return the linear part plus every spline piece as text."""
        c = self.coeffs[0]
        sign = ' + ' if 0 < c else ' - '
        s = sign + str(round(abs(c), digits)) + ' ' + self.name
        for i in range(1, self.order - 1):
            s = s + self.rcsform(i, [digits] * 3) % (self.name, self.name, self.name)
        return s

    def __str__(self):
        res = "RCS term of order " + str(self.order) + " for " + self.name + "\n"
        res += "  Coefficients:"
        for i in range(0, len(self.coeffs)):
            res += " " + str(self.coeffs[i])
        res += "\n"
        res += "  Knots:"
        for i in range(0, len(self.knots)):
            res += " " + str(self.knots[i])
        return res

In [None]:
def calibration_table(outcome, prob, n_bins=10):
    """Build a calibration table for a set of predictions.

    Predictions are grouped into ``n_bins`` equal-width probability bins
    covering [0, 1]; for every non-empty bin the table reports the mean
    predicted probability, the number of predictions and the observed
    frequency of positive outcomes.

    outcome: array_like, bool
        Whether the predicted event occurred.
    prob: array_like, float
        Probability estimates for the events.
    n_bins: int
        Number of judgement categories to perform the calculation over.
        Predictions are binned based on probability, since "discrete"
        probabilities aren't required.

    Returns a pandas DataFrame with columns ``pred_prob``, ``count`` and
    ``true_prob``, indexed by bin number, with one row per non-empty bin.
    """
    prob = np.asarray(prob)
    outcome = np.asarray(outcome)

    # Bin edges 0, 1/n, ..., 1.
    judgement_bins = np.arange(n_bins + 1.0) / n_bins
    # Which bin is each prediction in?
    bin_num = np.digitize(prob, judgement_bins)
    # digitize uses half-open intervals [a, b), which would put a prediction
    # of exactly 1.0 into an overflow bin that is never reported; count it in
    # the last regular bin instead.
    bin_num = np.where(prob == 1.0, n_bins, bin_num)

    labels = []
    counts = []
    true_prob = []
    pred_prob = []
    for j_bin in range(n_bins + 1):
        in_bin = bin_num == j_bin
        n_in_bin = int(np.count_nonzero(in_bin))
        if n_in_bin == 0:
            # Skip empty bins rather than averaging an empty slice.
            continue
        labels.append(j_bin)
        counts.append(n_in_bin)
        # Predicted probability taken as the average of predictions in bin.
        pred_prob.append(np.mean(prob[in_bin]))
        # How often did events in this bin actually happen?
        true_prob.append(np.mean(outcome[in_bin]))

    cal_table = pd.DataFrame({'pred_prob': pred_prob,
                              'count': counts,
                              'true_prob': true_prob},
                             columns=['pred_prob', 'count', 'true_prob'],
                             index=labels)
    # Remove rows whose means are undefined (e.g. missing values in input).
    cal_table.dropna(inplace=True)
    return cal_table

In [None]:
# Load both data sets; "\N" marks a missing value in the CSV files.
imc_data_file = '../data/data.csv'
kenema_data_file = '../data/kenema/data.csv'

imc_data = pd.read_csv(imc_data_file, na_values="\\N")
kenema_data = pd.read_csv(kenema_data_file, na_values="\\N")

test_data_folder = '../data/kenema/test'

# Linear map between the Kenema 'PCR' column and the IMC 'cycletime' column,
# built so that the PCR minimum corresponds to the largest cycle time and the
# PCR maximum to the smallest one:
#     PCR = a - b * CT    <=>    CT = vl2ct_c1 * PCR + vl2ct_c0
min_ct = imc_data['cycletime'].min()
max_ct = imc_data['cycletime'].max()
min_log_pcr = kenema_data['PCR'].min()
max_log_pcr = kenema_data['PCR'].max()
# print() with a single pre-formatted string gives the same output on
# Python 2 and Python 3.
print('%s %s' % (min_ct, max_log_pcr))
print('%s %s' % (max_ct, min_log_pcr))
b = (max_log_pcr - min_log_pcr) / (max_ct - min_ct)
a = min_log_pcr + b * max_ct
vl2ct_c1 = -1/b
vl2ct_c0 = +a/b
print(3*b)
print('%s %s' % (vl2ct_c1, vl2ct_c0))

In [None]:
# Models to plot, their legend labels and their line colours.
models = ['min', 'full']
alias = dict(min='minimal', full='full')
sns.set_style("white")
pal = sns.color_palette()
# The i-th model gets the i-th colour of the current seaborn palette.
colors = {name: pal[i] for i, name in enumerate(models)}

In [None]:
# ROC plots from bootstrap data
#
# For every model and every imputed data set, each bootstrap model is applied
# to the rows listed in its index file; the pooled predictions give one ROC
# curve per imputation, and the curves are averaged over imputations.

fig, ax = plt.subplots()
plt.xlim([-0.2, 1.1])
plt.ylim([-0.1, 1.1])
plt.plot([0, 1], [0, 1], '-', c='grey', linewidth=0.5)
plt.xlabel('1 - Specificity', labelpad=15)
plt.ylabel('Sensitivity', labelpad=15)

for model_name in models:
    print(model_name)
    # The pooled model is only loaded to obtain the variable names (and
    # therefore the column order) expected by the bootstrap models.
    model_params = os.path.join(model_name, 'mice.txt')
    model = LogRegModel(model_params)
    variables = ['Disposition'] + model.names

    boot_folder = os.path.join(model_name, 'boot')
    imp_folder = os.path.join(model_name, 'imp')

    data_files = glob.glob(imp_folder + '/imputation-*.csv')
    imp_fpr = []
    imp_tpr = []
    for fn in data_files:
        dat = pd.read_csv(fn, na_values="\\N")[variables]
        val = dat[dat.columns[1:]].values

        # Imputation identifier: the text between "imputation-" and ".csv".
        pos0 = fn.index("imputation-") + 11
        pos1 = fn.index(".csv")
        idx = fn[pos0:pos1]

        # glob returns files in arbitrary order; sort both lists so that the
        # k-th index file is paired with the k-th model file.
        # NOTE(review): the prefix pattern for imputation "1" would also
        # match files of imputation "10", "11", ... -- confirm the naming
        # scheme rules this out.
        index_files = sorted(glob.glob(boot_folder + '/index-' + idx + '*.txt'))
        model_files = sorted(glob.glob(boot_folder + '/model-' + idx + '*.txt'))

        ytrue = []
        probs = []
        nboot = len(index_files)
        # The loop variable is deliberately not called "b", so the global
        # slope "b" is not overwritten.
        for k in range(0, nboot):
            rows = []
            with open(index_files[k]) as ifile:
                for line in ifile:
                    # The first token of each line is skipped; the remaining
                    # tokens are 1-based row numbers.
                    pieces = line.split()[1:]
                    rows += [int(i) - 1 for i in pieces]

            ytrue += [int(v) for v in dat[dat.columns[0]].values[rows]]
            x = val[rows,:]

            model = LogRegModel(model_files[k], model_format='GLM')
            pboot = model.predict(x)
            probs += list(pboot)

        fpr, tpr, thresholds = roc_curve(ytrue, probs)
        imp_fpr += [fpr]
        imp_tpr += [tpr]

    # Average the curves on the union of all false-positive rates.
    # np.interp is the function the former scipy.interp alias pointed to.
    all_fpr = np.unique(np.concatenate(imp_fpr))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(0, len(imp_fpr)):
        mean_tpr += np.interp(all_fpr, imp_fpr[i], imp_tpr[i])
    mean_tpr /= len(imp_fpr)

    plt.plot(all_fpr, mean_tpr, color=colors[model_name], label=alias[model_name])

plt.legend(loc='lower right')

fig.savefig('roc-boostrap.pdf')

In [None]:
# Calibration plots from bootstrap data
#
# Same prediction pooling as for the bootstrap ROC curves, but every
# imputation yields a calibration table; the tables are averaged over
# imputations and smoothed with a cubic interpolant for display.

models = ['min', 'full']

fig, ax = plt.subplots()
plt.plot([0.05, 0.95], [0.05, 0.95], '-', c='grey', linewidth=0.5, zorder=1)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.xlabel('Predicted Risk', labelpad=15)
plt.ylabel('Observed Risk', labelpad=15)

for model_name in models:
    print(model_name)
    # The pooled model is only loaded to obtain the variable names (and
    # therefore the column order) expected by the bootstrap models.
    model_params = os.path.join(model_name, 'mice.txt')
    model = LogRegModel(model_params)
    variables = ['Disposition'] + model.names

    boot_folder = os.path.join(model_name, 'boot')
    imp_folder = os.path.join(model_name, 'imp')

    data_files = glob.glob(imp_folder + '/imputation-*.csv')
    imp_ppr = []
    imp_tpr = []
    for fn in data_files:
        dat = pd.read_csv(fn, na_values="\\N")[variables]
        val = dat[dat.columns[1:]].values

        # Imputation identifier: the text between "imputation-" and ".csv".
        pos0 = fn.index("imputation-") + 11
        pos1 = fn.index(".csv")
        idx = fn[pos0:pos1]

        # glob returns files in arbitrary order; sort both lists so that the
        # k-th index file is paired with the k-th model file.
        # NOTE(review): the prefix pattern for imputation "1" would also
        # match files of imputation "10", "11", ... -- confirm the naming
        # scheme rules this out.
        index_files = sorted(glob.glob(boot_folder + '/index-' + idx + '*.txt'))
        model_files = sorted(glob.glob(boot_folder + '/model-' + idx + '*.txt'))

        ytrue = []
        probs = []
        nboot = len(index_files)
        # The loop variable is deliberately not called "b", so the global
        # slope "b" is not overwritten.
        for k in range(0, nboot):
            rows = []
            with open(index_files[k]) as ifile:
                for line in ifile:
                    # The first token of each line is skipped; the remaining
                    # tokens are 1-based row numbers.
                    pieces = line.split()[1:]
                    rows += [int(i) - 1 for i in pieces]

            ytrue += [int(v) for v in dat[dat.columns[0]].values[rows]]
            x = val[rows,:]

            model = LogRegModel(model_files[k], model_format='GLM')
            pboot = model.predict(x)
            probs += list(pboot)

        cal_table = calibration_table(ytrue, probs, 10)
        x = cal_table['pred_prob']
        y = cal_table['true_prob']
        imp_ppr += [x]
        imp_tpr += [y]

    # Average the observed risk on the union of all predicted-risk values.
    # np.interp is the function the former scipy.interp alias pointed to.
    all_ppr = np.unique(np.concatenate(imp_ppr))
    mean_tpr = np.zeros_like(all_ppr)
    for i in range(0, len(imp_ppr)):
        mean_tpr += np.interp(all_ppr, imp_ppr[i], imp_tpr[i])
    mean_tpr /= len(imp_ppr)

    # Smooth the averaged curve on a grid twice as dense as the data.
    xnew = np.linspace(min(all_ppr), max(all_ppr), num=2 * len(all_ppr), endpoint=True)
    f = interp1d(all_ppr, mean_tpr, kind='cubic')
    plt.plot(xnew, f(xnew), color=colors[model_name], label=alias[model_name])

plt.legend(loc='lower right')

fig.savefig('cal-boostrap.pdf')

In [None]:
# ROC plots from complete KGH data
#
# Each pooled model is applied to the Kenema rows with DIAG == 1 that have no
# missing value in any of the selected variables.

fig, ax = plt.subplots()
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.plot([0, 1], [0, 1], '-', c='grey', linewidth=0.5)
plt.xlabel('1 - Specificity', labelpad=15)
plt.ylabel('Sensitivity', labelpad=15)

for model_name in models[0:2]:
    print(model_name)
    model_params = os.path.join(model_name, 'mice.txt')
    model = LogRegModel(model_params)

    # src_variables: columns read from the Kenema data;
    # variables: outcome first, then the predictors in the column order
    # passed to the model.
    # NOTE(review): no entry of `models` is called 'pres-min', so the first
    # branch looks unreachable and the 'min' model is scored only on rows
    # complete for the full variable list -- confirm this is intended.
    if model_name == 'pres-min':
        src_variables = ['OUT', 'PCR', 'AGE']
        variables = ['OUT', 'CT', 'AGE']
    else:
        src_variables = ['OUT', 'PCR', 'AGE', 'DIARR', 'WEAK', 'JAUN', 'BNONE', 'TEMP', 'HEADCH', 'VOMIT', 'PABD']
        variables = ['OUT', 'CT', 'AGE', 'TEMP', 'HEADCH', 'BLEED', 'DIARR', 'JAUN', 'VOMIT', 'PABD', 'WEAK']

    # Work on an explicit copy so the derived columns are added to this frame
    # and not to a view of kenema_data (avoids SettingWithCopyWarning).
    test_data = kenema_data[kenema_data['DIAG'] == 1][src_variables].copy()
    test_data['CT'] = vl2ct_c1 * test_data['PCR'] + vl2ct_c0
    if 'BLEED' in variables and 'BNONE' in src_variables:
        test_data['BLEED'] = 1 - test_data['BNONE']
    if 'JAUN' in variables:
        test_data['JAUN'] = 0 # all the non-missing values are 0, so MICE won't impute it

    test_data = test_data[variables]
    complete_data = test_data.dropna()
    x = complete_data[complete_data.columns[1:]].values
    ytrue = [int(v) for v in complete_data[complete_data.columns[0]].values]
    probs = model.predict(x)
    fpr, tpr, thresholds = roc_curve(ytrue, probs)
    plt.plot(fpr, tpr, color=colors[model_name], label=alias[model_name])

plt.legend(loc='lower right')

fig.savefig('roc-kgh-comp.pdf')

In [None]:
# Calibration plots from complete KGH data
#
# Each pooled model is applied to the Kenema rows with DIAG == 1 that have no
# missing value in any of the selected variables, and its calibration table
# is plotted.

fig, ax = plt.subplots()
plt.plot([0.05, 0.95], [0.05, 0.95], '-', c='grey', linewidth=0.5, zorder=1)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.xlabel('Predicted Risk', labelpad=15)
plt.ylabel('Observed Risk', labelpad=15)

for model_name in models[0:2]:
    print(model_name)
    model_params = os.path.join(model_name, 'mice.txt')
    model = LogRegModel(model_params)

    # src_variables: columns read from the Kenema data;
    # variables: outcome first, then the predictors in the column order
    # passed to the model.
    # NOTE(review): no entry of `models` is called 'pres-min', so the first
    # branch looks unreachable and the 'min' model is scored only on rows
    # complete for the full variable list -- confirm this is intended.
    if model_name == 'pres-min':
        src_variables = ['OUT', 'PCR', 'AGE']
        variables = ['OUT', 'CT', 'AGE']
    else:
        src_variables = ['OUT', 'PCR', 'AGE', 'DIARR', 'WEAK', 'JAUN', 'BNONE', 'TEMP', 'HEADCH', 'VOMIT', 'PABD']
        variables = ['OUT', 'CT', 'AGE', 'TEMP', 'HEADCH', 'BLEED', 'DIARR', 'JAUN', 'VOMIT', 'PABD', 'WEAK']

    # Work on an explicit copy so the derived columns are added to this frame
    # and not to a view of kenema_data (avoids SettingWithCopyWarning).
    test_data = kenema_data[kenema_data['DIAG'] == 1][src_variables].copy()
    test_data['CT'] = vl2ct_c1 * test_data['PCR'] + vl2ct_c0
    if 'BLEED' in variables and 'BNONE' in src_variables:
        test_data['BLEED'] = 1 - test_data['BNONE']
    if 'JAUN' in variables:
        test_data['JAUN'] = 0 # all the non-missing values are 0, so MICE won't impute it

    test_data = test_data[variables]
    complete_data = test_data.dropna()
    x = complete_data[complete_data.columns[1:]].values
    ytrue = [int(v) for v in complete_data[complete_data.columns[0]].values]
    probs = model.predict(x)
    cal_table = calibration_table(ytrue, probs, 10)
    x = cal_table['pred_prob']
    y = cal_table['true_prob']
    plt.plot(x, y, color=colors[model_name], label=alias[model_name])

plt.legend(loc='lower right')

fig.savefig('cal-kgh-comp.pdf')

In [None]:
# ROC plots from imputed KGH data
#
# Each pooled model is applied to every imputed data file found in
# <test_data_folder>/<model_name>; the resulting ROC curves are averaged.

fig, ax = plt.subplots()
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.plot([0, 1], [0, 1], '-', c='grey', linewidth=0.5)
plt.xlabel('1 - Specificity', labelpad=15)
plt.ylabel('Sensitivity', labelpad=15)

for model_name in models[0:2]:
    print(model_name)
    model_params = os.path.join(model_name, 'mice.txt')
    model = LogRegModel(model_params)

    # NOTE(review): no entry of `models` is called 'pres-min', so the first
    # branch looks unreachable -- confirm the intended model name.
    if model_name == 'pres-min':
        src_variables = ['OUT', 'PCR', 'AGE']
        variables = ['OUT', 'CT', 'AGE']
    else:
        src_variables = ['OUT', 'PCR', 'AGE', 'DIARR', 'WEAK', 'JAUN', 'BNONE', 'TEMP', 'HEADCH', 'VOMIT', 'PABD']
        variables = ['OUT', 'CT', 'AGE', 'TEMP', 'HEADCH', 'BLEED', 'DIARR', 'JAUN', 'VOMIT', 'PABD', 'WEAK']

    imp_data_folder = os.path.join(test_data_folder, model_name)
    imp_data_files = [join(imp_data_folder, f) for f in listdir(imp_data_folder) if isfile(join(imp_data_folder, f))]

    # NOTE(review): test_data is rebuilt here but the loop below reads the
    # imputed files instead; it is kept in case other cells rely on it.
    # The explicit copy avoids assigning columns on a view of kenema_data.
    test_data = kenema_data[kenema_data['DIAG'] == 1][src_variables].copy()
    test_data['CT'] = vl2ct_c1 * test_data['PCR'] + vl2ct_c0
    if 'BLEED' in variables and 'BNONE' in src_variables:
        test_data['BLEED'] = 1 - test_data['BNONE']
    if 'JAUN' in variables:
        test_data['JAUN'] = 0 # all the non-missing values are 0, so MICE won't impute it

    imp_fpr = []
    imp_tpr = []

    for fn in imp_data_files:
        # First column is the outcome, the remaining columns the predictors.
        data = pd.read_csv(fn)
        x = data[data.columns[1:]].values
        ytrue = [int(v) for v in data[data.columns[0]].values]
        probs = list(model.predict(x))
        fpr, tpr, thresholds = roc_curve(ytrue, probs)
        imp_fpr += [fpr]
        imp_tpr += [tpr]

    # Average the curves on the union of all false-positive rates.
    # np.interp is the function the former scipy.interp alias pointed to.
    all_fpr = np.unique(np.concatenate(imp_fpr))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(0, len(imp_fpr)):
        mean_tpr += np.interp(all_fpr, imp_fpr[i], imp_tpr[i])
    mean_tpr /= len(imp_fpr)
    plt.plot(all_fpr, mean_tpr, color=colors[model_name], label=alias[model_name])

plt.legend(loc='lower right')

fig.savefig('roc-kgh-imp.pdf')

In [None]:
# Calibration plots from imputed KGH data
#
# Each pooled model is applied to every imputed data file found in
# <test_data_folder>/<model_name>; the resulting calibration tables are
# averaged.

fig, ax = plt.subplots()
plt.plot([0.05, 0.95], [0.05, 0.95], '-', c='grey', linewidth=0.5, zorder=1)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.xlabel('Predicted Risk', labelpad=15)
plt.ylabel('Observed Risk', labelpad=15)

for model_name in models[0:2]:
    print(model_name)
    model_params = os.path.join(model_name, 'mice.txt')
    model = LogRegModel(model_params)

    # NOTE(review): no entry of `models` is called 'pres-min', so the first
    # branch looks unreachable -- confirm the intended model name.
    if model_name == 'pres-min':
        src_variables = ['OUT', 'PCR', 'AGE']
        variables = ['OUT', 'CT', 'AGE']
    else:
        src_variables = ['OUT', 'PCR', 'AGE', 'DIARR', 'WEAK', 'JAUN', 'BNONE', 'TEMP', 'HEADCH', 'VOMIT', 'PABD']
        variables = ['OUT', 'CT', 'AGE', 'TEMP', 'HEADCH', 'BLEED', 'DIARR', 'JAUN', 'VOMIT', 'PABD', 'WEAK']

    imp_data_folder = os.path.join(test_data_folder, model_name)
    imp_data_files = [join(imp_data_folder, f) for f in listdir(imp_data_folder) if isfile(join(imp_data_folder, f))]

    # NOTE(review): test_data is rebuilt here but the loop below reads the
    # imputed files instead; it is kept in case other cells rely on it.
    # The explicit copy avoids assigning columns on a view of kenema_data.
    test_data = kenema_data[kenema_data['DIAG'] == 1][src_variables].copy()
    test_data['CT'] = vl2ct_c1 * test_data['PCR'] + vl2ct_c0
    if 'BLEED' in variables and 'BNONE' in src_variables:
        test_data['BLEED'] = 1 - test_data['BNONE']
    if 'JAUN' in variables:
        test_data['JAUN'] = 0 # all the non-missing values are 0, so MICE won't impute it

    imp_ppr = []
    imp_tpr = []

    for fn in imp_data_files:
        # First column is the outcome, the remaining columns the predictors.
        data = pd.read_csv(fn)
        x = data[data.columns[1:]].values
        ytrue = [int(v) for v in data[data.columns[0]].values]
        probs = list(model.predict(x))
        cal_table = calibration_table(ytrue, probs, 10)
        x = cal_table['pred_prob']
        y = cal_table['true_prob']
        imp_ppr += [x]
        imp_tpr += [y]

    # Average the observed risk on the union of all predicted-risk values.
    # np.interp is the function the former scipy.interp alias pointed to.
    all_ppr = np.unique(np.concatenate(imp_ppr))
    mean_tpr = np.zeros_like(all_ppr)
    for i in range(0, len(imp_ppr)):
        mean_tpr += np.interp(all_ppr, imp_ppr[i], imp_tpr[i])
    mean_tpr /= len(imp_ppr)
    plt.plot(all_ppr, mean_tpr, color=colors[model_name], label=alias[model_name])

plt.legend(loc='lower right')

fig.savefig('cal-kgh-imp.pdf')