# Analyze the best performing models of each type

In [None]:
# Load modules
import os
import sys
from time import time
from scipy import stats
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
# Load the helper functions
from support.acc_funs import fast_auc, fast_decomp
from support.support_funs import makeifnot, decomp_var

from scipy.stats import rankdata
from time import time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Project folders, resolved relative to the current working directory
dir_base = os.getcwd()
dir_NSQIP = os.path.join(dir_base, '..')
dir_output, dir_figures = (os.path.join(dir_NSQIP, sub) for sub in ('output', 'figures'))
makeifnot(dir_figures)

# Display labels for the model, outcome and method codes
di_model = dict(logit='Logistic-L2', rf='RandomForest', xgb='XGBoost')
di_outcome = dict(adv='ADV', aki='AKI', cns='CNS',
                  nsi='nSSIs', ssi='SSIs', unplan='UPLN')
di_method = dict(agg='Aggregate', sub='CPT-model')

In [None]:
# # Dummy data to make sure fast approximations line up
# y = np.array([1,1,1,0,0,0,0,0])
# s = np.array([6,5,7,5,3,2,2,1])
# g = np.array(['a','a','b','a','b','b','b','c'])
# print(fast_decomp(y,s,g,ret_df=True))

## (1) Load in data

In [2]:
# Index the files in the output folder and keep the prediction files named best*.csv
fn_output = pd.Series(os.listdir(dir_output))
# startswith/endswith match literally (the old '.csv$' pattern let the dot match any character)
fn_best = fn_output[fn_output.str.endswith('.csv') & fn_output.str.startswith('best')].reset_index(drop=True)

# The stacked predictions are cached in a single CSV so the loop below only has to run once
fn_csv = 'df_best.csv'
check_csv = fn_csv in os.listdir(dir_output)
if not check_csv:
    print('Loading data via loop')
    holder = []
    cn_keep = ['model','outcome','test_year','cpt','y','preds']
    for fn in fn_best:
        print('Loading file: %s' % fn)
        path = os.path.join(dir_output, fn)
        # The raw `model` column is relabelled `method`; `model` is re-created from the file name
        tmp_df = pd.read_csv(path, usecols=cn_keep).rename(columns={'model':'method'})
        # Model code = last "_"-separated token of the file stem
        mdl = fn.split('.')[0].split('_')[-1]
        tmp_df.insert(0,'model',mdl)
        holder.append(tmp_df)
        del tmp_df
    # drop= must be passed by keyword: pandas 2.x no longer accepts it positionally
    df_nsqip = pd.concat(holder).reset_index(drop=True)
    df_nsqip['outcome'] = df_nsqip['outcome'].str.replace('agg_', '', regex=False)
    # regex=True is explicit because the default flipped to False in pandas 2.0,
    # which would silently turn these character classes into literal strings
    df_nsqip['version'] = df_nsqip['outcome'].str.replace('[^0-9]', '', regex=True)
    # Outcomes without a numeric suffix are treated as version 1
    df_nsqip['version'] = np.where(df_nsqip['version'] == '', '1', df_nsqip['version']).astype(int)
    df_nsqip['outcome'] = df_nsqip['outcome'].str.replace('[^a-z]', '', regex=True)
    del holder
    print('Writing to file')
    df_nsqip.to_csv(os.path.join(dir_output, fn_csv), index=False)
else:
    print('Loading large CSV file')
    df_nsqip = pd.read_csv(os.path.join(dir_output, fn_csv))
df_nsqip.head(2)

Loading large CSV file


Unnamed: 0,model,y,preds,cpt,test_year,outcome,method,version
0,xgb,0,0.000325,c24538,2016,nsi,agg,1
1,xgb,0,0.000952,c31541,2016,nsi,agg,1


In [None]:
# Number of rows for each model/method combination
df_nsqip.groupby(['model','method']).size()

## (2) Calculate within/between (all years and no years)

In [13]:
def write_fast_decomp(df, fn, cn, ret_df=False):
    """
    Take raw y/preds scores, run fast_decomp within every group defined by `cn`,
    and cache the result as a CSV inside `dir_output`.

    If `fn` already exists in `dir_output`, that cached file is loaded and
    nothing is recomputed.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns `y`, `preds`, `cpt` and every column in `cn`.
    fn : str
        File name (inside `dir_output`) used to store/load the decomposition.
    cn : list of str
        Columns to group by before calling fast_decomp.
    ret_df : bool, default False
        Forwarded unchanged to fast_decomp.

    Returns
    -------
    pd.DataFrame
        The decomposition, either freshly computed or read back from `fn`.
    """
    path = os.path.join(dir_output, fn)
    if os.path.isfile(path):
        print('Decomposition already exists, loading: %s' % fn)
        # Read the file that was asked for. This branch used to read the global
        # fn_within_year, so any other cached `fn` returned the wrong table.
        return pd.read_csv(path)
    print('Running fast_decomp for %s' % fn)
    stime = time()
    tmp = df.groupby(cn).apply(lambda x:
               fast_decomp(x.y.values, x.preds.values, x.cpt.values, ret_df=ret_df))
    print('Took %0.1f seconds to run decomp' % (time() - stime))
    # The index of each per-group result becomes an unnamed level after the group keys
    tmp = tmp.reset_index().drop(columns='level_'+str(len(cn)))
    tmp.to_csv(path, index=False)
    return tmp

"""
FUNCTION TO GENERATE FAST BOOTSTRAPS with fast_decomp:
i) the within AUROC is a weighted sum of the CPT AUROCs
ii) AFTER the CPT aurocs are generated, we can bootstrap these, rather than bootstrapping the data and then calculating
"""

# def bootstrap_within_decomp():

In [14]:
# Decomposition computed separately for each test year
fn_within_year = 'df_within_year.csv'
cn_gg1 = ['model', 'test_year', 'outcome', 'version', 'method']
df_within_year = write_fast_decomp(df_nsqip, fn_within_year, cn_gg1, ret_df=False)

# Decomposition pooled over all test years
fn_within = 'df_within.csv'
cn_gg2 = ['model', 'outcome', 'version', 'method']
df_within = write_fast_decomp(df_nsqip, fn_within, cn_gg2, ret_df=False)


Decomposition already exists, loading: df_within_year.csv
Running fast_decomp for df_within.csv
Took 49.0 seconds to run decomp


In [17]:
# USE AS THE BASIS FOR BOOTSTRAP WITHIN
# Timing experiment in two parts:
#   (1) run fast_decomp with ret_df=True for every cn_gg2 group, keeping the rows it returns
#   (2) resample those rows with replacement 100 times and recompute the
#       n0n1-weighted average of auc for every group
stime = time()
qq = df_nsqip.groupby(cn_gg2).apply(lambda x: 
                   fast_decomp(x.y.values, x.preds.values, x.cpt.values, ret_df=True))
print('Took %0.1f seconds to run decomp' % (time() - stime))
# Drop the unnamed index level that groupby-apply adds after the group keys
qq = qq.reset_index().drop(columns='level_'+str(len(cn_gg2)))

# The result of each iteration is discarded; only the elapsed time is reported.
# No random_state is passed to sample(), so the draws differ from run to run.
# NOTE(review): the message below still says "decomp" although it times the resampling loop.
stime = time()
for ii in range(100):
    qq.groupby(cn_gg2).sample(frac=1,replace=True).groupby(cn_gg2).apply(lambda x: np.sum(x.auc*x.n0n1)/np.sum(x.n0n1))
print('Took %0.1f seconds to run decomp' % (time() - stime))

Took 49.0 seconds to run decomp


In [None]:
# NOTE(review): `df_decomp` is not created by any cell visible in this notebook, so this
# cell fails on a fresh kernel. It must have the columns tt, method, outcome, version,
# model, auc and den - confirm which table it is meant to be.
# Subset to within
df_decomp_within = (df_decomp.query('tt == "within" & method=="agg"')
                    .reset_index(drop=True)  # keyword form: pandas 2.x rejects positional drop
                    .drop(columns=['tt','method']))
# Find the best outcome/version (highest mean AUROC per outcome)
best_decomp_all = df_decomp_within.groupby(['outcome','version']).auc.mean().reset_index()
best_decomp_all = best_decomp_all.sort_values(['outcome','auc'],ascending=[True,False]).groupby('outcome').head(1)
best_decomp_all = best_decomp_all.drop(columns='auc').reset_index(drop=True)
print(best_decomp_all)
# Subset the decomposition to the best version of every outcome
df_decomp_within = df_decomp_within.merge(best_decomp_all, how='inner', on=['outcome','version'])
# Find the best within model for the best version (den-weighted mean AUROC)
best_decomp_mdl = df_decomp_within.groupby(['outcome','model']).apply(lambda x: np.sum(x.auc*x.den)/x.den.sum()).reset_index()
best_decomp_mdl = best_decomp_mdl.rename(columns={0:'auc'}).sort_values(['outcome','auc'],ascending=[True,False])
best_decomp_mdl = best_decomp_mdl.groupby(['outcome']).head(1).reset_index(drop=True).drop(columns='auc')
print(best_decomp_mdl)
# Keep every decomposition type (not only "within") for the winning version/model
df_decomp_mdl = df_decomp.merge(best_decomp_all, how='inner', on=['outcome','version'])
df_decomp_mdl = df_decomp_mdl.merge(best_decomp_mdl, how='inner', on=['outcome','model']).query('method=="agg"')
# Re-assign, not inplace: the frame comes out of .query(), where an inplace
# drop can trigger SettingWithCopy warnings
df_decomp_mdl = df_decomp_mdl.drop(columns=['method','model'])

### (2.B) Inference on within/between AUROCs FOR BEST LABEL VERSION

In [None]:
# Bootstrap the within-CPT AUROC for every row of dat_decomp_within and attach
# percentile confidence bounds (lb/ub). The bootstrap draws are cached in dir_output.
# NOTE(review): `dat_decomp_within` is not created by any cell visible above (an earlier
# cell builds `df_decomp_within`); confirm which table is intended before running.
fn_inf_within = 'dat_decomp.csv'
check_inf_within = fn_inf_within in os.listdir(dir_output)
if not check_inf_within:
    print('Running Bootstraps on within AUROC')
    stime = time()
    holder = []
    n_bs = 1000
    cn_drop = ['model','test_year','outcome','version','method']
    squery = 'model==@model & test_year==@test_year & outcome==@outcome & version==@version & method=="agg"'
    for ii, rr in dat_decomp_within.iterrows():
        print('Row %i of %i' % (ii+1, dat_decomp_within.shape[0]))
        model, test_year, outcome, version = rr['model'], rr['test_year'], rr['outcome'], rr['version']
        tmp = df_nsqip.query(squery).copy()
        # Keep only CPTs with a positive class
        tmp2 = tmp.groupby('cpt').apply(lambda x:
                pd.Series({'n1':(x.y==1).sum(),'n0':(x.y==0).sum()})).reset_index()
        tmp2 = tmp2.query('n1 > 0').reset_index(drop=True)
        tmp = tmp.merge(tmp2[['cpt']], how='inner', on='cpt')
        # Remove superfluous columns
        tmp = tmp.drop(columns=cn_drop)
        # Get the internal ranks: rank of every score within its own CPT.
        # transform() keeps the original row index. The earlier like-indexed groupby.apply gets
        # the group key prepended to the index in pandas 2.x, making sort_values('cpt') ambiguous.
        tmp = tmp[['y','preds','cpt']].assign(r=tmp.groupby('cpt')['preds'].transform(rankdata))
        tmp = tmp.sort_values('cpt').query('y == 1').reset_index(drop=True)
        # ---- RUN VECTORIZED BOOTSTRAP ---- #
        # Draw n_bs * len(tmp) positive rows at once, then deal them out at random
        # into n_bs bootstrap replicates (idx) of len(tmp) rows each
        tmp_bs = tmp.sample(frac=n_bs,replace=True, random_state=ii).reset_index(drop=True)
        tmp_bs['idx'] = pd.Series(tmp_bs.index % n_bs).sample(frac=1, random_state=ii).values
        # Rank sum and number of positives per replicate/CPT
        tmp_bs = tmp_bs.groupby(['idx','cpt'])['r'].agg(r_s='sum', n1='size').reset_index()
        tmp_bs = tmp_bs.merge(tmp2[['cpt','n0']], on='cpt').assign(n1=lambda x: x.n1.astype(int), n1n0 = lambda x: x.n1 * x.n0)
        # Rank-sum form of the AUROC per CPT
        tmp_bs = tmp_bs.assign(auc = lambda x: (x.r_s - x.n1*(x.n1+1)/2)/x.n1n0)
        # Within AUROC of a replicate = n1n0-weighted mean of its CPT AUROCs
        tmp_bs = tmp_bs.groupby('idx').apply(lambda x: np.sum(x.auc*x.n1n0)/np.sum(x.n1n0))
        tmp_bs = tmp_bs.reset_index().rename(columns={0:'auc'})
        tmp_bs = tmp_bs.assign(model=model, test_year=test_year, outcome=outcome, version=version)
        holder.append(tmp_bs)
        print(tmp_bs.head())
        # Estimate the runtime
        nsec, nleft = time() - stime, dat_decomp_within.shape[0] - (ii+1)
        rate = (ii+1)/nsec
        print('ETA: %0.1f minutes left' % (nleft / rate / 60))
    dat_inf_within = pd.concat(holder)
    dat_inf_within.to_csv(os.path.join(dir_output, fn_inf_within), index=False)
else:
    print('Loading existing file')
    dat_inf_within = pd.read_csv(os.path.join(dir_output, fn_inf_within))
# Percentile confidence interval from the bootstrap distribution
alpha = 0.05
# Normal critical value, only needed by the commented-out standard-error alternative below
cv = stats.norm.ppf(1 - alpha/2)
cn_gg = ['model', 'test_year', 'outcome', 'version']
cn_qq = [alpha/2, 1-alpha/2]
tmp_inf = dat_inf_within.groupby(cn_gg).auc.quantile(cn_qq).reset_index()
# The quantile level comes back as an unnamed index level; pivot it into lb/ub columns
tmp_inf = tmp_inf.pivot_table(values='auc', index=cn_gg, columns='level_'+str(len(cn_gg))).rename(columns=dict(zip(cn_qq, ['lb','ub']))).reset_index()
dat_decomp_within = dat_decomp_within.merge(tmp_inf)
# tmp_inf = dat_inf_within.groupby(cn_gg).auc.std(ddof=1).reset_index().rename(columns={'auc':'se'})
# dat_decomp_within = dat_decomp_within.merge(tmp_inf).assign(lb=lambda x: x.auc-cv*x.se, ub=lambda x: x.auc+cv*x.se)

### (3.A) AUROC on the CPT level

In [None]:
# Per-CPT decomposition pooled over years, cached as a CSV in dir_output
cn_gg = ['model','outcome','version','method']  #,'test_year'

fn_decomp_cpt = 'dat_decomp_cpt.csv'
check_decomp_cpt = fn_decomp_cpt in os.listdir(dir_output)
if check_decomp_cpt:
    print('Loading decomposition by the cpt')
    dat_decomp_cpt = pd.read_csv(os.path.join(dir_output, fn_decomp_cpt))
else:
    print('Running decomposition for within')
    stime = time()
    dat_decomp_cpt = df_nsqip.groupby(cn_gg).apply(
        lambda x: fast_decomp(x.y.values, x.preds.values, x.cpt.values, ret_df=True)
    )
    dtime = time() - stime
    print('Took %0.1f seconds to run decomp' % dtime)
    # Turn the group keys into columns and discard the unnamed inner index level
    dat_decomp_cpt = dat_decomp_cpt.reset_index().drop(columns='level_'+str(len(cn_gg)))
    dat_decomp_cpt.to_csv(os.path.join(dir_output, fn_decomp_cpt), index=False)

In [None]:
gg = ['model','outcome','version']
# tmp_decomp = dat_decomp_cpt.merge(best_decomp_all, 'inner', ['outcome','version'])

# Simple mean (auc) and n0n1-weighted mean (wauc) of the CPT-level AUROCs, in long format
tmp_decomp = (
    dat_decomp_cpt.query("method=='agg'")
    .groupby(gg)
    .apply(lambda x: pd.Series({'auc':x.auc.mean(),
                                'wauc':np.sum(x.auc*x.n0n1)/x.n0n1.sum(),
                                'den':x.n0n1.sum()}))
    .reset_index()
    .melt(id_vars=gg+['den'], value_vars=None, var_name='tt')
    .assign(den=lambda x: x.den.astype(int))
)

# Same two summaries computed from the within-AUROC table, weighted by den
tmp_decomp2 = (
    dat_decomp_within
    .groupby(gg)
    .apply(lambda x: pd.Series({'auc':x.auc.mean(),
                                'wauc':np.sum(x.auc*x.den)/x.den.sum(),
                                'den':x.den.sum()}))
    .reset_index()
    .melt(id_vars=gg+['den'], value_vars=None, var_name='tt')
    .assign(den=lambda x: x.den.astype(int))
)

In [None]:
# Differences between the two summaries: value (err_auc) and denominator (err_den)
(
    tmp_decomp
    .merge(tmp_decomp2, how='inner', on=gg+['tt'])
    .assign(err_auc=lambda x: x.value_x - x.value_y,
            err_den=lambda x: x.den_x - x.den_y)
    .head(4)
)


In [None]:
# Spot-check: rows of the within table for model "logit" and outcome "adv"
dat_decomp_within.query('model=="logit"&outcome=="adv"')

In [None]:
# Peek at the first row to see the column layout
dat_decomp_cpt.head(1)

In [None]:
# Spot-check a single group (g == "c21175"), sorted by ascending auc
# NOTE(review): `g` presumably holds the CPT code passed to fast_decomp - confirm
dat_decomp_cpt.query('g=="c21175"').sort_values('auc')

### (3.B) Inference on the CPT level, aggregated over years for BEST MODEL

In [None]:
# Restrict the raw predictions to the best label version and best model of each outcome
# (aggregate method only). The merge keys are spelled out, not left to whatever
# columns the frames happen to share.
df_nsqip_cpt = (df_nsqip
                .merge(best_decomp_all, how='inner', on=['outcome','version'])
                .merge(best_decomp_mdl, how='inner', on=['outcome','model'])
                .query('method=="agg"'))
# drop= must be passed by keyword: pandas 2.x no longer accepts it positionally
df_nsqip_cpt = df_nsqip_cpt.reset_index(drop=True).drop(columns=['method','model','version'])

In [None]:
# Histogram of log(1 + number of positive labels) per outcome/CPT, pooled over all years
qq = df_nsqip_cpt.groupby(['outcome','cpt'])['y'].sum().reset_index()
(
    ggplot(qq, aes(x='np.log(y+1)'))
    + geom_histogram(color='black', fill='red', alpha=0.5, bins=25)
    + theme_bw()
    + facet_wrap('~outcome', scales='free')
)

In [None]:
# Evaluates e^2 (about 7.39)
# NOTE(review): presumably used to read the log(y+1) axis of the histogram back on the
# count scale - confirm, or remove this scratch cell
np.exp(2)

#### NUMBER OF CPTS PER OUTCOME/CLASS

In [None]:
# Count the distinct CPT groups (g) available for each model/test_year/outcome/method
# NOTE(review): the cell that builds dat_decomp_cpt groups without test_year (it is
# commented out there); confirm the cached table really has a test_year column
tmp1 = (
    dat_decomp_cpt
    .groupby(['model','test_year','outcome','method','g'])
    .size()
    .reset_index()
    .drop(columns=0)
)
tmp2 = (
    tmp1
    .groupby(['model','test_year','outcome','method'])
    .size()
    .reset_index(name='n')
)
# Summary statistics of the CPT counts, split by method
tmp2.groupby('method')['n'].describe()

### (2.C) Merge with the inference

In [None]:
# Output files whose name contains "boot_"
fn_boot = [name for name in fn_output if 'boot_' in name]

# Preview the first 100 rows of the first such file only
for fn in fn_boot[:1]:
    tmp = pd.read_csv(os.path.join(dir_output, fn), nrows=100)
    print('------ %s ------' % fn)
    print(tmp.head(2))

In [None]:
# Output files whose name contains "sig_"
fn_sig = [name for name in fn_output if 'sig_' in name]

# Preview the first 100 rows of the first such file only
for fn in fn_sig[:1]:
    tmp = pd.read_csv(os.path.join(dir_output, fn), nrows=100)
    print('------ %s ------' % fn)
    print(tmp.head(2))

## (3) Decompose variation

### (3.A) Variation between models

There is little variation in performance between models, with the exception of a few kidney models in some years. The figure also shows that, for any given model, there is very little variation between years.

In [None]:
# Within-CPT AUROC by test year, one panel per outcome, with the bootstrap bounds as line ranges
tmp = dat_decomp_within.assign(model=lambda x: x.model.map(di_model))
posd = position_dodge(0.5)
w, h = 8, 4
plotnine.options.figure_size = (w, h)
gg_auc_within = (
    ggplot(tmp, aes(x='test_year.astype(str)', y='auc', color='model'))
    + theme_bw()
    + geom_point(position=posd)
    + geom_linerange(aes(ymin='lb', ymax='ub'), position=posd)
    + facet_wrap('~outcome', labeller=labeller(outcome=di_outcome))
    + theme(axis_text_x=element_text(angle=90))  # ,axis_ticks_minor_y=element_blank()
    + labs(y='Within-CPT AUROC', x='Test year')
    + scale_y_continuous(limits=[0.25,1], breaks=list(np.arange(0.25,1.01,0.25)))
    + geom_hline(yintercept=0.5, linetype='--')
    + scale_color_discrete(name='Model')
    + ggtitle('Linerange shows 95% bootstrap CI')
)
gg_auc_within.save(os.path.join(dir_figures,'gg_auc_within.png'), base_height=h, base_width=w, verbose=False)
gg_auc_within

In [None]:
# Between vs within AUROC by test year for every outcome (the "total" rows are left out)
# NOTE(review): `dat_decomp_mdl` is not created by any cell visible above (an earlier cell
# builds `df_decomp_mdl`), and `posd` comes from the previous plotting cell - confirm
tmp = dat_decomp_mdl.query('tt != "total"')
w, h = 8, 4
plotnine.options.figure_size = (w, h)
gg_between = (
    ggplot(tmp, aes(x='test_year.astype(str)', y='auc', color='tt'))
    + theme_bw()
    + geom_point(position=posd)
    + facet_wrap('~outcome', labeller=labeller(outcome=di_outcome))
    + scale_color_discrete(name='AUC type', labels=['Between','Within'])
    + labs(y='AUROC', x='Test year')
    + geom_hline(yintercept=0.5, linetype='--')
    + scale_y_continuous(limits=[0.25,1], breaks=list(np.arange(0.25,1.01,0.25)))
)
gg_between

### (3) Use figures to visualize

1. Largest variation between: between vs within AUROC
2. The within AUROC has quite a lot of variation over CPT codes
3. \# of CPTs for a given year that make the cut-off

### (3.A) Within AUROC performance is similar across models

In [None]:
# xlabs = [di_model[cc] for cc in dat_decomp_within_best.model.unique()]
# gg_decomp_within = (ggplot(dat_decomp_within_best, aes(x='model',y='auc',color='model')) + 
#                     theme_bw() + geom_boxplot() + 
#                     facet_wrap('~outcome',labeller=labeller(outcome=di_outcome)) + 
#                     theme(legend_position='none',axis_title_x=element_blank(),axis_text_x=element_text(angle=90)) + 
#                     labs(y='Within-CPT AUROC') +
#                     ggtitle('Within AUORIC variation over years/CPT-subset') + 
#                     scale_x_discrete(labels=xlabs))
# h, w = 4, 8
# gg_decomp_within.save(os.path.join(dir_figures,'gg_decomp_within.png'),base_height=h, base_width=w, verbose=False)
# plotnine.options.figure_size = (w, h)
# gg_decomp_within

In [None]:
# gg_decomp_mdl = (ggplot(dat_decomp_within_mdl, aes(x='outcome',y='auc',shape='model',color='test_year')) + 
#                     theme_bw() + geom_jitter(random_state=1, width=0.1, height=0) + 
#                     theme(axis_title_x=element_blank(),axis_text_x=element_text(angle=90)) + 
#                     labs(y='Within-CPT AUROC') +
#                     ggtitle('Within AUORIC variation over years\nOnly shows best performing model') + 
# #                 scale_color_discrete(name='Method',labels=list(di_method.values())) + 
#                 scale_shape_manual(name='Model', labels=list(di_model.values()), values=['$L$','$R$','$X$']) + 
#                 scale_x_discrete(labels=list(di_outcome.values())))
# h, w = 4.5, 7
# # gg_decomp_mdl.save(os.path.join(dir_figures,'gg_decomp_mdl.png'),base_height=h, base_width=w, verbose=False)
# plotnine.options.figure_size = (w, h)
# gg_decomp_mdl

In [None]:
# di_method = {'agg':'Aggregate', 'sub':'Subset'}
