# Analyze the best performing models of each type

In [1]:
# Load modules
import os
import sys
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
# Load the help functions
from support.acc_funs import fast_auc, fast_decomp
from support.support_funs import makeifnot, decomp_var

from scipy.stats import rankdata
from time import time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Set directories
# Every path hangs off the parent of the kernel's working directory, so the
# notebook must be launched from its own folder for these to resolve.
dir_base = os.getcwd()
dir_NSQIP = os.path.join(dir_base, '..')
dir_output, dir_figures = (os.path.join(dir_NSQIP, sub) for sub in ('output', 'figures'))
# Project helper; presumably creates the folder when it is missing -- confirm in support_funs
makeifnot(dir_figures)

# Display labels keyed by the short codes used in the data
di_model = dict(logit='Logistic-L2', rf='RandomForest', xgb='XGBoost')
di_outcome = dict(adv='ADV', aki='AKI', cns='CNS',
                  nsi='nSSIs', ssi='SSIs', unplan='UPLN')
di_method = dict(agg='Aggregate', sub='CPT-model')

In [2]:
# # Dummy data to make sure fast approximations line up
# y = np.array([1,1,1,0,0,0,0,0])
# s = np.array([6,5,7,5,3,2,2,1])
# g = np.array(['a','a','b','a','b','b','b','c'])
# print(fast_decomp(y,s,g,ret_df=True))

### (1) Load in data

In [3]:
# Build (or reload) the stacked prediction table `df_nsqip`.
#
# Input files are those in `dir_output` whose name starts with 'best' and ends
# with '.csv'. They are concatenated once and cached as `df_best.csv`; later
# runs read the cache instead of looping over the individual files.
fn_output = pd.Series(os.listdir(dir_output), dtype=object)
# Literal prefix/suffix tests: the previous '.csv$' pattern had an unescaped
# dot and therefore also accepted names such as 'best_csv'.
is_best_csv = fn_output.str.startswith('best') & fn_output.str.endswith('.csv')
# `drop` must be passed by keyword: it is keyword-only from pandas 2.0 onwards.
fn_best = fn_output[is_best_csv].reset_index(drop=True)

fn_csv = 'df_best.csv'
check_csv = fn_csv in os.listdir(dir_output)
if not check_csv:
    print('Loading data via loop')
    if len(fn_best) == 0:
        # Fail with an actionable message instead of pd.concat's
        # "No objects to concatenate".
        raise FileNotFoundError('No best*.csv files found in %s' % dir_output)
    holder = []
    cn_keep = ['model','outcome','test_year','cpt','y','preds']
    for fn in fn_best:
        print('Loading file: %s' % fn)
        path = os.path.join(dir_output, fn)
        tmp_df = pd.read_csv(path, usecols=cn_keep)
        # The file's own 'model' column is relabelled 'method' (the cell
        # output below shows values agg/sub); the model type is taken from
        # the last '_'-separated token of the file name instead.
        tmp_df = tmp_df.rename(columns={'model':'method'})
        mdl = fn.split('.')[0].split('_')[-1]
        tmp_df.insert(0,'model',mdl)
        holder.append(tmp_df)
        del tmp_df
    df_nsqip = pd.concat(holder).reset_index(drop=True)
    del holder
    df_nsqip.to_csv(os.path.join(dir_output, fn_csv), index=False)
else:
    print('Loading large CSV file')
    df_nsqip = pd.read_csv(os.path.join(dir_output, fn_csv))

Loading large CSV file


In [4]:
# Sanity check: number of prediction rows for each (model, method) pair
df_nsqip.groupby(by=['model', 'method']).size()

model  method
logit  agg       4668432
       sub       3185519
rf     agg       4668432
       sub       3185519
xgb    agg       4668432
       sub       3185519
dtype: int64

In [5]:
# qq = df_nsqip.query('model=="logit" & test_year==2014 & outcome=="agg_ssi1"').groupby(['method','cpt']).size()
# qq = df_nsqip.query('model=="logit"').groupby(['method','outcome','test_year','cpt']).size().reset_index()
# qq.groupby(['method','outcome','test_year']).size().sort_values()

### (2.A) Get the decomposed scores

In [6]:
# Decompose performance for every (model, test_year, outcome, method) group and
# pick, per outcome, the label version with the highest mean within-CPT AUROC.
# The decomposition is cached in `dat_decomp.csv` because it is slow to compute.
cn_gg = ['model','test_year','outcome','method']

fn_decomp = 'dat_decomp.csv'
check_decomp = fn_decomp in os.listdir(dir_output)
if not check_decomp:
    print('Running decomposition')
    stime = time()
    # fast_decomp is a project helper; it receives labels, scores and the CPT
    # grouping vector for each group (return schema not visible here -- the
    # code below relies on it having 'tt' and 'auc' columns).
    dat_decomp = df_nsqip.groupby(cn_gg).apply(lambda x: fast_decomp(x.y.values, x.preds.values, x.cpt.values))
    dtime = time() - stime
    print('Took %0.1f seconds to run decomp' % dtime)
    # groupby-apply leaves the helper's own row index as an unnamed extra
    # level, which reset_index names 'level_<n>'; it carries no information.
    dat_decomp = dat_decomp.reset_index().drop(columns='level_'+str(len(cn_gg)))
    dat_decomp.to_csv(os.path.join(dir_output, fn_decomp), index=False)
else:
    print('Loading decomposition')
    dat_decomp = pd.read_csv(os.path.join(dir_output, fn_decomp))
# Clean up the outcome label
# `regex=` is spelled out on every call: pandas 2.0 switched the default of
# Series.str.replace to regex=False, which would silently turn the two
# character-class patterns below into literal (no-op) replacements.
dat_decomp['outcome'] = dat_decomp['outcome'].str.replace('agg_', '', regex=False)
# Version = the digits embedded in the label; labels without digits are version 1
dat_decomp['version'] = dat_decomp['outcome'].str.replace('[^0-9]', '', regex=True)
dat_decomp['version'] = np.where(dat_decomp['version'] == '', '1', dat_decomp['version']).astype(int)
# Outcome = the lowercase letters only (digits and anything else stripped)
dat_decomp['outcome'] = dat_decomp['outcome'].str.replace('[^a-z]', '', regex=True)
# Subset to within
# `drop` is passed by keyword because it is keyword-only from pandas 2.0 onwards.
dat_decomp_within = dat_decomp.query('tt == "within"').reset_index(drop=True).drop(columns='tt')
# Find the best outcome/version
best_decomp_all = dat_decomp_within.groupby(['outcome','version']).auc.mean().reset_index()
# Sort so the highest mean AUROC comes first within each outcome, then keep that row
best_decomp_all = best_decomp_all.sort_values(['outcome','auc'],ascending=[True,False]).groupby('outcome').head(1)
best_decomp_all = best_decomp_all.drop(columns='auc').reset_index(drop=True)
best_decomp_all
# # Subset the decomposition
# dat_decomp_within_best = best_decomp_all.merge(dat_decomp_within,'left',['outcome','version'])#.drop(columns=['version'])

Loading decomposition


Unnamed: 0,outcome,version
0,adv,1
1,aki,1
2,cns,1
3,nsi,4
4,ssi,1
5,unplan,2


### (2.B) Get within distribution as well

In [7]:
# Decomposition computed with ret_df=True, cached on disk as `dat_decomp_cpt.csv`.
# Uses the same grouping columns (`cn_gg`) as the aggregate decomposition.
fn_decomp_cpt = 'dat_decomp_cpt.csv'
check_decomp_cpt = fn_decomp_cpt in os.listdir(dir_output)
if check_decomp_cpt:
    print('Loading decomposition by the cpt')
    dat_decomp_cpt = pd.read_csv(os.path.join(dir_output, fn_decomp_cpt))
else:
    print('Running decomposition for within')
    stime = time()
    # ret_df=True presumably makes fast_decomp return its per-CPT table
    # rather than the summary -- TODO confirm in support.acc_funs
    dat_decomp_cpt = df_nsqip.groupby(cn_gg).apply(
        lambda grp: fast_decomp(grp.y.values, grp.preds.values, grp.cpt.values, ret_df=True)
    )
    dtime = time() - stime
    print('Took %0.1f seconds to run decomp' % dtime)
    # Turn the group keys back into ordinary columns before caching
    dat_decomp_cpt = dat_decomp_cpt.reset_index()
    dat_decomp_cpt.to_csv(os.path.join(dir_output, fn_decomp_cpt), index=False)

Loading decomposition by the cpt


### (3.A) Decompose variation

In [17]:
# Summary statistics of the within-CPT AUROC across all rows of dat_decomp_within
dat_decomp_within['auc'].describe()

count    360.000000
mean       0.655480
std        0.051711
min        0.434036
25%        0.635302
50%        0.653020
75%        0.671503
max        0.835229
Name: auc, dtype: float64

In [8]:
# # Find best model for 
# best_decomp_mdl = dat_decomp_within_best.groupby(['outcome','method','model']).auc.mean().reset_index()
# best_decomp_mdl = best_decomp_mdl.sort_values(['outcome','method','auc'],ascending=[True,True,False])
# best_decomp_mdl = best_decomp_mdl.groupby(['outcome','method']).head(1).reset_index(None, True).drop(columns='auc')
# dat_decomp_within_mdl = dat_decomp_within.merge(best_decomp_mdl,'left',['model','outcome','method'])

In [9]:
# # Get the average within AUROC
# dat_auc_version = dat_decomp.query('tt=="within"').groupby(['model','outcome','method','version']).auc.mean().reset_index()
# dat_auc_version = dat_auc_version.sort_values(['outcome','auc'],ascending=[True,False]).reset_index(None, True)
# best_version = dat_auc_version.groupby(['outcome']).head(1).drop(columns='auc').reset_index(None, True)

### (3) Use figures to visualize

1. Variation between models: no free lunch
2. Variation over time: fairly similar?
3. Variation over method: aggregate models generally do the best
4. Show within AUROC for winning models
5. Distribution of within AUROC by....

### (3.A) Within AUROC performance is similar across models

In [10]:
# xlabs = [di_model[cc] for cc in dat_decomp_within_best.model.unique()]
# gg_decomp_within = (ggplot(dat_decomp_within_best, aes(x='model',y='auc',color='model')) + 
#                     theme_bw() + geom_boxplot() + 
#                     facet_wrap('~outcome',labeller=labeller(outcome=di_outcome)) + 
#                     theme(legend_position='none',axis_title_x=element_blank(),axis_text_x=element_text(angle=90)) + 
#                     labs(y='Within-CPT AUROC') +
#                     ggtitle('Within AUORIC variation over years/CPT-subset') + 
#                     scale_x_discrete(labels=xlabs))
# h, w = 4, 8
# gg_decomp_within.save(os.path.join(dir_figures,'gg_decomp_within.png'),base_height=h, base_width=w, verbose=False)
# plotnine.options.figure_size = (w, h)
# gg_decomp_within

In [11]:
# gg_decomp_mdl = (ggplot(dat_decomp_within_mdl, aes(x='outcome',y='auc',shape='model',color='test_year')) + 
#                     theme_bw() + geom_jitter(random_state=1, width=0.1, height=0) + 
#                     theme(axis_title_x=element_blank(),axis_text_x=element_text(angle=90)) + 
#                     labs(y='Within-CPT AUROC') +
#                     ggtitle('Within AUORIC variation over years\nOnly shows best performing model') + 
# #                 scale_color_discrete(name='Method',labels=list(di_method.values())) + 
#                 scale_shape_manual(name='Model', labels=list(di_model.values()), values=['$L$','$R$','$X$']) + 
#                 scale_x_discrete(labels=list(di_outcome.values())))
# h, w = 4.5, 7
# # gg_decomp_mdl.save(os.path.join(dir_figures,'gg_decomp_mdl.png'),base_height=h, base_width=w, verbose=False)
# plotnine.options.figure_size = (w, h)
# gg_decomp_mdl

In [12]:
# di_method = {'agg':'Aggregate', 'sub':'Subset'}
