In [None]:
%load_ext autoreload
%autoreload 2

import os
from sqlalchemy import create_engine

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import utils

from IPython.display import Image
from IPython.core.interactiveshell import InteractiveShell
# Have IPython display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Database connection: the SQLAlchemy URL is read from the DBURL environment
# variable (a missing variable raises KeyError here), so no credentials live
# in the notebook itself.
dburl = os.environ['DBURL']
engine = create_engine(dburl)

-----

In [None]:
# Model groups compared throughout the notebook.
# Insertion order matters: later cells iterate best_6months.values(), which
# fixes the plotting and legend order.
best_6months = {
    'sklearn.ensemble.RandomForestClassifier': 20854,
    'triage.component.catwalk.estimators.classifiers.ScaledLogisticRegression': 20892,
    'Expert Rules': 21065,
}

# The best model groups selected by low disparity based on FOR by race.
best_mgs_by_for = [20890, 20888, 20868]

# Human-readable legend label for every model group id.
mgs_labels = {
    20854: 'Random Forest (1000 trees, no max depth)',
    20892: "Logistic Regression (L1 penalty, C=0.1)",
    21065: "Expert Baseline",
    20890: "Logistic Regression (L1 penalty, C=0.001)",
    20888: "Logistic Regression (L1 penalty, C=0.00001)",
    20868: "Decision Tree (no max depth)",
}

# Plot colour per model group: a distinct colour for each of the best-overall
# groups and one shared grey for all low-disparity groups.
colors = {
    20854: '#00A1D5FF',
    20892: "#79AF97FF",
    21065: "#B24745FF",
    **dict.fromkeys(best_mgs_by_for, '#374E55FF'),
}

# Model id of the random-forest group at 2015-01-01, as resolved by
# utils.get_model_id_at_time; both names refer to the same value.
best_access_mid = utils.get_model_id_at_time(
    best_6months['sklearn.ensemble.RandomForestClassifier'],
    '2015-01-01',
)
model_id_to_plot = best_access_mid

In [None]:
# Build one row per model, across all selected model groups, holding its
# precision in the top 10% and the value returned by utils.get_for_race
# (plotted in later cells as the Black/White FOR ratio).
demo = utils.get_demographics(best_access_mid) # just demographics; combine with predictions later

metric = 'precision@'
parameter = '10.0_pct'

# Rows are accumulated in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
records = []
for mg_id in list(best_6months.values()) + best_mgs_by_for:
    print("----------------------------\n", mg_id)
    # Evaluations for every model of this group; filtered per model below.
    p_at_10 = utils.get_model_evaluation_with_model(mg_id, metric, parameter)
    for _idx, m in utils.get_models_same_mg(mg_id).iterrows():
        m_id = m['model_id']
        for_race = utils.get_for_race(m_id, demo)
        records.append({
            'model_group_id': mg_id,
            'model_id': m_id,
            'train_end_time': m['train_end_time'],
            # First matching evaluation row; raises IndexError if the model has none.
            'p_at_10': p_at_10[p_at_10.model_id == m_id]['value'].values[0],
            'for': for_race,
        })

# Explicit columns keep the frame usable by later cells even when no model
# was found (an empty record list would otherwise yield a column-less frame).
access_models_for = pd.DataFrame(
    records,
    columns=['model_group_id', 'model_id', 'train_end_time', 'p_at_10', 'for'],
)

In [None]:
# Disparity over time: one line per model group showing the 'for' column of
# access_models_for against train_end_time.

# Not used by this figure (its y-axis is the FOR ratio, not precision); kept so
# the notebook-level names hold the same values as before.
metric = 'precision@'
parameter = '10.0_pct'

# Style and context are applied BEFORE the figure is created so the axes pick
# up the poster settings on the first run as well as on re-runs.
sns.set_style("whitegrid")
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 4,"lines.markersize":12})
fig, ax = plt.subplots(1, figsize=(24, 10))

# Low-disparity groups are drawn first and slightly transparent, then the
# best-overall groups on top, fully opaque (alpha=None is matplotlib's default).
for group_ids, alpha in ((best_mgs_by_for, 0.8), (best_6months.values(), None)):
    for model_group_id in group_ids:
        group_rows = (
            access_models_for[access_models_for.model_group_id == model_group_id]
            .sort_values('train_end_time')
        )
        ax.plot(group_rows.train_end_time, group_rows['for'],
                label=mgs_labels[model_group_id],
                marker='o', linestyle='-', color=colors[model_group_id], alpha=alpha)

_ = ax.set_ylabel(r"$\frac{FOR_{Black}}{FOR_{White}}$", fontsize=24)
_ = ax.set_xlabel("Year of Appointment for Validation Cohort", fontsize=24)
_ = ax.set_ylim(0.5, 1.8)
# Shaded band marking ratios between 0.9 and 1.1.
_ = ax.axhspan(0.9, 1.1, alpha=0.3, color='#6A659999')

_ = ax.legend()
plt.show()

In [None]:
# Per-model-group summary statistics (count, mean, std, min, max and the
# requested percentiles) of the 'for' and 'p_at_10' columns.
disp = (
    access_models_for
    .loc[access_models_for.model_group_id.isin(best_mgs_by_for + list(best_6months.values()))]
    .groupby('model_group_id')[['for', 'p_at_10']]
    .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
)
# Flatten the two-level (column, statistic) header into names such as
# 'for_mean' or 'p_at_10_25%'.
disp.columns = ['_'.join((measure, stat)).strip() for measure, stat in disp.columns]

In [None]:
def plot_2d_boxplot(plt, d, c, label):
    """Draw a mean marker with percentile bars along both axes.

    The marker sits at (``p_at_10_mean``, ``for_mean``). Around it, a thick bar
    (linewidth 10) spans the 25%-75% columns and a bar at the default width
    spans the 5%-95% columns, first horizontally (``p_at_10_*``) and then
    vertically (``for_*``).

    Args:
        plt: object exposing ``plot`` and ``errorbar``.
        d: data with ``p_at_10_mean``, ``for_mean`` and the matching
            ``5%``, ``25%``, ``75%`` and ``95%`` columns.
        c: colour used for the marker and every bar.
        label: legend label, attached to the mean marker only.

    Returns:
        The ``plt`` object that was passed in.
    """
    x_mean = d.p_at_10_mean
    y_mean = d.for_mean

    def _whiskers(center, low_col, high_col):
        # errorbar takes [distance below, distance above] relative to the centre.
        return [center - d[low_col], d[high_col] - center]

    plt.plot(x_mean, y_mean, marker='o', linestyle='', color=c, label=label)

    # Options shared by all four bars: no marker, no connecting line.
    bar_kwargs = {'marker': '', 'linestyle': '', 'color': c}

    # Horizontal spread.
    plt.errorbar(x_mean, y_mean,
                 xerr=_whiskers(x_mean, 'p_at_10_25%', 'p_at_10_75%'),
                 linewidth=10, **bar_kwargs)
    plt.errorbar(x_mean, y_mean,
                 xerr=_whiskers(x_mean, 'p_at_10_5%', 'p_at_10_95%'),
                 **bar_kwargs)

    # Vertical spread.
    plt.errorbar(x_mean, y_mean,
                 yerr=_whiskers(y_mean, 'for_25%', 'for_75%'),
                 linewidth=10, **bar_kwargs)
    plt.errorbar(x_mean, y_mean,
                 yerr=_whiskers(y_mean, 'for_5%', 'for_95%'),
                 **bar_kwargs)
    return plt


In [None]:
# Precision vs. disparity: one mean marker with percentile bars per model group.
sns.set_style('whitegrid')
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 2.25,"lines.markersize":20})
fig, ax = plt.subplots(1, 1, figsize=(24, 10))

# disp is indexed by model_group_id, so every group holds a single summary row.
for group_id, summary in disp.groupby('model_group_id'):
    plot_2d_boxplot(plt, summary, colors[group_id], mgs_labels[group_id])

# Shaded band marking ratios between 0.9 and 1.1.
_ = ax.axhspan(0.9, 1.1, alpha=0.3, color='#6A659999')
_ = ax.set_ylabel(r'$\frac{FOR_{Black}}{FOR_{White}}$', fontsize=40)
_ = ax.set_xlabel("Average Positive Predictive Value for top 10%")
_ = ax.set_ylim(0.5, 1.8)

# Keep only the legend entries whose label does not contain 'mean'.
legend_entries = [
    (handle, text)
    for handle, text in zip(*ax.get_legend_handles_labels())
    if 'mean' not in text
]
legend_handles = [handle for handle, _text in legend_entries]
legend_labels = [text for _handle, text in legend_entries]
_ = ax.legend(legend_handles, legend_labels, ncol=2, loc="upper center")
plt.show()