In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import logging
import os
# Show INFO-level records so the logging.info calls made while plotting are visible.
logging.basicConfig(level=logging.INFO)

# The database URL is read from the DBURL environment variable (a missing
# variable raises KeyError here). `engine` is the SQLAlchemy engine passed to
# the pd.read_sql calls below.
dburl = os.environ['DBURL']
engine = create_engine(dburl)
# model_comment value used to select rows of model_metadata.models for the first analysis.
model_comment = 'CDPH_12months_access'
# Directories for trained models and matrices; neither name is referenced in
# the visible part of this notebook.
path_to_models = '/gpfs/data/dsapp-lab/triage-production_runs_small/trained_models/'
path_to_matrices = '/gpfs/data/dsapp-lab/triage-production_runs_small/matrices/'

In [None]:
def plot_time_splits(model_comment):
    """Draw the train/test time splits returned by the ``time_splits.sql`` query.

    The query text is read from disk and every occurrence of the placeholder
    ``model_comment_holder`` is replaced with ``model_comment`` before it is
    run against the module-level ``engine``.

    One horizontal row is drawn per returned record, at the height of the
    record's index: a red line for the training window, a dashed red line from
    the training end to training end plus label timespan, and a dashed blue
    line from the test start to test end plus label timespan.

    Each record is unpacked positionally, so the query must return exactly six
    columns in the order: train start, train end, test start, test end,
    train label timespan, test label timespan.
    """
    sql_path = '../../pipeline_CDPH_3.0/analysis/sql/time_splits.sql'
    with open(sql_path) as sql_file:
        query = sql_file.read().replace('model_comment_holder', model_comment)

    date_columns = ['train_start_time', 'train_end_time', 'test_start_time', 'test_end_time']
    splits = pd.read_sql(query, engine, parse_dates=date_columns)

    sns.set_style("whitegrid")
    fig, ax = plt.subplots(1, figsize=(18, 12))
    sns.set_context("poster", font_scale=0.5, rc={"lines.linewidth": 1, "lines.markersize": 4})

    for row_pos, record in splits.iterrows():
        (train_start, train_end, test_start, test_end,
         train_label_timespan, test_label_timespan) = record
        train_label_end = train_end + train_label_timespan
        logging.info('train_end+train_label_timespan: {}+{}={}'.format(
            train_end, train_label_timespan, train_label_end))
        logging.info('train_label_timespan {} test_label_timespan {}'.format(
            train_label_timespan, test_label_timespan))

        # Every segment of one split shares the same y position.
        level = [row_pos, row_pos]
        plt.plot([train_start, train_end], level, marker='o', color='red')
        plt.plot([train_end, train_label_end], level,
                 marker='o', linestyle='--', color='red')
        plt.plot([test_start, test_end + test_label_timespan], level,
                 marker='o', linestyle='--', color='blue')

    # The y positions are only row indices, so their tick labels are hidden.
    ax.axes.yaxis.set_ticklabels([])
    plt.ylabel('time splits')
    plt.xlabel('time')
    sns.despine()

In [None]:
def grab_model_evaluation(model_group_id, metric, parameter):
    """Fetch the evaluation series of one model group for a metric/parameter pair.

    Reads ``evaluation_start_time`` and ``value`` from
    ``test_results.evaluations`` for every model of ``model_group_id`` (looked
    up in ``model_metadata.models``), ordered by ``evaluation_end_time``, using
    the module-level ``engine``.

    Args:
        model_group_id: model group identifier, interpolated into the query.
        metric: value compared with the ``metric`` column (e.g. 'precision@').
        parameter: value compared with the ``parameter`` column (e.g. '1.0_pct').

    Returns:
        A pair ``(x, y)`` of equal-length tuples: the evaluation start times
        and the matching values. Both tuples are empty when the query returns
        no rows.

    Note:
        The arguments are placed into the SQL text with plain string
        formatting, so only pass trusted values.
    """

    q=f"""select
        evaluation_start_time,
        value
    from
        test_results.evaluations
    where
        model_id in (select distinct model_id from model_metadata.models where model_group_id = {model_group_id}) and
        metric = '{metric}' and
        parameter = '{parameter}' and
        evaluation_end_time < '2046-02-02'::date
    order by
          evaluation_end_time;

    """
    dfx = pd.read_sql(q,engine,parse_dates=['evaluation_start_time'])
    rows = dfx.values.tolist()
    if not rows:
        # zip(*[]) yields nothing, so unpacking it into two names would fail
        # with "not enough values to unpack"; return an empty series instead.
        logging.warning(
            'no evaluations found for model_group_id=%s metric=%s parameter=%s',
            model_group_id, metric, parameter)
        return (), ()
    x, y = zip(*rows)
    return x, y

In [None]:
def make_plot_label(row):
    """Build the short legend tag for one model row.

    ``row`` must provide ``'hyperparameters'`` and ``'model_type'``. Every tag
    except the dummy one also embeds ``row['model_group_id']`` and a few
    hyperparameters of the model family:

    * random forest       -> ``RF_<group>_n<n_estimators>_d<max_depth>``
    * decision tree       -> ``DT_<group>_d<max_depth>``
    * logistic regression -> ``SLR_<group>_C<C>_p<penalty>`` (any model type
      containing ``'LogisticRegression'``)
    * xgboost             -> ``GB_<group>_n<n_estimators>_d<max_depth>``
    * dummy classifier    -> ``Dummy``

    Raises:
        ValueError: if the model type matches none of the families above.
    """
    params = row['hyperparameters']
    kind = row['model_type']

    if kind == 'sklearn.ensemble.RandomForestClassifier':
        return f"RF_{row['model_group_id']!s}_n{params['n_estimators']!s}_d{params['max_depth']!s}"
    if kind == 'sklearn.tree.DecisionTreeClassifier':
        return f"DT_{row['model_group_id']!s}_d{params['max_depth']!s}"
    if 'LogisticRegression' in kind:
        return f"SLR_{row['model_group_id']!s}_C{params['C']!s}_p{params['penalty']!s}"
    if kind == 'xgboost.sklearn.XGBClassifier':
        return f"GB_{row['model_group_id']!s}_n{params['n_estimators']!s}_d{params['max_depth']!s}"
    if kind == 'sklearn.dummy.DummyClassifier':
        return 'Dummy'
    raise ValueError('Never Seen: '+ kind)

In [None]:
def audition_graph(metric,parameter,ls_model_group_tag,legend=True, color=None):
    """Plot a metric over time for several model groups, plus a grey baseline.

    For every ``(model_group_id, tag)`` pair the series returned by
    ``grab_model_evaluation(model_group_id, metric, parameter)`` is drawn. A
    grey line labelled 'baserate' is then added: the same metric with
    parameter ``'100.0_pct'`` for the last model group in the list.

    Args:
        metric: metric name (e.g. 'precision@'); also used in the y-axis label.
        parameter: metric parameter (e.g. '1.0_pct'); also used in the y-axis label.
        ls_model_group_tag: iterable of ``(model_group_id, tag)`` pairs.
        legend: draw the legend when truthy.
        color: when falsy, each group is labelled with its tag and takes the
            next colour of the matplotlib cycle. When truthy, every group is
            labelled 'model' and drawn in one colour: ``color`` itself if it
            is a string or tuple, otherwise 'blue' (so ``color=True`` keeps
            producing blue lines).

    Raises:
        ValueError: if ``ls_model_group_tag`` contains no pairs.
    """
    # Materialise first: the baseline needs the last pair, and an empty input
    # previously failed only after the loop with an UnboundLocalError.
    pairs = list(ls_model_group_tag)
    if not pairs:
        raise ValueError('ls_model_group_tag is empty: there is no model group to plot')

    if not color:
        line_color = None
    elif isinstance(color, (str, tuple)):
        line_color = color
    else:
        line_color = 'blue'

    sns.set_style("whitegrid")
    fig, ax = plt.subplots(1,figsize=(52, 24))
    sns.set_context("poster", font_scale=2., rc={"lines.linewidth": 1.25,"lines.markersize":18})
    ax.set_ylim(0, 1)
    ax.set_ylabel(f'{metric}:{parameter}')

    for model_group_id, tag in pairs:
        x, y = grab_model_evaluation(model_group_id, metric, parameter)
        if line_color is None:
            ax.plot(x, y, label=tag, marker='o', linestyle='-', linewidth=6)
        else:
            ax.plot(x, y, label='model', marker='o', linestyle='-', linewidth=6,
                    color=line_color)

    # The baseline is taken from the last model group, as before.
    baseline_group_id = pairs[-1][0]
    x_baseline, y_baseline = grab_model_evaluation(baseline_group_id, metric, '100.0_pct')
    ax.plot(x_baseline, y_baseline, label='baserate', marker='o', linestyle='-',
            linewidth=6, color='grey')

    if legend:
        ax.legend(bbox_to_anchor=(0., 1.005, 1., .102), loc=7, ncol=6, borderaxespad=0.)
    sns.despine()

In [None]:
# Load every row of model_metadata.models tagged with the current
# `model_comment` and attach the legend tag built by make_plot_label.
q = f"select * from model_metadata.models where model_comment = '{model_comment}';"
df_models = pd.read_sql(q, engine)
df_models['plot_label'] = df_models.apply(make_plot_label, axis=1)


In [None]:
# Model group id selected for each model type (keys are model_type strings).
# NOTE: the next code cell rebinds best_mgs_1 to a single-entry dict before
# anything reads this three-entry mapping.
best_mgs_1 = {'sklearn.tree.DecisionTreeClassifier': 21128,
 'triage.component.catwalk.estimators.classifiers.ScaledLogisticRegression': 21136,
 'sklearn.ensemble.RandomForestClassifier': 21144}

# 12 Months Access 

In [None]:
# Restrict the selection to the random forest model group 21144; the cells
# below filter df_models with the values of this dict.
best_mgs_1 = {'sklearn.ensemble.RandomForestClassifier': 21144}

In [None]:
# Plot metric 'precision@' with parameter '1.0_pct' over time for the model
# groups listed in best_mgs_1, as a single blue 'model' line per group.
metric, parameter = 'precision@', '1.0_pct'
df_models_top1 = df_models.loc[df_models['model_group_id'].isin(best_mgs_1.values())]
# One (model_group_id, plot_label) pair per model group.
ls_model_group_tag = (
    df_models_top1[['model_group_id', 'plot_label']]
    .drop_duplicates()
    .values
)
audition_graph(metric, parameter, ls_model_group_tag, color=True)

In [None]:
# Draw the train/test time splits for the models tagged with the current model_comment.
plot_time_splits(model_comment)

In [None]:
# Production figure: keep the rows of df_models whose model group id is one of
# the values of best_mgs_1. This repeats the df_models_top1 assignment made in
# the earlier audition-plot cell.
df_models_top1=df_models[df_models['model_group_id'].isin(best_mgs_1.values())]

In [None]:
# Display the rows of df_models that belong to model group 21144.
df_models[df_models.model_group_id == 21144]

In [None]:
# Feature importances stored for model_id 94982, largest importance first.
# NOTE(review): 94982 is presumably one of the models of group 21144 (the two
# ids are passed together to ModelEvaluator further down) - confirm.
q_features = "select * from train_results.feature_importances where model_id = 94982 order by feature_importance desc;"
df_features_94982 = pd.read_sql(q_features,engine)

In [None]:
# Preview the first rows, i.e. the features with the largest importance.
df_features_94982.head()

In [None]:
# Average the feature importances of model 94982 per feature group and draw
# the group averages as horizontal bars.

# Readable names for the feature groups; a group missing from this mapping
# raises KeyError when the names are looked up below.
dict_english_mapping = {
    'cd4': 'cd4 tests',
    'demographics': 'demographics',
    'dxstatus': 'diagnosis status',
    'location': 'location',
    'prevappts': 'previous appointment history',
    'trancateg': 'transmission category',
    'vl': 'viral load tests',
}

# The feature group is the part of the feature name before the first '_'.
df_features_94982['feature_group'] = df_features_94982['feature'].map(
    lambda name: name.split('_')[0])

df_feature_sum = (
    df_features_94982[['feature_group', 'feature_importance']]
    .groupby('feature_group')
    .mean()
    .reset_index()
)
df_feature_sum['feature'] = df_feature_sum['feature_group'].map(
    lambda group: dict_english_mapping[group])
df_feature_sum = df_feature_sum.set_index('feature')

fig, ax = plt.subplots(1, figsize=(12, 5))
sns.set_context("poster", font_scale=1.25, rc={"lines.linewidth": 1.25, "lines.markersize": 8})
df_feature_sum['feature_importance'].sort_values(ascending=True).plot.barh(xlim=(0, .05))
# NOTE(review): on a horizontal bar chart the importances run along the x-axis,
# while this text labels the y (category) axis - confirm that is intended.
plt.ylabel('relative avg importance\n of a feature class')

In [None]:
# Display the mean importance of each feature group (the values behind the bar chart).
df_feature_sum['feature_importance']

In [None]:
from triage.component.postmodeling.contrast.utils.aux_funcs import create_pgconn, get_models_ids
from triage.component.catwalk.storage import ProjectStorage, ModelStorageEngine, MatrixStorageEngine
from triage.component.postmodeling.contrast.parameters import PostmodelParameters
from triage.component.postmodeling.contrast.model_evaluator import ModelEvaluator
from triage.component.postmodeling.contrast.model_group_evaluator import ModelGroupEvaluator

In [None]:
# Evaluator for the selected model.
# NOTE(review): the arguments are presumably (model_group_id, model_id, engine),
# since 21144 is used as a model group id and 94982 as a model id in the
# queries above - confirm against the ModelEvaluator signature.
modelevaluation  = ModelEvaluator(21144, 94982, engine)

In [None]:
# Precision/recall plot (going by the method name) for this model on a 12x12 figure.
modelevaluation.plot_precision_recall_n(figsize=(12,12))

In [None]:
# Same plot with the x-axis limited to [0, 0.05].
modelevaluation.plot_precision_recall_n(figsize=(12,12), xlim=[0,0.05])

# 6 Months Access

In [None]:
# Switch the analysis to the 6-month models. This rebinds both `model_comment`
# and `df_models`, so the cells below no longer see the 12-month rows.
model_comment = 'CDPH_6months_access_memo'
q = f"select * from model_metadata.models where model_comment = '{model_comment}';"
df_models = pd.read_sql(q, engine)
df_models['plot_label'] = df_models.apply(make_plot_label, axis=1)


In [None]:
# Plot metric 'precision@' with parameter '1.0_pct' over time for the random
# forest model group 21157, as a single blue 'model' line.
best_mgs_1 = {'sklearn.ensemble.RandomForestClassifier': 21157}
metric, parameter = 'precision@', '1.0_pct'
df_models_top1 = df_models.loc[df_models['model_group_id'].isin(best_mgs_1.values())]
# One (model_group_id, plot_label) pair per model group.
ls_model_group_tag = (
    df_models_top1[['model_group_id', 'plot_label']]
    .drop_duplicates()
    .values
)
audition_graph(metric, parameter, ls_model_group_tag, color=True)

In [None]:
# Draw the train/test time splits for the 6-month models; the literal repeats
# the value assigned to `model_comment` at the start of this section.
plot_time_splits(model_comment='CDPH_6months_access_memo')

In [None]:
# Feature importances stored for model_id 96042, largest importance first,
# followed by a preview of the first rows.
q_features = "select * from train_results.feature_importances where model_id = 96042 order by feature_importance desc;"
df_features_96042 = pd.read_sql(q_features,engine)
df_features_96042.head()

In [None]:
# Average the feature importances of model 96042 per feature group and draw
# the group averages as horizontal bars.

# Readable names for the feature groups; a group missing from this mapping
# raises KeyError when the names are looked up below.
dict_english_mapping = {
    'cd4': 'cd4 tests',
    'demographics': 'demographics',
    'dxstatus': 'diagnosis status',
    'location': 'location',
    'prevappts': 'previous appointment history',
    'trancateg': 'transmission category',
    'vl': 'viral load tests',
}

# The feature group is the part of the feature name before the first '_'.
df_features_96042['feature_group'] = df_features_96042['feature'].map(
    lambda name: name.split('_')[0])

df_feature_sum = (
    df_features_96042[['feature_group', 'feature_importance']]
    .groupby('feature_group')
    .mean()
    .reset_index()
)
df_feature_sum['feature'] = df_feature_sum['feature_group'].map(
    lambda group: dict_english_mapping[group])
df_feature_sum = df_feature_sum.set_index('feature')

fig, ax = plt.subplots(1, figsize=(12, 5))
sns.set_context("poster", font_scale=1.5, rc={"lines.linewidth": 1.25, "lines.markersize": 8})
df_feature_sum['feature_importance'].sort_values(ascending=True).plot.barh(xlim=(0, .05))
# NOTE(review): on a horizontal bar chart the importances run along the x-axis,
# while this text labels the y (category) axis - confirm that is intended.
plt.ylabel('relative avg importance\n of a feature class')

In [None]:
# Display the mean importance of each feature group (the values behind the bar chart).
df_feature_sum['feature_importance']

In [None]:
# Evaluator for the 6-month model, then its precision/recall plot (going by
# the method name) on a 12x12 figure.
# NOTE(review): the arguments are presumably (model_group_id, model_id, engine),
# since 21157 is used as a model group id and 96042 as a model id in the
# cells above - confirm against the ModelEvaluator signature.
modelevaluation  = ModelEvaluator(21157, 96042, engine)
modelevaluation.plot_precision_recall_n(figsize=(12,12))

In [None]:
# Same plot with the x-axis limited to [0, 0.05].
modelevaluation.plot_precision_recall_n(figsize=(12,12), xlim=[0,0.05])