In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import aequitas.plot as ap
from aequitas.bias import Bias
from aequitas.group import Group
from postmodeling.evaluation import (
    get_predictions,
    rank_models,
    get_best_modelsets
)
from utils.helpers import (
    load_models,
    get_database_connection
)
import matplotlib
from joblib import load
from matplotlib import pyplot as plt
from sklearn.tree import plot_tree
from utils.constants import LABEL_MAPPING, MODELS_PATH

In [None]:
# Database connection handed to get_best_modelsets and plot_best_models below
db_conn = get_database_connection()

## Overview
This notebook creates precision plots for the best models and the best baselines across label groups, separately for Johnson County and Douglas County. For each county and label group, we select the model set that minimizes regret averaged over time splits. NB: One caveat is that not all model sets were run on all time splits.

## Finding best models and baselines

In [None]:
# Forwarded as `months_future` to every get_best_modelsets call below
months_future = 6

In [None]:
# Used for the final figure in the technical report
best_joco = get_best_modelsets(
    db_conn,
    county='joco',
    rank_on='regret',
    top=1,
    months_future=months_future,
    min_dates=4,
)

In [None]:
# Use min_dates = 5 to avoid selecting baselines that ran for only 4 validation splits
best_joco_baselines = get_best_modelsets(
    db_conn,
    county='joco',
    rank_on='regret',
    top=1,
    model_types=['FeatureRanker', 'LinearRanker'],
    months_future=months_future,
    min_dates=5,
)

In [None]:
# Used for the final figure in the technical report, use min_dates = 6 to get more splits
best_doco = get_best_modelsets(
    db_conn,
    county='doco',
    rank_on='regret',
    top=1,
    months_future=months_future,
    min_dates=6,
)

In [None]:
# Used for the final figure in the technical report, use min_dates = 8 to get more splits
best_doco_baselines = get_best_modelsets(
    db_conn,
    county='doco',
    rank_on='regret',
    top=1,
    model_types=['FeatureRanker', 'LinearRanker'],
    months_future=months_future,
    min_dates=8,
)

In [None]:
# Johnson County selection (top=1, ranked on regret, all model types)
best_joco

In [None]:
# Johnson County selection restricted to the FeatureRanker / LinearRanker baselines
best_joco_baselines

In [None]:
# Douglas County selection (top=1, ranked on regret, all model types)
best_doco

In [None]:
# Douglas County selection restricted to the FeatureRanker / LinearRanker baselines
best_doco_baselines

## Performance across label groups

In [None]:
# Tag every row with its origin (the flag is later passed to plot_best_models as
# `baselines`), then stack the model and baseline selections into a single frame.
# Note: the 'Baseline' column is added to best_joco / best_joco_baselines in place.
best_joco['Baseline'] = False # Assumes that the best model never is a baseline
best_joco_baselines['Baseline'] = True
df_joco = pd.concat([best_joco, best_joco_baselines])

In [None]:
def plot_best_models(db_conn, model_set_ids, label_groups, baselines, metric='precision', county='joco', figsize=(14, 10), legend=True, ylim=[-0.025, 0.70]):
    """Plot a test-set metric over time for selected (model set, label group) pairs.

    The evaluations are read from ``results.test_evaluations`` (or
    ``results.test_evaluations_doco_fixed`` when ``county == 'doco'``), reduced
    to the rows whose ``county_k`` equals 75 (``'joco'``) or 40 (any other
    county), and drawn as one line per label group. Models and baselines are
    distinguished by line style.

    Parameters
    ----------
    db_conn
        Connection object forwarded to ``pd.read_sql``.
    model_set_ids, label_groups, baselines
        Parallel iterables, paired positionally: entry ``i`` selects the rows
        of model set ``model_set_ids[i]`` for label group ``label_groups[i]``
        and marks them as a baseline when ``baselines[i]`` is truthy.
    metric : str
        Value matched against the ``metric`` column; also used in the title
        and the y-axis label.
    county : str
        ``'joco'`` or ``'doco'``; any value other than ``'joco'`` is titled
        "Douglas" and uses k = 40.
    figsize : tuple
        Size of the newly created figure.
    legend : bool
        Whether to draw a legend (placed outside the axes, upper left).
    ylim : sequence of two floats
        Limits of the y axis.

    Returns
    -------
    matplotlib.axes.Axes
        The axes returned by ``sns.lineplot``.

    Raises
    ------
    ValueError
        If no model set ids or no label groups are supplied.

    Notes
    -----
    Calls ``sns.set``, ``sns.set_style`` and ``plt.rc``, which change the
    global plotting configuration for all subsequent figures.
    """
    # Materialise the inputs: they are consumed twice (SQL filter and pairing),
    # which would silently yield nothing for one-shot iterators.
    model_set_ids = list(model_set_ids)
    label_groups = list(label_groups)
    baselines = list(baselines)
    if not model_set_ids or not label_groups:
        # An empty list would otherwise produce an invalid `in ()` SQL clause.
        raise ValueError('plot_best_models needs at least one model set id and one label group')

    model_set_id_str = ','.join(f"'{model_set_id}'" for model_set_id in model_set_ids)
    label_group_str = ','.join(f"'{label_group}'" for label_group in label_groups)

    if county == 'doco':
        earliest_date = '2019-09-01'
        tablename = 'test_evaluations_doco_fixed'
    else:
        earliest_date = '2017-12-01'
        tablename = 'test_evaluations'

    # NOTE(review): values are interpolated directly into the SQL text; only
    # pass trusted identifiers (as produced by get_best_modelsets) to this function.
    query = f'''
    select * from results.{tablename} te
        join results.models m
        using(model_id)
        join results.model_sets ms
        using(model_set_id)
        join results.experiments e
        using(experiment_id)
        where metric='{metric}'
        and county = '{county}'
        and model_set_id in ({model_set_id_str})
        and label_group in ({label_group_str})
        and as_of_date >= '{earliest_date}'::date;
    '''

    evaluations = pd.read_sql(query, db_conn)

    # NOTE: Add lower limit for Douglas

    county_k = 75 if county == 'joco' else 40
    evaluations = evaluations[evaluations['county_k'] == county_k]

    # The SQL filter matches the cross product of ids and label groups; keep
    # only the rows where model set id and label group belong to the same pair.
    selected = []
    for model_set_id, label_group, is_baseline in zip(model_set_ids, label_groups, baselines):
        pair_rows = evaluations[
            (evaluations['model_set_id'] == model_set_id)
            & (evaluations['label_group'] == label_group)
        ].copy()
        pair_rows['Type'] = 'Baseline' if is_baseline else 'Model'
        selected.append(pair_rows)

    df = pd.concat(selected)
    df['as_of_date'] = pd.to_datetime(df['as_of_date'], format='%Y-%m-%d')

    if county == 'doco':
        df = df[df['as_of_date'] > '2019-06-01']

    df = df.sort_values(by=['as_of_date'], ascending=True)
    # Dates are plotted as strings so that every split gets its own tick
    df['As of date'] = df['as_of_date'].astype('string')
    df['Label group'] = df['label_group']

    # Configure the global style before the figure exists so that it applies
    # identically on the first and on every later call.
    sns.set(font_scale=1.5)
    sns.set_style('white')
    plt.rc("axes.spines", top=False, right=False)

    # A single explicit figure; the former plt.clf() + plt.figure() pair left
    # an additional empty figure behind in notebook output.
    _fig, ax = plt.subplots(figsize=figsize)

    # One colour per entry of hue_order, so the palette length always matches
    # even when some label groups are absent from the data.
    hue_order = sorted(LABEL_MAPPING.keys())
    palette = sns.color_palette('colorblind', n_colors=len(hue_order))

    p = sns.lineplot(
        data=df, hue='Label group',
        x='As of date', y='value',
        style='Type', hue_order=hue_order,
        style_order=['Model', 'Baseline'], lw=4, legend=legend,
        palette=palette, ax=ax
    )
    sns.despine(ax=p)

    county_name = 'Johnson' if county == 'joco' else 'Douglas'
    title = county_name + ' county: ' + metric.capitalize() + ' across label groups'

    ylabel = metric.capitalize() + ' at ' + str(county_k)
    p.tick_params(axis='x', labelrotation=45)
    p.set(ylabel=ylabel, ylim=ylim)
    p.set_title(title, fontsize=24)

    if legend:
        p.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0, ncol=1, frameon=False)

    return p

In [None]:
p = plot_best_models(
    db_conn,
    model_set_ids=df_joco['model_set_id'],
    label_groups=df_joco['label_group'],
    baselines=df_joco['Baseline'],
    county='joco',
    ylim=[-0.025, 0.80],
)
#p.get_figure().savefig('precision_johnson_label_group.eps', dpi=200, bbox_inches='tight')

In [None]:
#p.get_figure().savefig('precision_johnson_label_group.eps', dpi=200, bbox_inches='tight')

In [None]:
# Tag every row with its origin (the flag is later passed to plot_best_models as
# `baselines`), then stack the model and baseline selections into a single frame.
# Note: the 'Baseline' column is added to best_doco / best_doco_baselines in place.
best_doco['Baseline'] = False # Assumes that the best model never is a baseline
best_doco_baselines['Baseline'] = True
df_doco = pd.concat([best_doco, best_doco_baselines])

In [None]:
p = plot_best_models(
    db_conn,
    model_set_ids=df_doco['model_set_id'],
    label_groups=df_doco['label_group'],
    baselines=df_doco['Baseline'],
    county='doco',
    ylim=[-0.025, 0.80],
)

## Best Decision Trees
Quick investigation of the best decision trees.

In [None]:
# Load the DecisionTreeClassifier models for the three ids of interest
# (the numeric ids are presumably model set ids — TODO confirm against load_models)
trees_douglas_potentially_fatal = load_models('DecisionTreeClassifier', 5157)
trees_douglas_deaths = load_models('DecisionTreeClassifier', 661)
trees_johnson_deaths = load_models('DecisionTreeClassifier', 886)

def get_feature_importance(tree):
    """Return the name of the most important feature of a fitted tree.

    Parameters
    ----------
    tree
        Fitted estimator exposing ``feature_importances_`` and
        ``feature_names_in_`` (e.g. a scikit-learn decision tree).

    Returns
    -------
    The entry of ``tree.feature_names_in_`` with the largest importance, or
    ``None`` when the tree uses no feature at all (every importance is zero).
    """
    importances = np.asarray(tree.feature_importances_)
    # A tree without splits has no informative feature; report that explicitly
    # instead of failing on an empty index array.
    if importances.size == 0 or not np.any(importances):
        return None
    # argmax picks the dominant feature; taking the first non-zero entry would
    # depend on column order whenever the tree splits on several features.
    return tree.feature_names_in_[int(np.argmax(importances))]

In [None]:
# NOTE(review): joblib.load unpickles the file, so only artifacts from the
# trusted MODELS_PATH directory should be loaded here.

# Tree for potentially fatal, Douglas county (id 5157)
latest_tree_pf = load(os.path.join(MODELS_PATH, 'DecisionTreeClassifier_5157_22549.joblib'))

# Tree for deaths, Douglas county (id 661)
latest_tree_death_d = load(os.path.join(MODELS_PATH, 'DecisionTreeClassifier_661_1898.joblib'))

# Tree for deaths, Johnson county (id 886)
latest_tree_death_j = load(os.path.join(MODELS_PATH, 'DecisionTreeClassifier_886_3190.joblib'))

In [None]:
# Draw latest_tree_pf, labelling split nodes with the feature names stored on the fitted tree
plt.figure(figsize=(12,12))
plot_tree(latest_tree_pf, feature_names=latest_tree_pf.feature_names_in_);

In [None]:
# Draw latest_tree_death_d, labelling split nodes with the feature names stored on the fitted tree
plt.figure(figsize=(18,12))
plot_tree(latest_tree_death_d, feature_names=latest_tree_death_d.feature_names_in_, fontsize=10);

In [None]:
# `best_trees` was never defined in this notebook (NameError on a fresh kernel);
# report the feature returned by get_feature_importance for each tree loaded above.
[
    get_feature_importance(tree)
    for tree in (latest_tree_pf, latest_tree_death_d, latest_tree_death_j)
]

In [None]:
# Draw latest_tree_death_j, labelling split nodes with the feature names stored on the fitted tree
plt.figure(figsize=(18,12))
plot_tree(latest_tree_death_j, feature_names=latest_tree_death_j.feature_names_in_, fontsize=10);