# Model Selection
Uses the model-run information saved in the database to compare the performance of all models and to select the top models based on precision at 10% and AUC.

In [None]:
import pandas as pd
import logging
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sqlalchemy import create_engine

from IPython.display import set_matplotlib_formats
# Ask IPython to render inline matplotlib figures in the 'retina' format.
set_matplotlib_formats('retina')


In [None]:
# Helper Functions  
def run_query(query):
    """Run a SQL query against the ``kcmo-mc`` database and return the rows.

    Args:
        query: SQL text (or any query object accepted by ``pd.read_sql``).

    Returns:
        pandas.DataFrame: the rows returned by the query.
    """
    engine = create_engine("postgresql:///kcmo-mc")
    try:
        # The context managers end the transaction and close the connection
        # even when the query raises.
        with engine.connect() as db_conn, db_conn.begin():
            return pd.read_sql(query, db_conn)
    finally:
        # A fresh engine is built on every call, so release its connection
        # pool explicitly instead of leaving it to the garbage collector.
        engine.dispose()

def get_model_info(model_id):
    """Fetch the metadata of one model joined with its model-set metadata.

    Args:
        model_id: value interpolated into the ``WHERE model_id = ...`` filter.

    Returns:
        pandas.Series: the first row of the joined query result.
    """
    sql = f"""SELECT * from modeling.model_metadata
            LEFT JOIN  modeling.model_set_metadata 
            USING (model_set_id)
            WHERE model_id = '{model_id}'
            ;"""

    matches = run_query(sql)
    # For now only the first row is returned - all rows should contain the
    # same info except for run.
    return matches.iloc[0]

In [None]:
def get_metrics_df():
    """Load every stored metric value with its model and model-set metadata.

    Model sets are the left side of both left joins, so a model set without
    models or metrics still appears, with NULLs in the joined columns. The
    metric filter in the SQL is commented out, so all metrics are returned.

    Returns:
        pandas.DataFrame: distinct rows ordered by training start date and
        validation end date.
    """
    metrics_sql = """select distinct
	msm.model_set_id, 
	msm.model_type,
	model_hyperparams,
	temporal_params,
	md.model_id, 
	run_id,
	md.train_start_date::date,
	md.train_end_date::date, 
	md.val_start_date::date,
	md.val_end_date::date,
	md.val_size,
	md.val_count_positive,
	DATE(md.started_training_at) as started_training_at,
	metric_name, 
	metric_param, 
	value
	from modeling.model_set_metadata msm 
	left join modeling.model_metadata md
		on msm.model_set_id = md.model_set_id
	left join modeling.model_metrics mm 
		on mm.model_id = md.model_id 
	-- where metric_name = 'precision' 
	-- and metric_param in ('5', '10', '50', '90', '95', '100')
	order by started_training_at, val_end_date::date;"""

    return run_query(metrics_sql)
# Pull every model/metric row once; the cells below filter this frame.
metrics_df = get_metrics_df()
# Metrics stored without a parameter get the literal label 'None' so they can
# be selected by equality (the AUC plot filters on metric_param 'None').
metrics_df['metric_param'] = metrics_df['metric_param'].fillna('None')
# Number of rows loaded, shown as the cell output.
len(metrics_df)

In [None]:
# Plot AUC over the time splits 
def audition_plot(metrics_df, metric_name, metric_param):
    """Plot one metric over the validation splits, one line per model set.

    Args:
        metrics_df: DataFrame with the columns ``metric_name``,
            ``metric_param``, ``val_end_date``, ``value``, ``model_type``
            and ``model_set_id``.
        metric_name: metric to plot, e.g. ``'AUC'`` or ``'precision'``.
        metric_param: metric parameter to plot, e.g. ``10`` or ``'None'``.
            Rows match when the stored parameter equals this value or has
            the same string form, so ``10`` also selects rows stored as
            ``'10'``.
    """
    param_col = metrics_df['metric_param']
    # Accept either representation of the parameter: a strict equality test
    # alone silently selects nothing when an int is compared with text.
    param_match = (param_col == metric_param) | (param_col.astype(str) == str(metric_param))
    selected = metrics_df[(metrics_df['metric_name'] == metric_name) & param_match]

    fig, ax = plt.subplots(1, figsize=(15,10))
    # Colour encodes the model type, line style the individual model set.
    ax = sns.lineplot(x = 'val_end_date', y = 'value', data = selected, hue = 'model_type', style = 'model_set_id', ax=ax)
    ax.set_title(metric_name + "@" + str(metric_param))
    ax.set_xlabel('Validation End Date')
    ax.set_ylabel(metric_name)
    # Place the legend outside the axes, to the right of the plot.
    ax.legend(bbox_to_anchor=(1.3, .6), loc='right')
    ax.set_ylim(0,1)

# AUC rows carry the placeholder parameter 'None' (set by the fillna above).
audition_plot(metrics_df, metric_name='AUC', metric_param='None')

In [None]:
# Plot precision at 10 over the time splits.
# NOTE(review): metric_param is passed as an int here; confirm this matches
# how metric_param values are stored (the commented-out SQL filter in
# get_metrics_df quotes them as text, e.g. '10').
audition_plot(metrics_df, metric_name='precision', metric_param=10)

In [None]:
# Get top models by precision at 10%
def get_top_models(metric_name = 'precision', metric_param = 10, top_n = 3):
    """Rank model sets by a metric's mean over time splits and return the best.

    Reads the notebook-level ``metrics_df``, queries
    ``modeling.model_metadata`` and displays the selected rows.

    Args:
        metric_name: metric to rank by, e.g. ``'precision'`` or ``'AUC'``.
        metric_param: metric parameter to rank by, e.g. ``10`` or ``'None'``.
            Rows match when the stored parameter equals this value or has
            the same string form, so ``10`` also selects rows stored as
            ``'10'``.
        top_n: number of model sets to keep.

    Returns:
        tuple: ``(top_model_ids, top_models)``. ``top_models`` holds, for the
        ``top_n`` model sets, the models whose ``val_end_date`` is the latest
        one found among them (the mean metric is in ``value``);
        ``top_model_ids`` lists their ``model_id`` values.

    Raises:
        ValueError: if no metric rows match ``metric_name``/``metric_param``.
    """
    # Hyperparameters are grouped by their string form. The helper column is
    # written onto the shared metrics_df, as before, in case other cells use it.
    metrics_df['model_hyperparams_str'] = metrics_df['model_hyperparams'].astype(str)
    group_cols = ['model_set_id', 'model_type', 'model_hyperparams_str', 'metric_name', 'metric_param']
    # Select 'value' before aggregating: averaging every column fails on the
    # non-numeric ones (dates, hyperparameters) in recent pandas versions.
    set_metrics_df = metrics_df.groupby(group_cols)[['value']].mean()
    set_metrics_df = set_metrics_df.reset_index(drop=False)

    # Narrow down to metric of interest (e.g. precision @ 10, top 3).
    # Accept either representation of the parameter: a strict equality test
    # alone silently selects nothing when an int is compared with text.
    param_col = set_metrics_df['metric_param']
    param_match = (param_col == metric_param) | (param_col.astype(str) == str(metric_param))
    set_metric_df = set_metrics_df[(set_metrics_df['metric_name'] == metric_name) & param_match]
    if set_metric_df.empty:
        # Fail early with a readable message instead of an opaque max() error
        # after a wasted database round trip.
        raise ValueError(f"No metric rows found for {metric_name}@{metric_param}")
    set_metric_df = set_metric_df.nlargest(n = top_n, columns='value', keep='first')

    # Left join all models in each model_set (model for each time split)
    model_metadata = run_query("SELECT * from modeling.model_metadata")
    model_metric_df = pd.merge(set_metric_df, model_metadata, on='model_set_id', how='left')

    # Select the models only with most recent validation scheme; missing
    # dates are ignored when looking for the latest one.
    last_val_end_date = model_metric_df['val_end_date'].dropna().max()
    top_models = model_metric_df[model_metric_df['val_end_date'] == last_val_end_date]
    top_model_ids = list(top_models['model_id'])
    display(top_models[['model_id','model_set_id', 'model_type','model_hyperparams_str','metric_name','metric_param','value']])
    return (top_model_ids, top_models)

# Ten best model sets by mean precision at 10.
top_model_ids, top_models = get_top_models(metric_name='precision', metric_param=10, top_n=10)


In [None]:
# Ten best model sets by mean AUC; this overwrites the precision-based
# top_model_ids / top_models from the previous cell.
top_model_ids, top_models = get_top_models(metric_name='AUC', metric_param='None', top_n=10)

In [None]:
# Query to get all models corresponding to a model_set_id (in this case, the first one)
# top_models is a filtered frame that keeps the row labels of the merge it was
# cut from, so label 0 is not guaranteed to exist; take the first row by
# position instead.
model_set_id = top_models['model_set_id'].iloc[0]
all_models_over_time = run_query(f"SELECT * FROM modeling.model_metadata where model_set_id = '{model_set_id}';")
all_models_over_time

In [None]:
# Plot Base Rate over time for precision understanding 
# top_models is a filtered frame that keeps the row labels of the merge it was
# cut from, so label 0 is not guaranteed to exist; take the first row by
# position instead.
top_model_set_id = top_models['model_set_id'].iloc[0]
# val_size / val_count_positive repeat on every metric row of a model, so
# restrict to a single metric to avoid plotting each model once per metric.
one_model = metrics_df[(metrics_df['model_set_id'] == top_model_set_id)& (metrics_df['metric_name'] == 'AUC')].copy()
# Base rate = share of positive labels in each validation set.
one_model['returns_base_rate'] = one_model['val_count_positive']/one_model['val_size']
fig, ax = plt.subplots(1, figsize=(10,7))
sns.lineplot(x = one_model['val_end_date'], y = one_model['returns_base_rate'], ax=ax)
ax.set_title("Base rate of returns within one year for individuals put on SIS probation over time")
