In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from matplotlib import pyplot as plt
# Apply the 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

import pandas as pd
# Display floats with 4 digits of precision. The fully-qualified option name is
# used on purpose: the bare 'precision' pattern matches more than one option on
# newer pandas releases (e.g. 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 4)

from audition.distance_from_best import DistanceFromBestTable, BestDistancePlotter
from audition.thresholding import ModelGroupThresholder
from audition.regrets import RegretCalculator
import catwalk.db

### Creating the best distance table
Currently you need to specify the set of `model_group_id`s and `train_end_time`s you want to use manually, so here we're reading a few sets out of the database.

Additionally, you need to specify a name for the best distance table when creating the `DistanceFromBestTable` and should ensure it doesn't exist if you're going to run the `create_and_populate()` method.

For simplicity, we'll just look at precision@300_abs here.

In [None]:
# Open the database connection used by the queries and the distance table below.
conn = catwalk.db.connect()

In [None]:
# Select every model group whose model_config has
# label_definition = 'any_serious_violation'.
# Rows are ordered by id rather than RANDOM(): with no LIMIT the random
# ordering sampled nothing and only made the list order differ between runs,
# which hurts Restart & Run All reproducibility.
sel = """
SELECT model_group_id
FROM results.model_groups
WHERE model_config->>'label_definition' = 'any_serious_violation'
ORDER BY model_group_id
;
"""

model_groups = list(pd.read_sql(sel, conn)['model_group_id'])

In [None]:
# Comma-separated model group ids, spliced into the IN (...) clause below.
group_id_csv = ', '.join(str(group_id) for group_id in model_groups)

# Distinct train_end_times of the selected groups' models, restricted to
# months 1, 4, 7 and 10 and to dates on or after 2012-01-01, oldest first.
sel = """
SELECT DISTINCT train_end_time
FROM results.models
WHERE model_group_id IN ({})
    AND EXTRACT(MONTH FROM train_end_time) IN (1,4,7,10)
    AND train_end_time >= '2012-01-01'
ORDER BY train_end_time
;
""".format(group_id_csv)

end_time_frame = pd.read_sql(sel, conn)
end_times = list(end_time_frame['train_end_time'])

In [None]:
# The single metric this walkthrough looks at: precision@300_abs.
precision_at_300 = [{'metric': 'precision@', 'parameter': '300_abs'}]

# Create and populate the distance-from-best table for the selected model
# groups and end times, then wrap it in a plotter.
# NOTE(review): 'kr_test_dist' is presumably the distance table name, which
# must not already exist when create_and_populate() runs -- confirm.
dbt = DistanceFromBestTable(conn, 'models', 'kr_test_dist')
dbt.create_and_populate(model_groups, end_times, precision_at_300)
plotter = BestDistancePlotter(dbt)

### Plotting the best distance metric
This is done with the `BestDistancePlotter` class and may take a minute to generate.

In [None]:
# Plot distance-from-best on precision@300_abs for every selected model group.
plotter.plot_all_best_dist(
    [{'metric': 'precision@', 'parameter': '300_abs'}],
    model_groups,
    end_times,
)

### Applying thresholds to weed out bad models
Here we use the `ModelGroupThresholder` to apply a set of filters to the model groups we're considering in order to eliminate poorly performing ones.

In [None]:
# First-round filter on precision@300_abs: max_from_best of 0.2 and a
# threshold_value of 0.0.
# NOTE(review): presumably max_from_best bounds the distance from the best
# model group and threshold_value is a floor on the metric -- confirm against
# ModelGroupThresholder.
initial_filter = {
    'metric': 'precision@',
    'parameter': '300_abs',
    'max_from_best': 0.2,
    'threshold_value': 0.0,
}

mgt = ModelGroupThresholder(
    dbt,
    end_times,
    model_groups,
    initial_metric_filters=[initial_filter],
)

This applies a first round of filtering, starting with a `threshold_value` of 0 (effectively no threshold) and a fairly wide margin on `max_from_best`.

In [None]:
# How many model groups are left after the first round of filtering?
# The bare expression is the cell's last line, so the notebook displays the count.
len(mgt.model_group_ids)

In [None]:
# Re-plot distance-from-best using only the model groups that survived filtering.
# The filter dict uses the 'parameter' key, matching every other metric filter
# in this notebook; the previous 'metric_param' key was inconsistent with them.
plotter.plot_all_best_dist(
    [{'metric': 'precision@', 'parameter': '300_abs'}],
    mgt.model_group_ids,
    end_times
)

That didn't thin things out too much, so let's get a bit more aggressive with both parameters:

In [None]:
# Second round: tighten max_from_best to 0.1 and raise threshold_value to 0.5.
tighter_filter = {
    'metric': 'precision@',
    'parameter': '300_abs',
    'max_from_best': 0.1,
    'threshold_value': 0.5,
}
mgt.update_filters([tighter_filter])

# Display how many model groups remain.
len(mgt.model_group_ids)

In [None]:
# Re-plot distance-from-best for the groups left after the second round.
surviving_groups = mgt.model_group_ids
plotter.plot_all_best_dist(
    [{'metric': 'precision@', 'parameter': '300_abs'}],
    surviving_groups,
    end_times,
)

That's starting to look better, but we can probably narrow even a bit more...

In [None]:
# Third round: max_from_best of 0.05 and threshold_value of 0.65.
strict_filter = {
    'metric': 'precision@',
    'parameter': '300_abs',
    'max_from_best': 0.05,
    'threshold_value': 0.65,
}
mgt.update_filters([strict_filter])

# Display how many model groups remain.
len(mgt.model_group_ids)

In [None]:
# Re-plot distance-from-best for the groups left after the third round.
final_groups = mgt.model_group_ids
plotter.plot_all_best_dist(
    [{'metric': 'precision@', 'parameter': '300_abs'}],
    final_groups,
    end_times,
)

This looks like a better set of prospective models to consider. We could potentially even back off a little bit, but it certainly seems like we've cleared out most of the worst models.

### Calculating regrets for the narrowed set of models

In [None]:
from audition.selection_rules import best_average_value

In [None]:
# Build the regret calculator from the distance table created earlier.
rc = RegretCalculator(dbt)

In [None]:
# Every train_end_time except the most recent one.
# NOTE(review): presumably the last period is dropped because regret needs a
# following period to evaluate against -- confirm against RegretCalculator.
regret_end_times = sorted(end_times)[:-1]

# Regrets of the best_average_value selection rule on precision@300_abs,
# over the model groups that survived thresholding.
rc.regrets_for_rule(
    best_average_value,
    mgt.model_group_ids,
    regret_end_times,
    'precision@',
    '300_abs',
    {'metric': 'precision@', 'parameter': '300_abs'},
)