In [1]:
import yaml
from datetime import datetime

from sqlalchemy.engine.url import URL
import pandas as pd
import matplotlib

from triage.util.db import create_engine
from triage.component.audition import Auditioner
from triage.component.audition.rules_maker import SimpleRuleMaker, RandomGroupRuleMaker, create_selection_grid

from triage.component.postmodeling.contrast.parameters import PostmodelParameters
from triage.component.postmodeling.contrast.model_evaluator import ModelEvaluator
from triage.component.postmodeling.contrast.model_group_evaluator import ModelGroupEvaluator

In [2]:
# %matplotlib inline

In [3]:
# Read the connection settings from database.yaml and build the database engine.
with open('database.yaml', "r") as config_file:
    dbconfig = yaml.safe_load(config_file)

# Map the YAML keys (user/db/pass) onto the keyword names passed to URL.
# NOTE(review): newer SQLAlchemy releases may require the 'postgresql' driver
# name and URL.create(...) instead of calling URL(...) directly — confirm
# against the SQLAlchemy version this environment pins.
url_kwargs = {
    'host': dbconfig['host'],
    'username': dbconfig['user'],
    'database': dbconfig['db'],
    'password': dbconfig['pass'],
    'port': dbconfig['port'],
}
db_url = URL('postgres', **url_kwargs)

db_engine = create_engine(db_url)


  


In [4]:
experiment_id = '55d27c5ef9a448f417892bc7d20f9b6a'

# Every model group that has at least one model row whose
# built_by_experiment matches experiment_id.
mg_id_sql = '''
select distinct model_group_id from 
model_metadata.model_groups
join model_metadata.models using(model_group_id)
where built_by_experiment = '{}'
'''.format(experiment_id)
mg_id_query = db_engine.execute(mg_id_sql)

# Each result row has a single column: the model group id.
model_group_ids = [row[0] for row in mg_id_query]

In [5]:
# Number of distinct model groups found for the experiment.
len(model_group_ids)

624

In [6]:
# Distinct train end times of the experiment's models, in ascending order.
train_end_time_sql = '''
select distinct train_end_time::timestamp
from model_metadata.models
where built_by_experiment ='{experiment}'
order by train_end_time
'''.format(experiment=experiment_id)
train_end_time_rows = db_engine.execute(train_end_time_sql)

# Keep only the train end times strictly later than 2012-01-01 00:00.
earliest_exclusive = datetime(2012, 1, 1, 0, 0)
train_end_times = [
    row[0]
    for row in train_end_time_rows
    if row[0] > earliest_exclusive
]

In [7]:
# Show the train end times that remain after the date filter.
train_end_times

[datetime.datetime(2012, 2, 1, 0, 0),
 datetime.datetime(2012, 3, 1, 0, 0),
 datetime.datetime(2012, 4, 1, 0, 0),
 datetime.datetime(2012, 5, 1, 0, 0),
 datetime.datetime(2012, 6, 1, 0, 0),
 datetime.datetime(2012, 7, 1, 0, 0),
 datetime.datetime(2012, 8, 1, 0, 0),
 datetime.datetime(2012, 9, 1, 0, 0),
 datetime.datetime(2012, 10, 1, 0, 0),
 datetime.datetime(2012, 11, 1, 0, 0),
 datetime.datetime(2012, 12, 1, 0, 0),
 datetime.datetime(2013, 1, 1, 0, 0)]

In [8]:
# Initial filter applied on precision@ with parameter 50_abs.
# Alternative values tried earlier: max_from_best=1 and threshold_value=0
# (presumably these disable the filtering — TODO confirm).
precision_filter = {
    'metric': 'precision@',
    'parameter': '50_abs',
    'max_from_best': 0.2,
    'threshold_value': 0.5,
}

# Build the Auditioner over the experiment's model groups and train end times.
aud = Auditioner(
    db_engine,
    model_group_ids,
    train_end_times,
    initial_metric_filters=[precision_filter],
    models_table='models',
    directory='triage_output',
)

In [9]:
# Plot the Auditioner's model groups
# (presumably those remaining after the initial metric filters — confirm).
aud.plot_model_groups()

In [10]:
# Model group ids the Auditioner exposes as "thresholded"; presumably the ones
# that passed the initial metric filters — TODO confirm.
# NOTE(review): `ids` is not referenced again in the visible cells.
ids = aud.thresholded_model_group_ids

In [11]:
# Every metric-based rule below uses the same metric and parameter.
precision_at_50 = {'metric': 'precision@', 'parameter': '50_abs'}

# Metric-based selection rules, each requesting n=3 model groups.
Rule1 = SimpleRuleMaker()
Rule1.add_rule_best_current_value(**precision_at_50, n=3)
Rule1.add_rule_best_average_value(**precision_at_50, n=3)
Rule1.add_rule_best_avg_var_penalized(
    **precision_at_50,
    stdev_penalty=[0.3, 0.5, 0.8],
    n=3,
)
Rule1.add_rule_most_frequent_best_dist(
    **precision_at_50,
    dist_from_best_case=[0.1, .05, .03, .01],
    n=3,
)

# Random-group rule with n=5, combined with the metric-based rules below.
Rule2 = RandomGroupRuleMaker(n=5)

rules = create_selection_grid(Rule1, Rule2)

In [12]:
# Register the selection rule grid with the Auditioner; plot=True also
# requests plots for the registered rules.
aud.register_selection_rule_grid(rules, plot=True)

In [13]:
# Model group ids chosen by each selection rule, keyed by rule name.
selected_groups = aud.selection_rule_model_group_ids
selected_groups

{'best_current_value_precision@_50_abs': [6082, 6117, 5955],
 'best_average_value_precision@_50_abs': [5950, 6165, 5959],
 'best_avg_var_penalized_precision@_50_abs_0.3': [5950, 6082, 5959],
 'best_avg_var_penalized_precision@_50_abs_0.5': [5950, 6082, 6054],
 'best_avg_var_penalized_precision@_50_abs_0.8': [6082, 5950, 6108],
 'most_frequent_best_dist_precision@_50_abs_0.1': [5950, 5967, 6041],
 'most_frequent_best_dist_precision@_50_abs_0.05': [5955, 5959, 6165],
 'most_frequent_best_dist_precision@_50_abs_0.03': [6176, 6259, 5941],
 'most_frequent_best_dist_precision@_50_abs_0.01': [6061, 6063, 6082],
 'random_model_group': [5937, 6259, 5979, 5963, 6126]}

Calculating the most commonly selected model groups

In [14]:
# Average regret of each selection rule, keyed by metric and then by rule name.
aud.average_regret_for_rules

{'precision@50_abs': {'best_average_value_precision@_50_abs': 0.10872727272727273,
  'best_avg_var_penalized_precision@_50_abs_0.3': 0.116,
  'best_avg_var_penalized_precision@_50_abs_0.5': 0.116,
  'best_avg_var_penalized_precision@_50_abs_0.8': 0.116,
  'best_current_value_precision@_50_abs': 0.0905454545454545,
  'most_frequent_best_dist_precision@_50_abs_0.01': 0.10145454545454546,
  'most_frequent_best_dist_precision@_50_abs_0.03': 0.0996363636363636,
  'most_frequent_best_dist_precision@_50_abs_0.05': 0.10145454545454545,
  'most_frequent_best_dist_precision@_50_abs_0.1': 0.09418181818181816,
  'random_model_group': 0.10872727272727274}}

In [15]:
# Tally how many selection rules picked each model group id.
mgs = {}
for rule_group_ids in selected_groups.values():
    for group_id in rule_group_ids:
        # Start from 0 the first time an id is seen, then add one.
        mgs[group_id] = mgs.get(group_id, 0) + 1

mgs

{6082: 5,
 6117: 1,
 5955: 2,
 5950: 5,
 6165: 2,
 5959: 3,
 6054: 1,
 6108: 1,
 5967: 1,
 6041: 1,
 6176: 1,
 6259: 2,
 5941: 1,
 6061: 1,
 6063: 1,
 5937: 1,
 5979: 1,
 5963: 1,
 6126: 1}

In [16]:
# Look up model type and hyperparameters for the groups picked by the
# best-current-value rule.
best_current_ids = selected_groups['best_current_value_precision@_50_abs']

# The Python list repr (e.g. [6082, 6117, 5955]) is interpolated directly
# after `array`, producing an array literal in the query text.
model_groups_sql = '''
select distinct 
    model_group_id, 
    model_type, 
    hyperparameters 
from model_metadata.model_groups
where model_group_id = any(array{})
'''.format(best_current_ids)
model_groups = db_engine.execute(model_groups_sql)

list(model_groups)

[(5955, 'sklearn.ensemble.RandomForestClassifier', {'max_depth': 5, 'max_features': 12, 'n_estimators': 1000, 'min_samples_split': 25}),
 (6082, 'sklearn.ensemble.RandomForestClassifier', {'max_depth': 10, 'max_features': 12, 'n_estimators': 1000, 'min_samples_split': 50}),
 (6117, 'sklearn.ensemble.RandomForestClassifier', {'max_depth': 10, 'max_features': 12, 'n_estimators': 10000, 'min_samples_split': 50})]

In [17]:
# Ask the Auditioner to save its resulting model group ids under the given
# file name (presumably as JSON, per the .json extension — confirm).
aud.save_result_model_group_ids(fname="results_model_group_ids.json")