In [1]:
from collections import Counter
from datetime import datetime

import matplotlib
import pandas as pd
import yaml
from sqlalchemy.engine.url import URL

from triage.util.db import create_engine
from triage.component.audition import Auditioner
from triage.component.audition.rules_maker import SimpleRuleMaker, RandomGroupRuleMaker, TwoMetricsRuleMaker, create_selection_grid

from triage.component.postmodeling.contrast.parameters import PostmodelParameters
from triage.component.postmodeling.contrast.model_evaluator import ModelEvaluator
from triage.component.postmodeling.contrast.model_group_evaluator import ModelGroupEvaluator

In [2]:
# %matplotlib inline

In [3]:
# Build the database engine from the connection settings in database.yaml.
# The context manager closes the file handle once the config is parsed, and
# safe_load only builds plain Python objects (yaml.load without an explicit
# Loader is deprecated and is rejected by PyYAML >= 6).
with open('database.yaml') as dbconfig_file:
    dbconfig = yaml.safe_load(dbconfig_file)

db_url = URL(
            # 'postgresql' is the canonical dialect name; the legacy
            # 'postgres' alias was removed in SQLAlchemy 1.4.
            'postgresql',
            host=dbconfig['host'],
            username=dbconfig['user'],
            database=dbconfig['db'],
            password=dbconfig['pass'],
            port=dbconfig['port'],
        )

db_engine = create_engine(db_url)


  


In [4]:
# Hash of the experiment whose model groups are examined in this notebook.
experiment_id = '119d1575035b5ab7052c790f34f498e8'

# Distinct model groups having at least one model built by that experiment.
# The experiment hash is interpolated directly into the SQL text.
mg_id_query = db_engine.execute(
    '''
select distinct model_group_id from 
model_metadata.model_groups
join model_metadata.models using(model_group_id)
where built_by_experiment = '{experiment}'
'''.format(experiment=experiment_id)
)

# Every result row carries a single column; flatten the rows into a list.
model_group_ids = [row[0] for row in mg_id_query]

In [5]:
# Sanity check: how many model groups the query returned for this experiment.
len(model_group_ids)

528

In [6]:
# Only train end times strictly later than this cutoff are kept.
earliest_train_end = datetime(2012, 1, 1, 0, 0)

# Distinct train end times of the experiment's models, in ascending order.
train_end_time_rows = db_engine.execute(
    '''
select distinct train_end_time::timestamp
from model_metadata.models
where built_by_experiment ='{experiment}'
order by train_end_time
'''.format(experiment=experiment_id)
)

train_end_times = [
    row[0] for row in train_end_time_rows if row[0] > earliest_train_end
]

In [7]:
# Show the train end times that survived the date filter.
train_end_times

[datetime.datetime(2012, 2, 1, 0, 0),
 datetime.datetime(2012, 3, 1, 0, 0),
 datetime.datetime(2012, 4, 1, 0, 0),
 datetime.datetime(2012, 5, 1, 0, 0),
 datetime.datetime(2012, 6, 1, 0, 0),
 datetime.datetime(2012, 7, 1, 0, 0),
 datetime.datetime(2012, 8, 1, 0, 0),
 datetime.datetime(2012, 9, 1, 0, 0),
 datetime.datetime(2012, 10, 1, 0, 0),
 datetime.datetime(2012, 11, 1, 0, 0),
 datetime.datetime(2012, 12, 1, 0, 0),
 datetime.datetime(2013, 1, 1, 0, 0)]

In [18]:
# Initial filter handed to the Auditioner, expressed on precision@50_abs.
# A looser setting tried earlier: max_from_best=1, threshold_value=0.
# NOTE(review): the exact meaning of max_from_best / threshold_value is
# defined by triage's Auditioner — confirm against its documentation.
precision_at_50_filter = {
    'metric': 'precision@',
    'parameter': '50_abs',
    'max_from_best': 0.2,
    'threshold_value': 0.5,
}

aud = Auditioner(
    db_engine,
    model_group_ids,
    train_end_times,
    initial_metric_filters=[precision_at_50_filter],
    models_table='models',
    directory='triage_output',
)

In [19]:
# Plot the model groups held by the Auditioner (presumably those passing the
# initial metric filter — confirm against the triage Audition docs).
aud.plot_model_groups()

In [20]:
# Keep a handle on the model group ids the Auditioner reports as thresholded.
ids = aud.thresholded_model_group_ids

In [21]:
# All selection rules below use the same metric/parameter pair.
precision_at_50 = {'metric': 'precision@', 'parameter': '50_abs'}

# Each rule asks for n=3 model groups; rules given a list of settings
# (stdev_penalty, dist_from_best_case) are registered once with that list.
Rule1 = SimpleRuleMaker()
Rule1.add_rule_best_current_value(n=3, **precision_at_50)
Rule1.add_rule_best_average_value(n=3, **precision_at_50)
Rule1.add_rule_best_avg_var_penalized(
    stdev_penalty=[0.3, 0.5, 0.8],
    n=3,
    **precision_at_50
)
Rule1.add_rule_most_frequent_best_dist(
    dist_from_best_case=[0.1, .05, .03, .01],
    n=3,
    **precision_at_50
)

# Turn the configured rule maker into the selection grid used below.
rules = create_selection_grid(Rule1)

In [12]:
# Hand the selection rule grid to the Auditioner, with plotting enabled.
aud.register_selection_rule_grid(rules, plot=True)

In [13]:
# Mapping of selection rule name -> list of model group ids chosen by that
# rule (see the displayed output below).
selected_groups = aud.selection_rule_model_group_ids
selected_groups

{'best_current_value_precision@_50_abs': [4611, 4656, 4653],
 'best_average_value_precision@_50_abs': [4628, 4493, 4652],
 'best_avg_var_penalized_precision@_50_abs_0.3': [4628, 4493, 4627],
 'best_avg_var_penalized_precision@_50_abs_0.5': [4628, 4493, 4627],
 'best_avg_var_penalized_precision@_50_abs_0.8': [4628, 4493, 4627],
 'most_frequent_best_dist_precision@_50_abs_0.1': [4459, 4466, 4475],
 'most_frequent_best_dist_precision@_50_abs_0.05': [4558, 4468, 4493],
 'most_frequent_best_dist_precision@_50_abs_0.03': [4561, 4575, 4585],
 'most_frequent_best_dist_precision@_50_abs_0.01': [4681, 4789, 4793]}

Calculating the most commonly selected model groups

In [14]:
# Average regret of each selection rule, keyed first by metric and then by
# rule name (see the displayed output below).
aud.average_regret_for_rules

{'precision@50_abs': {'best_average_value_precision@_50_abs': 0.11636363636363636,
  'best_avg_var_penalized_precision@_50_abs_0.3': 0.12545454545454546,
  'best_avg_var_penalized_precision@_50_abs_0.5': 0.13272727272727275,
  'best_avg_var_penalized_precision@_50_abs_0.8': 0.13272727272727275,
  'best_current_value_precision@_50_abs': 0.11272727272727275,
  'most_frequent_best_dist_precision@_50_abs_0.01': 0.1581818181818182,
  'most_frequent_best_dist_precision@_50_abs_0.03': 0.12181818181818183,
  'most_frequent_best_dist_precision@_50_abs_0.05': 0.1,
  'most_frequent_best_dist_precision@_50_abs_0.1': 0.14909090909090908}}

In [15]:
# Tally how many selection rules picked each model group id.
# Counter replaces the hand-rolled "if key in dict" bookkeeping; converting
# back to a plain dict keeps the displayed value and its type exactly as
# before, with ids listed in first-seen order.
mgs = dict(
    Counter(
        group_id
        for rule_group_ids in selected_groups.values()
        for group_id in rule_group_ids
    )
)

mgs

{4611: 1,
 4656: 1,
 4653: 1,
 4628: 4,
 4493: 5,
 4652: 1,
 4627: 3,
 4459: 1,
 4466: 1,
 4475: 1,
 4558: 1,
 4468: 1,
 4561: 1,
 4575: 1,
 4585: 1,
 4681: 1,
 4789: 1,
 4793: 1}

In [16]:
# Look up model type and hyperparameters for the groups picked by the
# "best current value" rule. The Python list renders as "[id, id, ...]",
# which completes the SQL array literal after the word "array".
best_current_value_ids = selected_groups['best_current_value_precision@_50_abs']

model_groups = db_engine.execute(
    '''
select distinct 
    model_group_id, 
    model_type, 
    hyperparameters 
from model_metadata.model_groups
where model_group_id = any(array{})
'''.format(best_current_value_ids)
)

# Materialise the result rows so the cell displays them.
list(model_groups)

[(4611, 'sklearn.ensemble.RandomForestClassifier', {'max_depth': 10, 'max_features': 12, 'n_estimators': 1000, 'min_samples_split': 50}),
 (4653, 'sklearn.ensemble.RandomForestClassifier', {'max_depth': 10, 'max_features': 15, 'n_estimators': 10000, 'min_samples_split': 50}),
 (4656, 'sklearn.ensemble.RandomForestClassifier', {'max_depth': 10, 'max_features': 'log2', 'n_estimators': 1000, 'min_samples_split': 25})]

In [17]:
# Ask the Auditioner to save the resulting model group ids under the given
# file name (presumably inside its configured output directory — confirm).
aud.save_result_model_group_ids(fname="results_model_group_ids.json")