---
# CONFIG_VERSION
# The experiment configuration changes from time to time, and we upgrade the
# triage.experiments.CONFIG_VERSION variable whenever drastic changes that break
# old configuration files are released. Be sure to assign the config version
# that matches the triage.experiments.CONFIG_VERSION in the triage release
# you are developing against!
config_version: 'v7'
# EXPERIMENT METADATA
# model_comment (optional) will end up in the model_comment column of the
# models table for each model created in this experiment
model_comment: 'test'
# random_seed will be set in Python at the beginning of the experiment and
# affect the generation of all model seeds
random_seed: 23895478
# TIME SPLITTING
# The time window to look at, and how to divide the window into
# train/test splits
temporal_config:
    feature_start_time: '1995-01-01' # earliest date included in features
    feature_end_time: '2015-01-01' # latest date included in features
    label_start_time: '2012-01-01' # earliest date for which labels are available
    label_end_time: '2015-01-01' # day AFTER last label date (all dates in any model are < this date)
    model_update_frequency: '6month' # how frequently to retrain models
    training_as_of_date_frequencies: '1day' # time between as-of dates for the same entity in the train matrix
    test_as_of_date_frequencies: '3month' # time between as-of dates for the same entity in the test matrix
    max_training_histories: ['6month', '3month'] # length of time included in a train matrix
    test_durations: ['0day', '1month', '2month'] # length of time included in a test matrix (0 days will give a single prediction immediately after training end)
    training_label_timespans: ['1month'] # time period across which outcomes are labeled in train matrices
    test_label_timespans: ['7day'] # time period across which outcomes are labeled in test matrices
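# As a rough illustration of how this config plays out (a sketch only; the
# exact split dates are computed by Triage's timechop component): with labels
# available from 2012-01-01 to 2015-01-01 and model_update_frequency
# '6month', you get a train/test split roughly every six months, e.g.
# train_end_times around 2012-07-01, 2013-01-01, ..., 2014-07-01. Each split
# then yields one train matrix per max_training_histories value and one test
# matrix per test_durations value.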
# COHORT CONFIG
# Cohorts are configured by passing a query with placeholders for the 'as_of_date'.
#
# The 'query' key should have a query, parameterized with an '{as_of_date}', to
# select the entity_ids that should be included for a given date. The
# {as_of_date} will be replaced with each as_of_date that the experiment needs.
# The returned 'entity_id' must be an integer.
#
# You may enter a 'name' for your configuration. This will be included in the
# metadata for each matrix and used to group models. If you don't pass one,
# the string 'default' will be used.
#
cohort_config:
    query: "select entity_id from events where outcome_date < '{as_of_date}'"
    name: 'past_events'
# LABEL GENERATION
# Labels are configured by passing a query with placeholders for the 'as_of_date' and 'label_timespan'.
#
# The query must return two columns, entity_id and outcome, based on a given
# as_of_date and label_timespan. The as_of_date and label_timespan must be
# represented by placeholders marked by curly brackets. The example query
# below reproduces the inspection outcome boolean-or logic.
#
# In addition, you can configure what label is given to entities that are in the matrix
# (see 'cohort_config' section) but that do not show up in this label query.
# By default, these will show up as missing/null.
# However, passing the key 'include_missing_labels_in_train_as' allows you to pick True or False.
#
# In addition to these configuration options, you can pass a name to apply to the label configuration
# that will be present in matrix metadata for each matrix created by this experiment,
# under the 'label_name' key. The default label_name is 'outcome'.
label_config:
    query: |
        select
        events.entity_id,
        bool_or(outcome::bool)::integer as outcome
        from events
        where '{as_of_date}' <= outcome_date
            and outcome_date < '{as_of_date}'::timestamp + interval '{label_timespan}'
        group by entity_id
    #include_missing_labels_in_train_as: False
    #name: 'inspections'
# FEATURE GENERATION
# The aggregate features to generate for each train/test split
#
# Implemented by wrapping collate: https://github.com/dssg/collate
# Most terminology here is taken directly from collate
#
# Each entry describes a collate.SpacetimeAggregation object, and the
# arguments needed to create it. Generally, each of these entries controls
# the features from one source table, though a configuration with multiple
# groups may result in multiple output tables
#
# Rules specifying how to handle imputation of null values must be explicitly
# defined in your config file. These can be specified in two places: either
# within each feature or overall for each type of feature (aggregates_imputation,
# categoricals_imputation, array_categoricals_imputation). In either case, a rule must be given for
# each aggregation function (e.g., sum, max, avg, etc) used, or a catch-all
# can be specified with `all`. Aggregation function-specific rules will take
# precedence over the `all` rule and feature-specific rules will take
# precedence over the higher-level rules. Several examples are provided below.
#
# Available Imputation Rules:
# * mean: The average value of the feature (for SpacetimeAggregation the
# mean is taken within-date).
# * constant: Fill with a constant value from a required `value` parameter.
# * zero: Fill with zero.
# * null_category: Only available for categorical features. Just flag null
# values with the null category column.
# * binary_mode: Only available for aggregate column types. Takes the modal
# value for a binary feature.
# * error: Raise an exception if any null values are encountered for this
# feature.
feature_aggregations:
    -
        # prefix given to the resultant tables
        prefix: 'prefix'
        # from_obj is usually a source table but can be an expression, such as
        # a join (ie 'cool_stuff join other_stuff using (stuff_id)')
        from_obj: 'cool_stuff'
        # The date column to use for specifying which records to include
        # in temporal features. It is important that the column used specifies
        # the date at which the event is known about, which may be different
        # from the date the event happened.
        knowledge_date_column: 'open_date'
        # top-level imputation rules that will apply to all aggregate functions;
        # can also specify categoricals_imputation or array_categoricals_imputation
        #
        # You must specify at least one of the top-level or feature-level
        # imputation rules to cover every feature being defined.
        aggregates_imputation:
            # The `all` rule will apply to all aggregation functions, unless
            # overridden by a more specific one
            all:
                # every imputation rule must have a `type` parameter, while some
                # (like 'constant') have other required parameters (`value` here)
                type: 'constant'
                value: 0
            # specifying `max` here will take precedence over the `all` rule for
            # aggregations using a MAX() function
            max:
                type: 'mean'
        # aggregates and categoricals define the actual features created. So
        # at least one is required
        #
        # Aggregates of numerical columns. Each quantity is a number of some
        # sort, and the list of metrics are applied to each quantity
        aggregates:
            -
                quantity: 'homeless::INT'
                # Imputation rules specified at the level of specific features
                # will take precedence over the higher-level rules specified
                # above. Note that the 'count' and 'sum' metrics will be
                # imputed differently here.
                imputation:
                    count:
                        type: 'mean'
                    sum:
                        type: 'constant'
                        value: 137
                metrics:
                    - 'count'
                    - 'sum'
                coltype: 'smallint' # Optional, if you want to control the column type in the generated features tables
            -
                # since we're specifying `aggregates_imputation` above,
                # a feature-specific imputation rule can be omitted
                quantity: 'some_flag'
                metrics:
                    - 'max'
                    - 'sum'
        # Categorical features. The column given can be of any type, but the
        # choices must be comparable to that type for equality within SQL.
        # The result will be one feature for each choice/metric combination
        categoricals:
            -
                column: 'color'
                # note that we haven't specified a top-level `categoricals_imputation`
                # set of rules, so we have to include feature-specific imputation
                # rules for both of our categoricals here.
                imputation:
                    sum:
                        type: 'null_category'
                    max:
                        type: 'mean'
                choices:
                    - 'red'
                    - 'blue'
                    - 'green'
                metrics:
                    - 'sum'
            -
                column: 'shape'
                # as with the top-level imputation rules, `all` can be used
                # for the feature-level rules to specify the same type of
                # imputation for all aggregation functions
                imputation:
                    all:
                        type: 'zero'
                choice_query: 'select distinct shape from cool_stuff'
                metrics:
                    - 'sum'
        # The time intervals over which to aggregate features
        intervals:
            - '1 year'
            - '2 years'
            - 'all'
        # A list of different columns to separately group by
        groups:
            - 'entity_id'
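# A sketch of the expected output (exact naming is handled by collate, so
# verify against your generated tables): the block above should produce a
# features table named something like 'prefix_aggregation_imputed', with one
# column per group/interval/quantity/metric combination, along the lines of
# 'prefix_entity_id_1year_homeless_count'.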
# FEATURE GROUPING
# define how to group features and generate combinations
# feature_group_definition allows you to create groups/subset of your features
# by different criteria.
# for instance,
# - 'tables' allows you to send a list of collate feature tables (collate builds these by appending 'aggregation_imputed' to the prefix)
# - 'prefix' allows you to specify a list of feature name prefixes
feature_group_definition:
    tables: ['prefix_aggregation_imputed']
# strategies for generating combinations of groups
# available: all, leave-one-out, leave-one-in, all-combinations
feature_group_strategies: ['all']
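# As a sketch of what the strategies would do (with illustrative group names,
# not ones defined above): if the feature group definition expanded to groups
# A, B, and C, then 'all' trains on [A, B, C]; 'leave-one-out' trains on
# [A, B], [A, C], and [B, C]; 'leave-one-in' trains on [A], [B], and [C];
# and 'all-combinations' trains on every non-empty subset of {A, B, C}.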
# USER METADATA
# These are arbitrary keys/values that you can have Triage apply to the
# metadata for every matrix in the experiment. Any keys you include here can
# be used in the 'model_group_keys' below. For example, if you run experiments
# that share a temporal configuration but that use different label definitions
# (say, labeling building inspections with *any* violation as positive or
# labeling only building inspections with severe health and safety violations
# as positive), you can use the user metadata keys to indicate that the matrices
# from these experiments have different labeling criteria. The matrices from the
# two experiments will have different filenames (and not be overwritten or
# inappropriately reused), and if you add the label_definition key to the model
# group keys, models made on different label definitions will have different
# groups. In this way, user metadata can be used to expand Triage beyond its
# explicitly supported functionality.
user_metadata:
    label_definition: 'severe_violations'
# MODEL GROUPING (optional)
# Model groups are a way of partitioning trained models in a way that makes for easier analysis.
#
# model_group_keys defines a list of training matrix metadata and classifier keys that
# should be considered when creating a model group.
#
# There is an extensive default configuration, which is aimed at producing groups whose
# constituent models are equivalent to each other in all ways except for when they were trained.
# This makes the analysis of model stability easier.
#
# To accomplish this, the following default keys are used:
# 'class_path', 'parameters'
# 'feature_names', 'feature_groups', 'cohort_name', 'state'
# 'label_name', 'label_timespan', 'as_of_date_frequency', 'max_training_history'
#
# If you want to override this list, you can supply a 'model_group_keys' value.
# All of the defaults are available, along with some other temporal information
# that could be useful for more specialized analyses:
#
# 'first_as_of_time', 'last_as_of_time', 'matrix_info_end_time', 'as_of_times', 'feature_start_time'
#
# You can also use any pieces of user_metadata that you included in this experiment definition,
# as they will be present in the matrix metadata.
#
# model_group_keys: ['feature_groups', 'label_definition']
# GRID CONFIGURATION
# The classifier/hyperparameter combinations that should be trained
#
# Each top-level key should be a class name, importable from triage. sklearn is
# available, and if you have another classifier package you would like available,
# contribute it to requirement/main.txt
#
# Each lower-level key is a hyperparameter name for the given classifier, and
# each value is a list of potential values. All possible combinations of
# classifiers and hyperparameters are trained.
grid_config:
    'sklearn.ensemble.ExtraTreesClassifier':
        n_estimators: [100]
        criterion: [gini, entropy]
        max_depth: [1,5,10,20,50]
        max_features: [sqrt,log2]
        min_samples_split: [2,5,10]
    # catwalk's custom model wrapper ScaledLogisticRegression will
    # automatically scale the train data prior to fitting the model
    # and then use the same scaling for the test data
    'triage.component.catwalk.estimators.classifiers.ScaledLogisticRegression':
        penalty: ['l1', 'l2']
        C: [0.01, 1]
    # catwalk's PercentileRankOneFeature baseline will score each entity
    # as its percentile on the named feature. This is useful for comparing
    # predictive models to simply ranking entities on a single feature.
    'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
        feature: ['feature_one', 'feature_two']
        descend: True
    # catwalk's SimpleThresholder baseline will evaluate each entity against
    # a list of rules and classify entities as 1 based on whether they meet
    # any or all of these rules, depending on whether 'or' or 'and' is
    # passed as the logical_operator. This is useful for comparing
    # predictive modeling against simple rule-based classification.
    'triage.component.catwalk.baselines.thresholders.SimpleThresholder':
        rules: [['feature_one > 3', 'feature_two <= 5']]
        logical_operator: 'and'
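# A rough count of what this grid trains per train matrix (a sketch, before
# multiplying by feature group strategies and training histories):
# ExtraTreesClassifier expands to 1 x 2 x 5 x 2 x 3 = 60 combinations,
# ScaledLogisticRegression to 2 x 2 = 4, PercentileRankOneFeature to 2, and
# SimpleThresholder to 1, so on the order of 67 models per split.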
# PREDICTION
# How predictions are computed for train and test matrices
#
# Rank tiebreaking - In the predictions.rank_abs and rank_pct columns, ties in the score
# are broken either at random or based on the 'worst' or 'best' options. 'worst' is the default.
#
# 'worst' will break ties with the ascending label value, so if you take the
# top 'k' predictions and there are ties across the 'k' threshold, the
# predictions above the threshold will be negative labels if possible.
# 'best' will break ties with the descending label value, so if you take the
# top 'k' predictions and there are ties across the 'k' threshold, the
# predictions above the threshold will be positive labels if possible.
# 'random' will choose one random ordering to break ties. The result will be
# affected by the current state of Postgres' random number generator. Before
# ranking, the generator is seeded based on the *model*'s random seed.
#prediction:
#    rank_tiebreaker: "worst"
# MODEL SCORING
# How each trained model is scored
#
# Each entry in 'testing_metric_groups' needs a list of one or more of the
# metrics defined in catwalk.evaluation.ModelEvaluator.available_metrics
# (contributions welcome!)
# Depending on the metric, either thresholds or parameters may be specified.
#
# Parameters specify any hyperparameters needed. For most metrics,
# which are simply wrappers of sklearn functions, these
# are passed directly to sklearn.
#
# Thresholds are more specific: The list is dichotomized and only the
# top percentile or top n entities are scored as positive labels
#
# subsets, if passed, will add evaluations for subset(s) of the predictions to
# the subset_evaluations tables, using the same testing and training metric
# groups as used for overall evaluations but with any thresholds reapplied only
# to entities in the subset on the relevant as_of_dates. For example, when
# calculating precision@5_pct for the subset of women, the ModelEvaluator will
# count as positively labeled the top 5% of women, rather than any women in the
# top 5% overall. This is useful if, for example, different interventions will
# be applied to different subsets of entities (e.g., one program will provide
# subsidies to the top 500 women with children and another program will provide
# shelter to the top 150 women without children) and you would like to see
# whether a single model can be used for both applications. Subsets can also be
# used to see how a model's performance would be affected if the requirements
# for intervention eligibility became more restricted.
#
# Subsets should be a list of dictionaries with the following keys:
# - "name": a shorthand name for the subset
# - "query": a query that returns distinct entity_ids belonging to the
# subset on a given as_of_date with a placeholder for the
# as_of_date being queried
scoring:
    testing_metric_groups:
        -
            metrics: [precision@, recall@]
            thresholds:
                percentiles: [5.0, 10.0]
                top_n: [5, 10]
        -
            metrics: [f1]
        -
            metrics: [fbeta@]
            parameters:
                -
                    beta: 0.75
                -
                    beta: 1.25
    training_metric_groups:
        -
            metrics: [accuracy]
    subsets:
        -
            name: women
            query: |
                select distinct entity_id
                from demographics
                where gender = 'woman'
                and demographic_date < '{as_of_date}'::date
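# As a sketch of how these evaluations are stored (check the test_results and
# train_results schemas in your database for the authoritative layout): each
# metric/threshold pair above should land as a metric name plus a parameter
# string, e.g. 'precision@' with a parameter like '5.0_pct' or '10_abs', and
# subset evaluations are keyed by a hash of the subset definition.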
# INDIVIDUAL IMPORTANCES
# How feature importances for individuals should be computed
# There are two variables here:
# methods: refers to *how to compute* individual importances.
# Each entry in this list should represent a different method.
# Available methods are in the catwalk library's
# `catwalk.individual_importance.CALCULATE_STRATEGIES` list.
# Will default to 'uniform', or just the global importances.
#
# n_ranks: The number of top features per individual to compute importances for
# Will default to 5
#
# This entire section can be left blank,
# in which case the defaults will be used.
individual_importance:
    methods: [] # empty list means don't calculate individual importances
    # methods: ['uniform']
    n_ranks: 5