-
Notifications
You must be signed in to change notification settings - Fork 0
/
_base_model.py
executable file
·281 lines (236 loc) · 11.2 KB
/
_base_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
import abc
import pathlib

import joblib
import numpy as np
import optuna
import pandas as pd
class BaseModel(abc.ABC):
    """
    BaseModel parent class for all models that can be used within the framework.

    Every model must be based on :obj:`~ForeTiS.model._base_model.BaseModel` directly or
    BaseModel's child classes, e.g. :obj:`~ForeTiS.model._sklearn_model.SklearnModel` or
    :obj:`~ForeTiS.model._torch_model.TorchModel`

    ** Attributes **

        * Instance attributes *

        - optuna_trial (*optuna.trial.Trial*): trial of optuna for optimization
        - datasets: all datasets that are available; must expose an iterable ``featuresets`` attribute
          whose items have a ``name`` attribute
        - n_outputs (*int*): number of outputs of the prediction model (set to 1 by this constructor)
        - all_hyperparams (*dict*): dictionary with all hyperparameters with related info that can be tuned
          (structure see :obj:`~ForeTiS.model._base_model.BaseModel.define_hyperparams_to_tune`)
        - model: model object as returned by ``define_model()``
        - target_column: the target column for the prediction
        - pca_transform: value suggested by optuna for the ``'pca'`` hyperparameter if pca optimization was
          requested, otherwise ``False``. After construction this instance attribute shadows the
          ``pca_transform()`` method of the class.
        - featureset: the featureset whose name matches the requested / suggested feature set name.
          The attribute is only set if a featureset with that name exists.

    :param optuna_trial: Trial of optuna for optimization
    :param datasets: all datasets that are available (object with a ``featuresets`` attribute)
    :param featureset_name: the name of the recent feature set (ignored if ``optimize_featureset`` is True)
    :param pca_transform: whether conducting pca transformation should be a hyperparameter to optimize or not
    :param target_column: the target column for the prediction
    :param optimize_featureset: whether the feature set should be optimized or not
    """

    # Constructor super class #
    def __init__(self, optuna_trial: "optuna.trial.Trial", datasets, featureset_name: str, pca_transform: bool,
                 target_column: str, optimize_featureset: bool):
        self.optuna_trial = optuna_trial
        self.datasets = datasets
        self.target_column = target_column
        self.n_outputs = 1

        if not hasattr(self, 'all_hyperparams'):
            self.all_hyperparams = self.define_hyperparams_to_tune()
        else:
            # update in case common hyperparams are already defined
            self.all_hyperparams.update(self.define_hyperparams_to_tune())

        if pca_transform:
            # The method must be called BEFORE the attribute assignment below, because the assignment
            # shadows the method on this instance.
            self.all_hyperparams.update(self.pca_transform())
            self.pca_transform = self.suggest_hyperparam_to_optuna('pca')
            # 'pca' is a framework-level choice, so it is removed again from the model hyperparameters
            del self.all_hyperparams['pca']
        else:
            self.pca_transform = False

        if optimize_featureset:
            # Let optuna pick the feature set; this overrides the name passed by the caller
            self.all_hyperparams.update(self.featureset_hyperparam())
            featureset_name = self.suggest_hyperparam_to_optuna('featureset')
            del self.all_hyperparams['featureset']

        # Select the first featureset with a matching name.
        # If none matches, self.featureset is deliberately left unset (unchanged legacy behaviour).
        for featureset in datasets.featuresets:
            if featureset.name == featureset_name:
                self.featureset = featureset
                break

        self.model = self.define_model()

    # Methods required by each child class #
    @abc.abstractmethod
    def define_model(self):
        """
        Method that defines the model that needs to be optimized.
        Hyperparams to tune have to be specified in all_hyperparams and suggested via suggest_hyperparam_to_optuna().
        The hyperparameters have to be included directly in the model definition to be optimized.
        e.g. if you want to optimize the number of layers, do something like

            .. code-block:: python

                n_layers = self.suggest_hyperparam_to_optuna('n_layers') # same name in define_hyperparams_to_tune()
                for layer in range(n_layers):
                    do something

        Then the number of layers will be optimized by optuna.
        """

    @abc.abstractmethod
    def define_hyperparams_to_tune(self) -> dict:
        """
        Method that defines the hyperparameters that should be tuned during optimization and their ranges.
        Required format is a dictionary with:

        .. code-block:: python

            {
                'name_hyperparam_1':
                    {
                    # MANDATORY ITEMS
                    'datatype': 'float' | 'int' | 'categorical',
                    FOR DATATYPE 'categorical':
                        'list_of_values': []  # List of all possible values
                    FOR DATATYPE ['float', 'int']:
                        'lower_bound': value_lower_bound,
                        'upper_bound': value_upper_bound,
                        # OPTIONAL ITEMS (only for ['float', 'int']):
                        'log': True | False  # sample value from log domain or not
                        'step': step_size # step of discretization.
                                            # Caution: cannot be combined with log=True
                                                            # - in case of 'float' in general and
                                                            # - for step!=1 in case of 'int'
                    },
                'name_hyperparam_2':
                    {
                    ...
                    },
                ...
                'name_hyperparam_k':
                    {
                    ...
                    }
            }

        If you want to use a similar hyperparameter multiple times (e.g. Dropout after several layers),
        you only need to specify the hyperparameter once. Individual parameters for every suggestion will be created.
        """

    @abc.abstractmethod
    def retrain(self, retrain: pd.DataFrame):
        """
        Method that runs the retraining of the model

        :param retrain: data for retraining
        """

    @abc.abstractmethod
    def update(self, update: pd.DataFrame, period: int):
        """
        Method that runs the updating of the model

        :param update: data for updating
        :param period: integer period passed along with the update; its exact meaning is defined by the
            implementing subclass
        """

    @abc.abstractmethod
    def predict(self, X_in: pd.DataFrame) -> np.ndarray:
        """
        Method that predicts target values based on the input X_in

        :param X_in: feature matrix as input

        :return: numpy array with the predicted values
        """

    @abc.abstractmethod
    def train_val_loop(self, train: pd.DataFrame, val: pd.DataFrame) -> np.ndarray:
        """
        Method that runs the whole training and validation loop

        :param train: data for the training
        :param val: data for validation

        :return: predictions on validation set
        """

    ### General methods ###
    def suggest_hyperparam_to_optuna(self, hyperparam_name: str):
        """
        Suggest a hyperparameter of hyperparam_dict to the optuna trial to optimize it.

        If you want to add a parameter to your model / in your pipeline to be optimized, you need to call this method

        :param hyperparam_name: name of the hyperparameter to be tuned
            (see :obj:`~ForeTiS.model._base_model.BaseModel.define_hyperparams_to_tune`)

        :return: suggested value

        :raises Exception: if the hyperparameter is unknown, its datatype is invalid or mandatory
            items of its specification are missing
        """
        # Get specification of the hyperparameter
        if hyperparam_name not in self.all_hyperparams:
            raise Exception(f'{hyperparam_name} not found in all_hyperparams dictionary.')
        spec = self.all_hyperparams[hyperparam_name]

        # Check if the hyperparameter already exists in the trial and needs a suffix
        # (e.g. same dropout specification for multiple layers that should be optimized individually)
        optuna_param_name = hyperparam_name
        counter = 1
        while optuna_param_name in self.optuna_trial.params:
            optuna_param_name = f'{hyperparam_name}_{counter}'
            counter += 1

        # Read dict with specification for the hyperparameter and suggest it to the trial
        datatype = spec['datatype']
        if datatype == 'categorical':
            if 'list_of_values' not in spec:
                raise Exception(
                    f'"list of values" for {hyperparam_name} not in hyperparams_dict. '
                    'Check define_hyperparams_to_tune() of the model.'
                )
            return self.optuna_trial.suggest_categorical(name=optuna_param_name, choices=spec['list_of_values'])

        if datatype in ('float', 'int'):
            if 'lower_bound' not in spec or 'upper_bound' not in spec:
                raise Exception(
                    f'"lower_bound" or "upper_bound" for {hyperparam_name} not in all_hyperparams. '
                    'Check define_hyperparams_to_tune() of the model.'
                )
            # Default discretization: none for floats, unit steps for ints
            default_step = None if datatype == 'float' else 1
            step = spec.get('step', default_step)
            log = spec.get('log', False)
            suggest = self.optuna_trial.suggest_int if datatype == 'int' else self.optuna_trial.suggest_float
            return suggest(
                name=optuna_param_name, low=spec['lower_bound'], high=spec['upper_bound'], step=step, log=log
            )

        raise Exception(
            f'{datatype} is not a valid parameter. Check define_hyperparams_to_tune() of the model.'
        )

    def suggest_all_hyperparams_to_optuna(self) -> dict:
        """
        Some models accept a dictionary with the model parameters.
        This method suggests all hyperparameters in all_hyperparams and gives back a dictionary containing them.

        :return: dictionary with suggested hyperparameters (the ``params`` of the optuna trial)
        """
        for param_name in self.all_hyperparams:
            self.suggest_hyperparam_to_optuna(param_name)
        return self.optuna_trial.params

    def featureset_hyperparam(self):
        """
        Method that defines the feature set hyperparameter that should be tuned during optimization and its ranges.

        :return: hyperparameter specification with the names of all featuresets in ``self.datasets``
            as categorical values
        """
        featuresets_names = [featureset.name for featureset in self.datasets.featuresets]
        return {
            'featureset': {
                'datatype': 'categorical',
                'list_of_values': featuresets_names
            }
        }

    def pca_transform(self):
        """
        Method that defines the pca transform hyperparameter that should be tuned during optimization and its ranges.

        :return: hyperparameter specification with the categorical values ``True`` and ``False``
        """
        return {
            'pca': {
                'datatype': 'categorical',
                'list_of_values': [True, False]
            }
        }

    def save_model(self, path: "str | pathlib.Path", filename: str):
        """
        Persist the whole model object on a hard drive
        (can be loaded with :obj:`~ForeTiS.model._model_functions.load_model`)

        :param path: path where the model will be saved (string or ``pathlib.Path``)
        :param filename: filename of the model
        """
        # Coerce to Path so that plain strings work as well; str has no joinpath()
        target = pathlib.Path(path).joinpath(filename)
        joblib.dump(self, target, compress=3)