fix: nn_trainer, refactor model saving to ignore initial validation (#972)

See #972 for detailed discussion
Sergey Mironov authored and yoptar committed Oct 3, 2019
1 parent a951492 commit e5b6184
Showing 1 changed file with 57 additions and 12 deletions.
69 changes: 57 additions & 12 deletions deeppavlov/core/trainers/nn_trainer.py
@@ -73,6 +73,19 @@ class NNTrainer(FitTrainer):
max_test_batches: maximum batches count for pipeline testing and evaluation, overrides ``log_on_k_batches``,
ignored if negative (default is ``-1``)
**kwargs: additional parameters whose names will be logged but otherwise ignored
The trainer saves the model whenever it sees progress in the scores. The full rules are as follows:
- At the validation savepoint:
  * 0th validation (optional): don't save the model; only establish a baseline.
  * 1st validation:
    + if a baseline exists, save the model only on an improvement over it;
    + if there is no baseline, save the model.
  * 2nd and later validations: save the model only on an improvement.
- At the at-train-exit savepoint:
  * save the model only if training exited before the 1st validation (to capture early training results); don't save otherwise.
"""
def __init__(self, chainer_config: dict, *, batch_size: int = 1,
epochs: int = -1,
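The docstring rules above boil down to a single decision. Below is a minimal standalone sketch of that policy; the names `validation_number`, `score_best`, and `improved` mirror the diff, but the function itself is illustrative, not part of the commit:

    from typing import Callable, Optional

    def should_save(validation_number: int,
                    score: float,
                    score_best: Optional[float],
                    improved: Callable[[float, Optional[float]], bool]) -> bool:
        # 0th (initial) validation: only establish a baseline, never save
        if validation_number == 0:
            return False
        # 1st validation with no baseline: always save
        if validation_number == 1 and score_best is None:
            return True
        # otherwise: save only on an improvement over the baseline
        return improved(score, score_best)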
@@ -98,16 +111,21 @@ def __init__(self, chainer_config: dict, *, batch_size: int = 1,
self.train_metrics = parse_metrics(train_metrics, self._chainer.in_y, self._chainer.out_params)

metric_optimization = metric_optimization.strip().lower()
self.score_best = None

def _improved(op):
return lambda score, baseline: False if baseline is None or score is None \
else op(score, baseline)

if metric_optimization == 'maximize':
self.best = float('-inf')
self.improved = lambda score: score > self.best
self.improved = _improved(lambda a, b: a > b)
elif metric_optimization == 'minimize':
self.best = float('inf')
self.improved = lambda score: score < self.best
self.improved = _improved(lambda a, b: a < b)
else:
raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

self.validate_first = validate_first
self.validation_number = 0 if validate_first else 1
self.validation_patience = validation_patience
self.val_every_n_epochs = val_every_n_epochs
self.val_every_n_batches = val_every_n_batches
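The `_improved` factory above replaces the old one-argument predicates with two-argument comparators that treat a missing score or baseline as "no improvement", so comparisons against `None` can no longer blow up. A quick standalone usage sketch:

    def _improved(op):
        # None on either side means there is nothing to compare yet
        return lambda score, baseline: False if baseline is None or score is None \
            else op(score, baseline)

    improved_max = _improved(lambda a, b: a > b)  # metric_optimization == 'maximize'
    improved_min = _improved(lambda a, b: a < b)  # metric_optimization == 'minimize'

    assert improved_max(0.9, 0.8)
    assert not improved_max(0.8, 0.9)
    assert not improved_max(0.9, None)  # no baseline yet
    assert improved_min(0.10, 0.15)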
@@ -124,7 +142,7 @@ def __init__(self, chainer_config: dict, *, batch_size: int = 1,
self.patience = 0
self.last_result = {}
self.losses = []
self.start_time = None
self.start_time = None  # type: Optional[float]

if self.tensorboard_log_dir is not None:
self.tb_train_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'train_log'))
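For context, `self._tf.summary.FileWriter` is the TensorFlow 1.x summary API. A minimal sketch of how a validation scalar would reach the `valid_log` writer; the tag and step values here are invented for illustration:

    import tensorflow as tf  # TensorFlow 1.x

    writer = tf.summary.FileWriter('logs/valid_log')
    summary = tf.Summary(value=[
        tf.Summary.Value(tag='valid/accuracy', simple_value=0.87),
    ])
    writer.add_summary(summary, global_step=100)
    writer.flush()  # mirrors self.tb_valid_writer.flush() in _validate()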
@@ -135,7 +153,12 @@ def save(self) -> None:
raise RuntimeError('Cannot save already finalized chainer')

self._chainer.save()
self._saved = True

def _is_initial_validation(self):
return self.validation_number == 0

def _is_first_validation(self):
return self.validation_number == 1

def _validate(self, iterator: DataLearningIterator,
tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
@@ -159,15 +182,32 @@ def _validate(self, iterator: DataLearningIterator,
self.tb_valid_writer.flush()

m_name, score = metrics[0]
if self.improved(score):

# Update the patience
if self.score_best is None:
self.patience = 0
log.info('New best {} of {}'.format(m_name, score))
self.best = score
else:
if self.improved(score, self.score_best):
self.patience = 0
else:
self.patience += 1

# Run the validation model-saving logic
if self._is_initial_validation():
log.info('Initial best {} of {}'.format(m_name, score))
self.score_best = score
elif self._is_first_validation() and self.score_best is None:
log.info('First best {} of {}'.format(m_name, score))
self.score_best = score
log.info('Saving model')
self.save()
elif self.improved(score, self.score_best):
log.info('Improved best {} of {}'.format(m_name, score))
self.score_best = score
log.info('Saving model')
self.save()
else:
self.patience += 1
log.info('Did not improve on the {} of {}'.format(m_name, self.best))
log.info('Did not improve on the {} of {}'.format(m_name, self.score_best))

report['impatience'] = self.patience
if self.validation_patience > 0:
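The body under `if self.validation_patience > 0:` is folded out of this diff, but the fields above suggest the usual early-stopping rule: give up after `validation_patience` consecutive validations without improvement. A standalone sketch of that rule, assuming those conventional semantics (non-positive values disable it):

    def out_of_patience(patience: int, validation_patience: int) -> bool:
        # patience counts consecutive validations without improvement;
        # validation_patience <= 0 disables early stopping entirely
        return 0 < validation_patience <= patience

    assert not out_of_patience(2, 5)   # still waiting
    assert out_of_patience(5, 5)       # ran out of patience -> stop training
    assert not out_of_patience(99, 0)  # early stopping disabled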
@@ -176,6 +216,7 @@ def _validate(self, iterator: DataLearningIterator,
self._send_event(event_name='after_validation', data=report)
report = {'valid': report}
print(json.dumps(report, ensure_ascii=False))
self.validation_number += 1

def _log(self, iterator: DataLearningIterator,
tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
@@ -297,5 +338,9 @@ def train(self, iterator: DataLearningIterator) -> None:
else:
log.warn(f'Using {self.__class__.__name__} for a pipeline without batched training')

if not self._saved:
# Run the at-train-exit model-saving logic
if self.validation_number < 1:
log.info('Save model to capture early training results')
self.save()
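Putting the two savepoints together, here is a tiny standalone simulation of the new lifecycle under `metric_optimization: maximize` and `validate_first: true`; the scores are invented, and only the branching mirrors the diff:

    validate_first = True
    validation_number = 0 if validate_first else 1  # as in __init__ above
    score_best = None

    for score in [0.50, 0.55, 0.53]:  # made-up validation scores
        if validation_number == 0:                    # initial validation: baseline only
            score_best = score
        elif score_best is None or score > score_best:
            score_best = score                        # first save or an improvement
            print(f'saving model at validation {validation_number}')
        validation_number += 1

    if validation_number < 1:  # at-train-exit savepoint
        print('saving model to capture early training results')

This prints a single save at validation 1 (the improvement to 0.55) and nothing at exit, because three validations already ran.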

