fix: nn_trainer, refactor model saving to ignore initial validation (#972)

See #972 for detailed discussion
Sergey Mironov authored and yoptar committed Oct 3, 2019
1 parent a951492 commit e5b6184
Showing 1 changed file with 57 additions and 12 deletions.
69 changes: 57 additions & 12 deletions deeppavlov/core/trainers/nn_trainer.py
@@ -73,6 +73,19 @@ class NNTrainer(FitTrainer):
max_test_batches: maximum batches count for pipeline testing and evaluation, overrides ``log_on_k_batches``,
ignored if negative (default is ``-1``)
**kwargs: additional parameters whose names will be logged but otherwise ignored
The trainer saves the model whenever it sees progress in the scores. The full rules are as follows:
- At the validation savepoint:
  * 0th validation (optional): don't save the model; only establish a baseline.
  * 1st validation:
    + if a baseline exists, save the model only on an improvement over it;
    + if there is no baseline, save the model.
  * 2nd and later validations: save the model only on an improvement.
- At the at-train-exit savepoint:
  * save the model only if training exited before the 1st validation (to capture early training results); don't save otherwise.
"""
def __init__(self, chainer_config: dict, *, batch_size: int = 1,
epochs: int = -1,
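The docstring rules above boil down to a single decision. Below is a minimal standalone sketch of that policy; the names `validation_number`, `score_best`, and `improved` mirror the diff, but the function itself is illustrative, not part of the commit:

    from typing import Callable, Optional

    def should_save(validation_number: int,
                    score: float,
                    score_best: Optional[float],
                    improved: Callable[[float, Optional[float]], bool]) -> bool:
        # 0th (initial) validation: only establish a baseline, never save
        if validation_number == 0:
            return False
        # 1st validation with no baseline: always save
        if validation_number == 1 and score_best is None:
            return True
        # otherwise: save only on an improvement over the baseline
        return improved(score, score_best)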
@@ -98,16 +111,21 @@ def __init__(self, chainer_config: dict, *, batch_size: int = 1,
self.train_metrics = parse_metrics(train_metrics, self._chainer.in_y, self._chainer.out_params)

metric_optimization = metric_optimization.strip().lower()
self.score_best = None

def _improved(op):
return lambda score, baseline: False if baseline is None or score is None \
else op(score, baseline)

if metric_optimization == 'maximize':
self.best = float('-inf')
self.improved = lambda score: score > self.best
self.improved = _improved(lambda a, b: a > b)
elif metric_optimization == 'minimize':
self.best = float('inf')
self.improved = lambda score: score < self.best
self.improved = _improved(lambda a, b: a < b)
else:
raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

self.validate_first = validate_first
self.validation_number = 0 if validate_first else 1
self.validation_patience = validation_patience
self.val_every_n_epochs = val_every_n_epochs
self.val_every_n_batches = val_every_n_batches
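The `_improved` factory above replaces the old one-argument predicates with two-argument comparators that treat a missing score or baseline as "no improvement", so comparisons against `None` can no longer blow up. A quick standalone usage sketch:

    def _improved(op):
        # None on either side means there is nothing to compare yet
        return lambda score, baseline: False if baseline is None or score is None \
            else op(score, baseline)

    improved_max = _improved(lambda a, b: a > b)  # metric_optimization == 'maximize'
    improved_min = _improved(lambda a, b: a < b)  # metric_optimization == 'minimize'

    assert improved_max(0.9, 0.8)
    assert not improved_max(0.8, 0.9)
    assert not improved_max(0.9, None)  # no baseline yet
    assert improved_min(0.10, 0.15)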
@@ -124,7 +142,7 @@ def __init__(self, chainer_config: dict, *, batch_size: int = 1,
self.patience = 0
self.last_result = {}
self.losses = []
self.start_time = None
self.start_time = None  # type: Optional[float]

if self.tensorboard_log_dir is not None:
self.tb_train_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'train_log'))
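For context, `self._tf.summary.FileWriter` is the TensorFlow 1.x summary API. A minimal sketch of how a validation scalar would reach the `valid_log` writer; the tag and step values here are invented for illustration:

    import tensorflow as tf  # TensorFlow 1.x

    writer = tf.summary.FileWriter('logs/valid_log')
    summary = tf.Summary(value=[
        tf.Summary.Value(tag='valid/accuracy', simple_value=0.87),
    ])
    writer.add_summary(summary, global_step=100)
    writer.flush()  # mirrors self.tb_valid_writer.flush() in _validate()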
@@ -135,7 +153,12 @@ def save(self) -> None:
raise RuntimeError('Cannot save already finalized chainer')

self._chainer.save()
self._saved = True

def _is_initial_validation(self):
return self.validation_number == 0

def _is_first_validation(self):
return self.validation_number == 1

def _validate(self, iterator: DataLearningIterator,
tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
@@ -159,15 +182,32 @@ def _validate(self, iterator: DataLearningIterator,
self.tb_valid_writer.flush()

m_name, score = metrics[0]
if self.improved(score):

# Update the patience
if self.score_best is None:
self.patience = 0
log.info('New best {} of {}'.format(m_name, score))
self.best = score
else:
if self.improved(score, self.score_best):
self.patience = 0
else:
self.patience += 1

# Run the validation model-saving logic
if self._is_initial_validation():
log.info('Initial best {} of {}'.format(m_name, score))
self.score_best = score
elif self._is_first_validation() and self.score_best is None:
log.info('First best {} of {}'.format(m_name, score))
self.score_best = score
log.info('Saving model')
self.save()
elif self.improved(score, self.score_best):
log.info('Improved best {} of {}'.format(m_name, score))
self.score_best = score
log.info('Saving model')
self.save()
else:
self.patience += 1
log.info('Did not improve on the {} of {}'.format(m_name, self.best))
log.info('Did not improve on the {} of {}'.format(m_name, self.score_best))

report['impatience'] = self.patience
if self.validation_patience > 0:
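The body under `if self.validation_patience > 0:` is folded out of this diff, but the fields above suggest the usual early-stopping rule: give up after `validation_patience` consecutive validations without improvement. A standalone sketch of that rule, assuming those conventional semantics (non-positive values disable it):

    def out_of_patience(patience: int, validation_patience: int) -> bool:
        # patience counts consecutive validations without improvement;
        # validation_patience <= 0 disables early stopping entirely
        return 0 < validation_patience <= patience

    assert not out_of_patience(2, 5)   # still waiting
    assert out_of_patience(5, 5)       # ran out of patience -> stop training
    assert not out_of_patience(99, 0)  # early stopping disabled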
@@ -176,6 +216,7 @@ def _validate(self, iterator: DataLearningIterator,
self._send_event(event_name='after_validation', data=report)
report = {'valid': report}
print(json.dumps(report, ensure_ascii=False))
self.validation_number += 1

def _log(self, iterator: DataLearningIterator,
tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
@@ -297,5 +338,9 @@ def train(self, iterator: DataLearningIterator) -> None:
else:
log.warn(f'Using {self.__class__.__name__} for a pipeline without batched training')

if not self._saved:
# Run the at-train-exit model-saving logic
if self.validation_number < 1:
log.info('Save model to capture early training results')
self.save()
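Putting the two savepoints together, here is a tiny standalone simulation of the new lifecycle under `metric_optimization: maximize` and `validate_first: true`; the scores are invented, and only the branching mirrors the diff:

    validate_first = True
    validation_number = 0 if validate_first else 1  # as in __init__ above
    score_best = None

    for score in [0.50, 0.55, 0.53]:  # made-up validation scores
        if validation_number == 0:                    # initial validation: baseline only
            score_best = score
        elif score_best is None or score > score_best:
            score_best = score                        # first save or an improvement
            print(f'saving model at validation {validation_number}')
        validation_number += 1

    if validation_number < 1:  # at-train-exit savepoint
        print('saving model to capture early training results')

This prints a single save at validation 1 (the improvement to 0.55) and nothing at exit, because three validations already ran.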

